magnusdtd commited on
Commit
10d1937
·
verified ·
1 Parent(s): 4ce7c22

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitignore +215 -0
  2. README.md +19 -0
  3. main.py +198 -0
  4. requirements.txt +5 -0
  5. transnetv2-pytorch-weights.pth +3 -0
  6. transnetv2_pytorch.py +318 -0
.gitignore ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # Kaggle
210
+ .kaggle/
211
+ data/
212
+ volumes/
213
+ json/
214
+ data/
215
+ kaggle.json
README.md CHANGED
@@ -1,3 +1,22 @@
1
  ---
2
  license: mit
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
  ---
4
+
5
+ # TransNetV2 (PyTorch Version)
6
+
7
+ This repository provides a PyTorch version of [TransNet V2](https://github.com/soCzech/TransNetV2), a state-of-the-art neural network for shot boundary detection in videos.
8
+
9
+ ## Installation
10
+
11
+ Clone the repository and install the required dependencies.
12
+
13
+ ```sh
14
+ sudo apt-get install ffmpeg
15
+ pip install -r requirements.txt
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ```sh
21
+ python -m main --files="path/to/your/file/or/folder" --weights="path/to/the/model/weights" --visualize
22
+ ```
main.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from TransNetV2.transnetv2_pytorch import TransNetV2
2
+ from typing import Optional
3
+ import torch
4
+ import os
5
+ import numpy as np
6
+ from PIL import Image, ImageDraw
7
+ import argparse
8
+ from tqdm import tqdm
9
+
10
+ try:
11
+ import ffmpeg
12
+ except ModuleNotFoundError:
13
+ raise ModuleNotFoundError("For `predict_video` function `ffmpeg` needs to be installed in order to extract "
14
+ "individual frames from video file. Install `ffmpeg` command line tool and then "
15
+ "install python wrapper by `pip install ffmpeg-python`.")
16
+
17
+
18
class TransNetV2Torch:
    """Convenience wrapper around the TransNetV2 model: loads weights, runs
    sliding-window inference on whole videos, and post-processes predictions
    into scene boundaries / visualizations."""

    def __init__(self, model_path: Optional[str] = None):
        """Load the TransNetV2 model and move it to the best available device.

        Args:
            model_path: path to the ``.pth`` weights file; when omitted, the
                file ``transnetv2-pytorch-weights.pth`` next to this module
                is used.

        Raises:
            FileNotFoundError: if the weights file does not exist.
            IOError: if the weights cannot be loaded into the model.
        """
        weights_path = model_path or os.path.join(os.path.dirname(__file__), "transnetv2-pytorch-weights.pth")
        if not os.path.isfile(weights_path):
            raise FileNotFoundError(f"[TransNetV2] ERROR: weights file not found at {weights_path}.")
        else:
            print(f"[TransNetV2] Using weights from {weights_path}.")

        # (height, width, channels) expected by the network.
        self._input_size = (27, 48, 3)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = TransNetV2()
        try:
            # BUGFIX: map_location lets GPU-saved checkpoints load on CPU-only hosts;
            # the original torch.load(weights_path) fails there with a CUDA error.
            self.model.load_state_dict(torch.load(weights_path, map_location=self.device))
        except Exception as exc:
            raise IOError(f"[TransNetV2] Could not load weights from {weights_path}.") from exc
        self.model.eval()
        self.model.to(self.device)

    def predict_raw(self, frames: np.ndarray):
        """Run the network on a batch of frame windows.

        Args:
            frames: uint8 array of shape [batch, frames, 27, 48, 3].

        Returns:
            Tuple of sigmoid-activated numpy arrays
            (single_frame_pred, all_frames_pred), each [batch, frames, 1].
        """
        assert len(frames.shape) == 5 and frames.shape[2:] == self._input_size, \
            "[TransNetV2] Input shape must be [batch, frames, height, width, 3]."

        frames_tensor = torch.from_numpy(frames)
        with torch.no_grad():
            single_frame_pred, all_frames_pred = self.model(frames_tensor.to(self.device))

            single_frame_pred = torch.sigmoid(single_frame_pred).cpu().numpy()
            # The model returns the many-hot head in a dict; see TransNetV2.forward.
            all_frames_pred = torch.sigmoid(all_frames_pred["many_hot"]).cpu().numpy()

        return single_frame_pred, all_frames_pred

    def predict_frames(self, frames: np.ndarray):
        """Predict per-frame transition scores for a full video.

        Args:
            frames: uint8 array of shape [frames, 27, 48, 3].

        Returns:
            Tuple (single_frame_pred, all_frames_pred), each of length
            ``len(frames)``.
        """
        assert len(frames.shape) == 4 and frames.shape[1:] == self._input_size, \
            "[TransNetV2] Input shape must be [frames, height, width, 3]."

        total = len(frames)

        def input_iterator():
            # return windows of size 100 where the first/last 25 frames are from the previous/next batch
            # the first and last window must be padded by copies of the first and last frame of the video
            no_padded_frames_start = 25
            no_padded_frames_end = 25 + 50 - (total % 50 if total % 50 != 0 else 50)  # 25 - 74

            start_frame = np.expand_dims(frames[0], 0)
            end_frame = np.expand_dims(frames[-1], 0)
            padded_inputs = np.concatenate(
                [start_frame] * no_padded_frames_start + [frames] + [end_frame] * no_padded_frames_end, 0
            )

            ptr = 0
            while ptr + 100 <= len(padded_inputs):
                out = padded_inputs[ptr:ptr + 100]
                ptr += 50
                yield out[np.newaxis]

        predictions = []

        for inp in input_iterator():
            single_frame_pred, all_frames_pred = self.predict_raw(inp)
            # Keep only the central 50 frames of each 100-frame window; the
            # 25-frame margins are context overlapping neighbouring windows.
            predictions.append((single_frame_pred[0, 25:75, 0],
                                all_frames_pred[0, 25:75, 0]))

            print("\r[TransNetV2] Processing video frames {}/{}".format(
                min(len(predictions) * 50, total), total
            ), end="")
        print("")

        single_frame_pred = np.concatenate([single_ for single_, _ in predictions])
        all_frames_pred = np.concatenate([all_ for _, all_ in predictions])

        # Trim the end padding introduced by input_iterator().
        return single_frame_pred[:total], all_frames_pred[:total]

    def predict_video(self, video_fn: str):
        """Extract 48x27 RGB frames from a video with ffmpeg and predict on them.

        Returns:
            Tuple (video_frames, single_frame_pred, all_frames_pred).
        """
        print("[TransNetV2] Extracting frames from {}".format(video_fn))
        video_stream, _ = ffmpeg.input(video_fn).output(
            "pipe:", format="rawvideo", pix_fmt="rgb24", s="48x27"
        ).run(capture_stdout=True, capture_stderr=True)

        video = np.frombuffer(video_stream, np.uint8).reshape([-1, 27, 48, 3])
        return (video, *self.predict_frames(video))

    @staticmethod
    def predictions_to_scenes(predictions: np.ndarray, threshold: float = 0.5):
        """Convert per-frame transition scores into [start, end] scene spans.

        Args:
            predictions: 1-D array of per-frame scores.
            threshold: scores above this mark a transition frame.

        Returns:
            int32 array of shape [num_scenes, 2] with inclusive frame indices.
        """
        predictions = (predictions > threshold).astype(np.uint8)

        scenes = []
        t_prev, start = 0, 0
        for i, t in enumerate(predictions):
            if t_prev == 1 and t == 0:
                start = i
            if t_prev == 0 and t == 1 and i != 0:
                scenes.append([start, i])
            t_prev = t
        if t == 0:
            scenes.append([start, i])
        if len(scenes) == 0:  # just fix if all predictions are 1
            return np.array([[0, len(predictions) - 1]], dtype=np.int32)

        return np.array(scenes, dtype=np.int32)

    @staticmethod
    def visualize_predictions(frames: np.ndarray, predictions):
        """Render the frames in a 25-wide grid with prediction bars drawn in
        the extra pixel columns to the right of each frame.

        Args:
            frames: uint8 array [frames, height, width, 3].
            predictions: one array or a tuple of arrays of per-frame scores.

        Returns:
            PIL.Image with the visualization.
        """
        if isinstance(predictions, np.ndarray):
            predictions = [predictions]

        ih, iw, ic = frames.shape[1:]
        width = 25  # frames per grid row

        # pad frames so that length of the video is divisible by width
        # pad frames also by len(predictions) pixels in width in order to show predictions
        pad_with = width - len(frames) % width if len(frames) % width != 0 else 0
        frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)), (0, 0)])

        predictions = [np.pad(x, (0, pad_with)) for x in predictions]
        height = len(frames) // width

        img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic])
        img = np.concatenate(np.split(
            np.concatenate(np.split(img, height), axis=2)[0], width
        ), axis=2)[0, :-1]

        img = Image.fromarray(img)
        draw = ImageDraw.Draw(img)

        for i, pred in enumerate(zip(*predictions)):
            x, y = i % width, i // width
            x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1

            # we can visualize multiple predictions per single frame
            for j, p in enumerate(pred):
                color = [0, 0, 0]
                color[(j + 1) % 3] = 255

                value = round(p * (ih - 1))
                if value != 0:
                    draw.line((x + j, y, x + j, y - value), fill=tuple(color), width=1)
        return img
158
+
159
def parse_args(argv=None):
    """Parse command-line arguments for the shot-boundary detection CLI.

    Args:
        argv: optional list of argument strings; defaults to ``sys.argv[1:]``
            (backward compatible with the previous zero-argument call).

    Returns:
        argparse.Namespace with ``files``, ``weights`` and ``visualize``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--files", type=str, help="path to video files to process")
    parser.add_argument("--weights", type=str, default=None,
                        help="path to TransNet V2 weights, tries to infer the location if not specified")
    parser.add_argument('--visualize', action="store_true",
                        help="save a png file with prediction visualization for each extracted video")
    return parser.parse_args(argv)
169
+
170
def main(args):
    """Run shot-boundary detection on a single video file or on every ``.mp4``
    inside a folder, writing predictions, scene spans and an optional
    visualization next to each input file."""
    model = TransNetV2Torch(args.weights)

    if os.path.isdir(args.files):
        targets = [
            os.path.join(args.files, name)
            for name in os.listdir(args.files)
            if name.lower().endswith(".mp4")
        ]
    else:
        targets = [args.files]

    for path in targets:
        video_frames, single_frame_predictions, all_frames_predictions = \
            model.predict_video(path)

        # One row per frame: [single-frame score, many-hot score].
        predictions = np.stack([single_frame_predictions, all_frames_predictions], 1)
        np.savetxt(path + ".predictions.txt", predictions, fmt="%.6f")

        scenes = model.predictions_to_scenes(single_frame_predictions)
        np.savetxt(path + ".scenes.txt", scenes, fmt="%d")

        if args.visualize:
            image = model.visualize_predictions(
                video_frames, predictions=(single_frame_predictions, all_frames_predictions))
            image.save(path + ".vis.png")

if __name__ == "__main__":
    main(parse_args())
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy==2.3.2
2
+ pillow==11.3.0
3
+ tqdm==4.67.1
4
+ torch==2.8.0
5
+ ffmpeg-python==0.2.0
transnetv2-pytorch-weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eed5336d5d6a013c67f5863505a26e7e835053e64a9ce413d6b089ccba07bb53
3
+ size 30509621
transnetv2_pytorch.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ import random
6
+
7
+
8
class TransNetV2(nn.Module):
    """TransNet V2 shot-boundary detection network (PyTorch port).

    Processes uint8 video clips of shape [B, T, 27, 48, 3] and returns a
    per-frame transition logit (plus an optional "many hot" head)."""

    def __init__(self,
                 F=16, L=3, S=2, D=1024,
                 use_many_hot_targets=True,
                 use_frame_similarity=True,
                 use_color_histograms=True,
                 use_mean_pooling=False,
                 dropout_rate=0.5,
                 use_convex_comb_reg=False,  # not supported
                 use_resnet_features=False,  # not supported
                 use_resnet_like_top=False,  # not supported
                 frame_similarity_on_last_layer=False):  # not supported
        super(TransNetV2, self).__init__()

        if use_resnet_features or use_resnet_like_top or use_convex_comb_reg or frame_similarity_on_last_layer:
            # BUGFIX: `NotImplemented` is a builtin constant (not an exception
            # class); raising it produced a confusing TypeError instead.
            raise NotImplementedError("Some options not implemented in Pytorch version of Transnet!")

        # Stack of L dilated-CNN blocks; each block quadruples the channel
        # count (4 dilation branches) and halves the spatial resolution.
        self.SDDCNN = nn.ModuleList(
            [StackedDDCNNV2(in_filters=3, n_blocks=S, filters=F, stochastic_depth_drop_prob=0.)] +
            [StackedDDCNNV2(in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2 ** i) for i in range(1, L)]
        )

        self.frame_sim_layer = FrameSimilarity(
            sum([(F * 2 ** i) * 4 for i in range(L)]), lookup_window=101, output_dim=128, similarity_dim=128, use_bias=True
        ) if use_frame_similarity else None
        self.color_hist_layer = ColorHistograms(
            lookup_window=101, output_dim=128
        ) if use_color_histograms else None

        self.dropout = nn.Dropout(dropout_rate) if dropout_rate is not None else None

        output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6  # 3x6 for spatial dimensions
        if use_frame_similarity: output_dim += 128
        if use_color_histograms: output_dim += 128

        self.fc1 = nn.Linear(output_dim, D)
        self.cls_layer1 = nn.Linear(D, 1)
        self.cls_layer2 = nn.Linear(D, 1) if use_many_hot_targets else None

        self.use_mean_pooling = use_mean_pooling
        self.eval()

    def forward(self, inputs):
        """Compute per-frame transition logits.

        Args:
            inputs: uint8 tensor of shape [B, T, 27, 48, 3].

        Returns:
            ``one_hot`` logits [B, T, 1], or a tuple
            ``(one_hot, {"many_hot": logits})`` when the second head is enabled.
        """
        assert isinstance(inputs, torch.Tensor) and list(inputs.shape[2:]) == [27, 48, 3] and inputs.dtype == torch.uint8, \
            "incorrect input type and/or shape"
        # uint8 of shape [B, T, H, W, 3] to float of shape [B, 3, T, H, W]
        x = inputs.permute([0, 4, 1, 2, 3]).float()
        x = x.div_(255.)

        # Keep each block's output for the frame-similarity feature below.
        block_features = []
        for block in self.SDDCNN:
            x = block(x)
            block_features.append(x)

        if self.use_mean_pooling:
            x = torch.mean(x, dim=[3, 4])
            x = x.permute(0, 2, 1)
        else:
            x = x.permute(0, 2, 3, 4, 1)
            x = x.reshape(x.shape[0], x.shape[1], -1)

        if self.frame_sim_layer is not None:
            x = torch.cat([self.frame_sim_layer(block_features), x], 2)

        if self.color_hist_layer is not None:
            x = torch.cat([self.color_hist_layer(inputs), x], 2)

        x = self.fc1(x)
        x = functional.relu(x)

        if self.dropout is not None:
            x = self.dropout(x)

        one_hot = self.cls_layer1(x)

        if self.cls_layer2 is not None:
            return one_hot, {"many_hot": self.cls_layer2(x)}

        return one_hot
88
+
89
+
90
class StackedDDCNNV2(nn.Module):
    """A stack of DilatedDCNNV2 blocks with a residual connection over the
    stack followed by spatial 2x2 pooling."""

    def __init__(self,
                 in_filters,
                 n_blocks,
                 filters,
                 shortcut=True,
                 use_octave_conv=False,  # not supported
                 pool_type="avg",
                 stochastic_depth_drop_prob=0.0):
        super(StackedDDCNNV2, self).__init__()

        if use_octave_conv:
            # BUGFIX: `NotImplemented` is a constant, not an exception class;
            # raising it produced a TypeError.
            raise NotImplementedError("Octave convolution not implemented in Pytorch version of Transnet!")

        assert pool_type == "max" or pool_type == "avg"
        if use_octave_conv and pool_type == "max":
            print("WARN: Octave convolution was designed with average pooling, not max pooling.")

        self.shortcut = shortcut
        # Last block in the stack has no activation: ReLU is applied after
        # the residual addition in forward().
        self.DDCNN = nn.ModuleList([
            DilatedDCNNV2(in_filters if i == 1 else filters * 4, filters, octave_conv=use_octave_conv,
                          activation=functional.relu if i != n_blocks else None) for i in range(1, n_blocks + 1)
        ])
        self.pool = nn.MaxPool3d(kernel_size=(1, 2, 2)) if pool_type == "max" else nn.AvgPool3d(kernel_size=(1, 2, 2))
        self.stochastic_depth_drop_prob = stochastic_depth_drop_prob

    def forward(self, inputs):
        x = inputs
        shortcut = None

        for block in self.DDCNN:
            x = block(x)
            # The residual branch is the output of the first block.
            if shortcut is None:
                shortcut = x

        x = functional.relu(x)

        # BUGFIX: `self.shortcut` is a boolean flag; the original
        # `if self.shortcut is not None:` was always true, so `shortcut=False`
        # was silently ignored.
        if self.shortcut:
            if self.stochastic_depth_drop_prob != 0.:
                if self.training:
                    # Stochastic depth: randomly drop the transform branch.
                    if random.random() < self.stochastic_depth_drop_prob:
                        x = shortcut
                    else:
                        x = x + shortcut
                else:
                    x = (1 - self.stochastic_depth_drop_prob) * x + shortcut
            else:
                x += shortcut

        x = self.pool(x)
        return x
142
+
143
+
144
class DilatedDCNNV2(nn.Module):
    """Four parallel 3D convolutions with temporal dilations 1/2/4/8 whose
    outputs are concatenated on the channel axis, followed by optional
    batch norm and activation."""

    def __init__(self,
                 in_filters,
                 filters,
                 batch_norm=True,
                 activation=None,
                 octave_conv=False):  # not supported
        super(DilatedDCNNV2, self).__init__()

        if octave_conv:
            # BUGFIX: `NotImplemented` is a constant, not an exception class;
            # raising it produced a TypeError.
            raise NotImplementedError("Octave convolution not implemented in Pytorch version of Transnet!")

        assert not (octave_conv and batch_norm)

        # Bias is redundant when batch norm follows the convolution.
        self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=not batch_norm)
        self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=not batch_norm)
        self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=not batch_norm)
        self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=not batch_norm)

        self.bn = nn.BatchNorm3d(filters * 4, eps=1e-3) if batch_norm else None
        self.activation = activation

    def forward(self, inputs):
        conv1 = self.Conv3D_1(inputs)
        conv2 = self.Conv3D_2(inputs)
        conv3 = self.Conv3D_4(inputs)
        conv4 = self.Conv3D_8(inputs)

        x = torch.cat([conv1, conv2, conv3, conv4], dim=1)

        if self.bn is not None:
            x = self.bn(x)

        if self.activation is not None:
            x = self.activation(x)

        return x
182
+
183
+
184
class Conv3DConfigurable(nn.Module):
    """3D convolution with configurable temporal dilation, optionally
    factored into a spatial 1x3x3 conv followed by a temporal 3x1x1 conv
    ((2+1)D decomposition)."""

    def __init__(self,
                 in_filters,
                 filters,
                 dilation_rate,
                 separable=True,
                 octave=False,  # not supported
                 use_bias=True,
                 kernel_initializer=None):  # not supported
        super(Conv3DConfigurable, self).__init__()

        # BUGFIX (both raises): `NotImplemented` is a constant, not an
        # exception class; raising it produced a TypeError.
        if octave:
            raise NotImplementedError("Octave convolution not implemented in Pytorch version of Transnet!")
        if kernel_initializer is not None:
            raise NotImplementedError("Kernel initializers are not implemented in Pytorch version of Transnet!")

        assert not (separable and octave)

        if separable:
            # (2+1)D convolution https://arxiv.org/pdf/1711.11248.pdf
            conv1 = nn.Conv3d(in_filters, 2 * filters, kernel_size=(1, 3, 3),
                              dilation=(1, 1, 1), padding=(0, 1, 1), bias=False)
            conv2 = nn.Conv3d(2 * filters, filters, kernel_size=(3, 1, 1),
                              dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 0, 0), bias=use_bias)
            self.layers = nn.ModuleList([conv1, conv2])
        else:
            conv = nn.Conv3d(in_filters, filters, kernel_size=3,
                             dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 1, 1), bias=use_bias)
            self.layers = nn.ModuleList([conv])

    def forward(self, inputs):
        x = inputs
        for layer in self.layers:
            x = layer(x)
        return x
220
+
221
+
222
class FrameSimilarity(nn.Module):
    """Computes, for every frame, cosine similarities to frames inside a
    temporal lookup window and projects them to a fixed-size feature."""

    def __init__(self,
                 in_filters,
                 similarity_dim=128,
                 lookup_window=101,
                 output_dim=128,
                 stop_gradient=False,  # not supported
                 use_bias=False):
        super(FrameSimilarity, self).__init__()

        if stop_gradient:
            # BUGFIX: `NotImplemented` is a constant, not an exception class;
            # raising it produced a TypeError.
            raise NotImplementedError("Stop gradient not implemented in Pytorch version of Transnet!")

        self.projection = nn.Linear(in_filters, similarity_dim, bias=use_bias)
        self.fc = nn.Linear(lookup_window, output_dim)

        self.lookup_window = lookup_window
        assert lookup_window % 2 == 1, "`lookup_window` must be odd integer"

    def forward(self, inputs):
        """``inputs`` is a list of [B, C, T, H, W] feature maps; their
        spatially-averaged channels are concatenated per frame."""
        x = torch.cat([torch.mean(x, dim=[3, 4]) for x in inputs], dim=1)
        x = torch.transpose(x, 1, 2)

        x = self.projection(x)
        # L2-normalize so that the dot products below are cosine similarities.
        x = functional.normalize(x, p=2, dim=2)

        batch_size, time_window = x.shape[0], x.shape[1]
        similarities = torch.bmm(x, x.transpose(1, 2))  # [batch_size, time_window, time_window]
        similarities_padded = functional.pad(similarities, [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2])

        # Index tensors gathering, for each frame, its lookup_window-sized
        # neighbourhood from the padded similarity matrix.
        batch_indices = torch.arange(0, batch_size, device=x.device).view([batch_size, 1, 1]).repeat(
            [1, time_window, self.lookup_window])
        time_indices = torch.arange(0, time_window, device=x.device).view([1, time_window, 1]).repeat(
            [batch_size, 1, self.lookup_window])
        lookup_indices = torch.arange(0, self.lookup_window, device=x.device).view([1, 1, self.lookup_window]).repeat(
            [batch_size, time_window, 1]) + time_indices

        similarities = similarities_padded[batch_indices, time_indices, lookup_indices]
        return functional.relu(self.fc(similarities))
262
+
263
+
264
class ColorHistograms(nn.Module):
    """Computes 512-bin RGB color histograms per frame and, for every frame,
    similarities of its histogram to frames in a temporal lookup window."""

    def __init__(self,
                 lookup_window=101,
                 output_dim=None):
        super(ColorHistograms, self).__init__()

        self.fc = nn.Linear(lookup_window, output_dim) if output_dim is not None else None
        self.lookup_window = lookup_window
        assert lookup_window % 2 == 1, "`lookup_window` must be odd integer"

    @staticmethod
    def compute_color_histograms(frames):
        """Return L2-normalized 512-bin histograms, shape [B, T, 512], for
        uint8 frames of shape [B, T, H, W, 3]."""
        frames = frames.int()

        def get_bin(frames):
            # returns 0 .. 511
            R, G, B = frames[:, :, 0], frames[:, :, 1], frames[:, :, 2]
            # 3 bits per channel -> 9-bit bin index.
            R, G, B = R >> 5, G >> 5, B >> 5
            return (R << 6) + (G << 3) + B

        batch_size, time_window, height, width, no_channels = frames.shape
        assert no_channels == 3
        frames_flatten = frames.view(batch_size * time_window, height * width, 3)

        binned_values = get_bin(frames_flatten)
        # Offset each frame's bins into its own 512-slot segment.
        frame_bin_prefix = (torch.arange(0, batch_size * time_window, device=frames.device) << 9).view(-1, 1)
        # BUGFIX: scatter_add_ requires int64 indices; the int32 tensor
        # produced by the arithmetic above fails at runtime.
        binned_values = (binned_values + frame_bin_prefix).view(-1).long()

        histograms = torch.zeros(batch_size * time_window * 512, dtype=torch.int32, device=frames.device)
        histograms.scatter_add_(0, binned_values, torch.ones(len(binned_values), dtype=torch.int32, device=frames.device))

        histograms = histograms.view(batch_size, time_window, 512).float()
        histograms_normalized = functional.normalize(histograms, p=2, dim=2)
        return histograms_normalized

    def forward(self, inputs):
        x = self.compute_color_histograms(inputs)

        batch_size, time_window = x.shape[0], x.shape[1]
        similarities = torch.bmm(x, x.transpose(1, 2))  # [batch_size, time_window, time_window]
        similarities_padded = functional.pad(similarities, [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2])

        # Index tensors gathering, for each frame, its lookup_window-sized
        # neighbourhood from the padded similarity matrix.
        batch_indices = torch.arange(0, batch_size, device=x.device).view([batch_size, 1, 1]).repeat(
            [1, time_window, self.lookup_window])
        time_indices = torch.arange(0, time_window, device=x.device).view([1, time_window, 1]).repeat(
            [batch_size, 1, self.lookup_window])
        lookup_indices = torch.arange(0, self.lookup_window, device=x.device).view([1, 1, self.lookup_window]).repeat(
            [batch_size, time_window, 1]) + time_indices

        similarities = similarities_padded[batch_indices, time_indices, lookup_indices]

        if self.fc is not None:
            return functional.relu(self.fc(similarities))
        return similarities