bfshi commited on
Commit
c0c592e
·
1 Parent(s): 7e3b296
Files changed (38) hide show
  1. Dockerfile +35 -0
  2. README.md +4 -6
  3. app.py +18 -8
  4. autogaze/__init__.py +1 -0
  5. autogaze/__pycache__/__init__.cpython-310.pyc +0 -0
  6. autogaze/__pycache__/utils.cpython-310.pyc +0 -0
  7. autogaze/datasets/__init__.py +1 -0
  8. autogaze/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  9. autogaze/datasets/__pycache__/video_utils.cpython-310.pyc +0 -0
  10. autogaze/datasets/video_utils.py +133 -0
  11. autogaze/models/__init__.py +1 -0
  12. autogaze/models/__pycache__/__init__.cpython-310.pyc +0 -0
  13. autogaze/models/autogaze/__init__.py +17 -0
  14. autogaze/models/autogaze/__pycache__/__init__.cpython-310.pyc +0 -0
  15. autogaze/models/autogaze/__pycache__/autogaze.cpython-310.pyc +0 -0
  16. autogaze/models/autogaze/__pycache__/configuration_autogaze.cpython-310.pyc +0 -0
  17. autogaze/models/autogaze/__pycache__/modeling_autogaze.cpython-310.pyc +0 -0
  18. autogaze/models/autogaze/__pycache__/modeling_llama_multi_token_pred.cpython-310.pyc +0 -0
  19. autogaze/models/autogaze/autogaze.py +432 -0
  20. autogaze/models/autogaze/configuration_autogaze.py +326 -0
  21. autogaze/models/autogaze/modeling_autogaze.py +431 -0
  22. autogaze/models/autogaze/modeling_llama_multi_token_pred.py +471 -0
  23. autogaze/tasks/__init__.py +1 -0
  24. autogaze/tasks/__pycache__/__init__.cpython-310.pyc +0 -0
  25. autogaze/tasks/video_mae_reconstruction/__init__.py +1 -0
  26. autogaze/tasks/video_mae_reconstruction/__pycache__/__init__.cpython-310.pyc +0 -0
  27. autogaze/tasks/video_mae_reconstruction/__pycache__/configuration_video_mae.cpython-310.pyc +0 -0
  28. autogaze/tasks/video_mae_reconstruction/__pycache__/modeling_video_mae.cpython-310.pyc +0 -0
  29. autogaze/tasks/video_mae_reconstruction/__pycache__/task_video_mae_reconstruction.cpython-310.pyc +0 -0
  30. autogaze/tasks/video_mae_reconstruction/__pycache__/visualize_video_mae_reconstruction.cpython-310.pyc +0 -0
  31. autogaze/tasks/video_mae_reconstruction/configuration_video_mae.py +159 -0
  32. autogaze/tasks/video_mae_reconstruction/modeling_video_mae.py +1412 -0
  33. autogaze/tasks/video_mae_reconstruction/task_video_mae_reconstruction.py +182 -0
  34. autogaze/tasks/video_mae_reconstruction/visualize_video_mae_reconstruction.py +134 -0
  35. autogaze/utils.py +205 -0
  36. demo_utils.py +18 -16
  37. packages.txt +16 -0
  38. requirements.txt +2 -2
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the AutoGaze Gradio demo.
FROM python:3.11-slim

WORKDIR /app

# System dependencies:
#  - git / git-lfs: pulling model weights stored with LFS
#  - ffmpeg + libav*/libsw* dev packages: required to build/run PyAV video decoding
#  - libsm6 / libxext6 / libgl1: runtime libs for imaging stacks (e.g. OpenCV)
#  - cmake / pkg-config / rsync: build tooling for source wheels
RUN apt-get update && apt-get install -y \
    git \
    git-lfs \
    ffmpeg \
    pkg-config \
    libavcodec-dev \
    libavformat-dev \
    libavutil-dev \
    libswscale-dev \
    libswresample-dev \
    libavdevice-dev \
    libavfilter-dev \
    libsm6 \
    libxext6 \
    cmake \
    rsync \
    libgl1 \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install

# Copy requirements first so the pip layer is cached across code-only changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Gradio's default port.
EXPOSE 7860

# Bind to all interfaces so the app is reachable from outside the container.
ENV GRADIO_SERVER_NAME="0.0.0.0"

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,14 +1,12 @@
1
  ---
2
  title: AutoGaze
3
- emoji: 📉
4
  colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
- python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
- short_description: AutoGaze can remove redundant patches in any video.
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: AutoGaze
3
+ emoji: 👀
4
  colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 6.5.1
 
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import tempfile
3
  import os
@@ -8,13 +16,6 @@ import av
8
  from PIL import Image
9
  import numpy as np
10
 
11
- try:
12
- import spaces
13
- ZEROGPU_AVAILABLE = True
14
- except ImportError:
15
- ZEROGPU_AVAILABLE = False
16
- print("Warning: spaces module not available. Running without ZeroGPU support.")
17
-
18
  model_cache = {}
19
 
20
  def get_model(device):
@@ -22,7 +23,16 @@ def get_model(device):
22
  model_cache[device] = load_model(device=device)
23
  return model_cache[device]
24
 
25
- device = "cuda" if torch.cuda.is_available() or ZEROGPU_AVAILABLE else "cpu"
 
 
 
 
 
 
 
 
 
26
 
27
  def cleanup_gpu():
28
  """Clean up GPU memory."""
 
1
+ # IMPORTANT: Import spaces first, before any CUDA-related packages (torch, etc.)
2
+ try:
3
+ import spaces
4
+ ZEROGPU_AVAILABLE = True
5
+ except ImportError:
6
+ ZEROGPU_AVAILABLE = False
7
+ print("Warning: spaces module not available. Running without ZeroGPU support.")
8
+
9
  import gradio as gr
10
  import tempfile
11
  import os
 
16
  from PIL import Image
17
  import numpy as np
18
 
 
 
 
 
 
 
 
19
  model_cache = {}
20
 
21
  def get_model(device):
 
23
  model_cache[device] = load_model(device=device)
24
  return model_cache[device]
25
 
26
+ # Determine device: use CUDA if available locally or if ZeroGPU will provide it
27
+ if ZEROGPU_AVAILABLE:
28
+ device = "cuda" # ZeroGPU will provide GPU
29
+ print("Using ZeroGPU (CUDA device will be allocated on demand)")
30
+ elif torch.cuda.is_available():
31
+ device = "cuda"
32
+ print(f"Using CUDA GPU: {torch.cuda.get_device_name(0)}")
33
+ else:
34
+ device = "cpu"
35
+ print("No GPU available, using CPU")
36
 
37
  def cleanup_gpu():
38
  """Clean up GPU memory."""
autogaze/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """AutoGaze package for video patch reduction."""
autogaze/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (198 Bytes). View file
 
autogaze/__pycache__/utils.cpython-310.pyc ADDED
Binary file (7.09 kB). View file
 
autogaze/datasets/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """AutoGaze datasets and utilities."""
autogaze/datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (196 Bytes). View file
 
autogaze/datasets/__pycache__/video_utils.cpython-310.pyc ADDED
Binary file (4.05 kB). View file
 
autogaze/datasets/video_utils.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Common utilities for video loading and processing."""
2
+
3
+ import av
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
def get_relative_video_path(path):
    """
    Return the last three components of *path* as the video's relative path.

    Args:
        path (str): File path; forward or backward slashes are accepted.
    Returns:
        str: The final three components joined by "/", or the original path
            unchanged when it has fewer than three components.
    """
    components = path.replace("\\", "/").split("/")
    if len(components) < 3:
        return path
    return "/".join(components[-3:])
18
+
19
+
20
def read_video_pyav(container, indices):
    """
    Decode selected frames of a video with the PyAV decoder.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode (assumed ascending).
    Returns:
        result (np.ndarray): Decoded frames of shape (num_frames, height, width, 3).
    """
    # Build a set once for O(1) membership tests; the original scanned the
    # whole `indices` sequence for every decoded frame.
    wanted = {int(i) for i in indices}
    start_index = indices[0]
    end_index = indices[-1]
    container.seek(0)
    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break  # every requested frame has been seen; stop decoding early
        if i >= start_index and i in wanted:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
39
+
40
+
41
def sample_frame_indices(clip_len, frame_sample_rate, seg_len, random_sample_frame=False):
    """
    Sample `clip_len` frame indices from a video.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
        random_sample_frame (`bool`): If True, place the sampling window at a
            random position in the video instead of at the start.
    Returns:
        indices (`List[int]`): List of sampled frame indices.
    """
    span = int(clip_len * frame_sample_rate)
    if seg_len <= span:
        # Video too short: take the leading frames and edge-pad (repeat the
        # last index) up to clip_len.
        available = np.arange(min(clip_len, seg_len))
        pad_amount = max(0, clip_len - len(available))
        return np.pad(available, (0, pad_amount), mode="edge").astype(np.int64)

    if random_sample_frame:
        end_idx = np.random.randint(span, seg_len)
        start_idx = end_idx - span
    else:
        start_idx, end_idx = 0, span

    # Evenly space clip_len samples in [start_idx, end_idx), clipping the
    # final endpoint back inside the window.
    sampled = np.linspace(start_idx, end_idx, num=clip_len)
    return np.clip(sampled, start_idx, end_idx - 1).astype(np.int64)
66
+
67
+
68
def process_video_frames(video, clip_len):
    """
    Pad or truncate a video so it has exactly `clip_len` frames.

    Args:
        video (np.ndarray): Video frames of shape (num_frames, H, W, 3)
        clip_len (int): Target number of frames
    Returns:
        video (np.ndarray): Processed video of shape (clip_len, H, W, 3)
    """
    num_frames = video.shape[0]
    if num_frames < clip_len:
        # Too short: repeat the final frame until the clip is full length.
        filler = np.repeat(video[-1:], clip_len - num_frames, axis=0)
        video = np.concatenate([video, filler], axis=0)
    elif num_frames > clip_len:
        # Too long: keep only the leading clip_len frames.
        video = video[:clip_len]

    assert video.shape[0] == clip_len, (
        f"Video has {video.shape[0]} frames, expected {clip_len}"
    )
    assert video.ndim == 4 and video.shape[-1] == 3, (
        f"Video shape is {video.shape}, expected (clip_len, H, W, 3)"
    )

    return video
97
+
98
+
99
def transform_video_for_pytorch(video, transform=None):
    """
    Apply an optional transform to video frames and convert to PyTorch layout.

    Args:
        video (np.ndarray): Video frames of shape (clip_len, H, W, 3)
        transform: Optional transform (e.g. an HF image processor) whose
            output exposes `.pixel_values`; if None, the raw frames are used.
    Returns:
        torch.Tensor: Transformed video of shape (clip_len, C, H, W).
            (The original docstring claimed np.ndarray; the function has
            always returned a tensor.)
    """
    if transform is not None:
        imgs = transform(list(video)).pixel_values
        # Some processors wrap the frame list in an extra python list.
        img = imgs[0] if isinstance(imgs[0], list) else imgs
        img = np.stack(img)
    else:
        img = video  # fallback: return raw video

    # Ensure output is (clip_len, C, H, W) for pytorch.
    # NOTE(review): this heuristic is ambiguous when H == 3 or W == 3 — the
    # channel axis is guessed from position; confirm inputs avoid that case.
    if img.shape[1] == 3 and img.shape[-1] != 3:
        pass  # already (clip_len, C, H, W)
    elif img.shape[-1] == 3:
        img = np.transpose(img, (0, 3, 1, 2))  # (T, H, W, 3) -> (T, 3, H, W)
    else:
        raise ValueError(f"Unexpected image shape after transform: {img.shape}")

    # The original also asserted img.shape[0] == clip_len with clip_len taken
    # from img.shape[0] itself — a tautology, dropped here.
    assert img.shape[1] == 3, f"Output img shape: {img.shape}"

    return torch.tensor(img)
autogaze/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """AutoGaze models."""
autogaze/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (178 Bytes). View file
 
autogaze/models/autogaze/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .autogaze import AutoGaze
2
+ from .configuration_autogaze import (
3
+ AutoGazeConfig,
4
+ GazeModelConfig,
5
+ VisionModelConfig,
6
+ ConnectorConfig,
7
+ GazeDecoderConfig,
8
+ )
9
+
10
+ __all__ = [
11
+ "AutoGaze",
12
+ "AutoGazeConfig",
13
+ "GazeModelConfig",
14
+ "VisionModelConfig",
15
+ "ConnectorConfig",
16
+ "GazeDecoderConfig",
17
+ ]
autogaze/models/autogaze/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (420 Bytes). View file
 
autogaze/models/autogaze/__pycache__/autogaze.cpython-310.pyc ADDED
Binary file (15.7 kB). View file
 
autogaze/models/autogaze/__pycache__/configuration_autogaze.cpython-310.pyc ADDED
Binary file (9.83 kB). View file
 
autogaze/models/autogaze/__pycache__/modeling_autogaze.cpython-310.pyc ADDED
Binary file (14.2 kB). View file
 
autogaze/models/autogaze/__pycache__/modeling_llama_multi_token_pred.cpython-310.pyc ADDED
Binary file (12.3 kB). View file
 
autogaze/models/autogaze/autogaze.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from copy import deepcopy
3
+ import math
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from contextlib import nullcontext
8
+ from einops import rearrange
9
+ from omegaconf import OmegaConf
10
+
11
+ from transformers.modeling_utils import PreTrainedModel
12
+ from autogaze.utils import get_gazing_pos_from_gazing_mask
13
+ from .modeling_autogaze import AutoGazeModel
14
+ from .configuration_autogaze import AutoGazeConfig
15
+
16
+ class AutoGaze(PreTrainedModel):
17
+ config_class = AutoGazeConfig
18
+
19
    def __init__(self, config: AutoGazeConfig):
        """Build the AutoGaze wrapper around the gazing model.

        Args:
            config: AutoGazeConfig holding gazing-ratio sampling settings,
                the multi-scale token layout, normalization statistics, and
                the sub-config of the underlying AutoGazeModel.
        """
        super().__init__(config)

        self.config = config
        # Sampling configs for the per-video and per-frame gazing ratios.
        self.gazing_ratio_config = config.gazing_ratio_config
        self.gazing_ratio_each_frame_config = config.gazing_ratio_each_frame_config
        # `config.scales` is a '+'-separated string of scales, sorted ascending.
        self.scales = sorted([int(scale) for scale in str(config.scales).split('+')])
        self.num_vision_tokens_each_frame = config.num_vision_tokens_each_frame
        # Split the per-frame token budget across scales proportionally to scale**2.
        self.num_vision_tokens_each_scale_each_frame = [int(scale**2 / sum([scale**2 for scale in self.scales]) * self.num_vision_tokens_each_frame) for scale in self.scales]
        # One gazing step covers `temporal_patch_size` raw video frames.
        self.frame_sampling_rate = config.gaze_model_config.vision_model_config.temporal_patch_size
        self.image_mean = config.image_mean
        self.image_std = config.image_std
        self.attn_mode = config.attn_mode

        # Create the gazing model
        self.gazing_model = AutoGazeModel(config.gaze_model_config)

        # Task loss requirement (whether/how to condition gazing on a target task loss)
        self.has_task_loss_requirement_during_training = config.has_task_loss_requirement_during_training
        self.has_task_loss_requirement_during_inference = config.has_task_loss_requirement_during_inference
        self.task_loss_requirement_config = config.task_loss_requirement_config
40
+
41
+ def get_gazing_ratio(self, sync_across_ranks=True):
42
+ """
43
+ Sample the gazing ratio for the whole video according to the config.
44
+ """
45
+ sample_strategy = self.gazing_ratio_config['sample_strategy_during_training'] if self.training else self.gazing_ratio_config['sample_strategy_during_inference']
46
+ if sample_strategy == 'fixed':
47
+ ratio = self.gazing_ratio_config['fixed']['gazing_ratio']
48
+ elif sample_strategy == 'uniform':
49
+ ratio = random.uniform(self.gazing_ratio_config['uniform']['gazing_ratio_min'], self.gazing_ratio_config['uniform']['gazing_ratio_max'])
50
+ elif sample_strategy == 'exponential':
51
+ ratio = random.expovariate(self.gazing_ratio_config['exponential']['lambda'])
52
+ while ratio < self.gazing_ratio_config['exponential']['gazing_ratio_min'] or ratio > self.gazing_ratio_config['exponential']['gazing_ratio_max']:
53
+ ratio = random.expovariate(self.gazing_ratio_config['exponential']['lambda'])
54
+
55
+ if sync_across_ranks:
56
+ ratio = torch.tensor(ratio).cuda()
57
+ if torch.distributed.is_initialized():
58
+ torch.distributed.broadcast(ratio, src=0) # Make every rank use the same gazing ratio. Otherwise, each rank will have different gazing ratio, and the train/inference time is bounded by the slowest rank (with highest gazing ratio).
59
+ ratio = ratio.item()
60
+
61
+ return ratio
62
+
63
    def get_gazing_ratio_each_frame(self, inputs, video, gazing_ratio_mean, num_frames, temperature, use_cache):
        """
        Sample the gazing ratio for each frame according to the config.

        Strategies:
            'uniform'   — every frame gets `gazing_ratio_mean`.
            'dirichlet' — split num_frames * mean across frames via a
                          Dirichlet draw, clamped to [0, 1] per frame.
            'self'      — let the gazing model itself decide per-frame budgets
                          (batch size 1 only, no KV cache).

        Returns:
            torch.Tensor of shape (num_frames,) with one ratio per frame.
        """
        sample_strategy = self.gazing_ratio_each_frame_config['sample_strategy_during_training'] if self.training else self.gazing_ratio_each_frame_config['sample_strategy_during_inference']
        if sample_strategy == 'uniform':
            gazing_ratio_each_frame = torch.ones(num_frames) * gazing_ratio_mean
        elif sample_strategy == 'dirichlet':
            # Total budget to distribute across frames.
            gazing_ratio_agg = gazing_ratio_mean * num_frames
            alpha = self.gazing_ratio_each_frame_config['dirichlet']['alpha']
            if isinstance(alpha, str):
                # Comma-separated per-frame concentration parameters.
                alpha = torch.tensor([float(a) for a in alpha.split(',')])
                assert len(alpha) == num_frames, "The number of alpha values must be equal to the number of frames"
            gazing_ratio_each_frame = torch.distributions.dirichlet.Dirichlet(torch.ones(num_frames) * alpha).sample() * gazing_ratio_agg
            # A Dirichlet split can push a frame above 1; clamp back to a valid ratio.
            gazing_ratio_each_frame = gazing_ratio_each_frame.clamp(min=0, max=1)
        elif sample_strategy == 'self':
            assert use_cache == False, "using cache is not supported for self-predicted gazing ratio"

            # Only preserve one sample for each group
            if "group_size" in inputs:
                video = rearrange(video, '(g b) t c h w -> g b t c h w', g=inputs["group_size"])[0]

            assert video.shape[0] == 1, "Currently only batch_size=1 is supported because otherwise we need to support different gazing ratio constraints in the same batch in model.generate()"

            # Max gazing ratio for each frame; at least one gaze token per frame.
            max_gazing_ratio_each_frame = torch.ones(num_frames) * gazing_ratio_mean
            max_num_gaze_tokens_each_frame = (max_gazing_ratio_each_frame * self.num_vision_tokens_each_frame).to(torch.long).clamp(min=1)

            # Sample task loss requirement
            task_loss_requirement = self.get_task_loss_requirement(video, force_sampling=True)

            # Sample the gazing (stochastic during training, greedy otherwise).
            with torch.no_grad():
                if self.training:
                    gazing_info = self.gazing_model.generate(
                        video,
                        max_gaze_tokens_each_frame=max_num_gaze_tokens_each_frame,
                        task_loss_requirement=task_loss_requirement,
                        do_sample=True,
                        temperature=temperature,
                    )
                else:
                    gazing_info = self.gazing_model.generate(
                        video,
                        max_gaze_tokens_each_frame=max_num_gaze_tokens_each_frame,
                        task_loss_requirement=task_loss_requirement,
                        do_sample=False,
                    )

            # Count the non-padded gazes the model actually emitted per frame.
            if_padded_gazing = gazing_info["if_padded_gazing"]
            num_gazing_each_frame = gazing_info["num_gazing_each_frame"]
            if_padded_gazing = if_padded_gazing.split(num_gazing_each_frame.tolist(), dim=1)
            num_non_padded_gazing_each_frame = torch.stack([(~if_padded_gazing[i]).sum(dim=-1) for i in range(len(if_padded_gazing))], dim=1)  # (B, num_frames)

            # Batch size is 1 (asserted above), so index the single sample.
            gazing_ratio_each_frame = num_non_padded_gazing_each_frame[0] / self.num_vision_tokens_each_frame
        else:
            raise NotImplementedError(f"Sample strategy {sample_strategy} not implemented.")

        return gazing_ratio_each_frame
122
+
123
+ def get_task_loss_requirement(self, video, sync_across_ranks=True, force_sampling=False):
124
+ """
125
+ Sample the task loss requirement for each frame according to the config.
126
+
127
+ inputs:
128
+ video: tensor of shape (B, T, C, H, W)
129
+ returns:
130
+ task_loss_requirement: tensor of shape (B, T // frame_sampling_rate), representing the task loss requirement for each frame of each video. None if no task loss requirement is used.
131
+ """
132
+ has_task_loss_requirement = self.has_task_loss_requirement_during_training if self.training else self.has_task_loss_requirement_during_inference
133
+ if not has_task_loss_requirement and not force_sampling:
134
+ return None
135
+
136
+ B, T = video.shape[:2]
137
+ sample_strategy = self.task_loss_requirement_config['sample_strategy_during_training'] if self.training else self.task_loss_requirement_config['sample_strategy_during_inference']
138
+ if sample_strategy == 'fixed':
139
+ task_loss_requirement = self.task_loss_requirement_config['fixed']['task_loss_requirement']
140
+ task_loss_requirement = torch.ones(B, T // self.frame_sampling_rate, device=video.device) * task_loss_requirement
141
+ elif sample_strategy == 'uniform':
142
+ task_loss_requirement_min = self.task_loss_requirement_config['uniform']['task_loss_requirement_min']
143
+ task_loss_requirement_max = self.task_loss_requirement_config['uniform']['task_loss_requirement_max']
144
+ task_loss_requirement = random.uniform(task_loss_requirement_min, task_loss_requirement_max)
145
+ task_loss_requirement = torch.ones(B, T // self.frame_sampling_rate, device=video.device) * task_loss_requirement
146
+ else:
147
+ raise NotImplementedError(f"Task loss requirement sample strategy {self.task_loss_requirement_config['sample_strategy']} not implemented")
148
+
149
+ if sync_across_ranks:
150
+ if torch.distributed.is_initialized():
151
+ torch.distributed.broadcast(task_loss_requirement, src=0) # Make every rank use the same gazing ratio. Otherwise, each rank will have different gazing ratio, and the train/inference time is bounded by the slowest rank (with highest gazing ratio).
152
+
153
+ return task_loss_requirement
154
+
155
    def get_mask_from_gazing_pos(self, video, gazing_pos, if_padded_gazing):
        """
        Create the video gazing mask from the gazing positions.

        inputs:
            video: B, T, C, H, W
            gazing_pos: B, N — flat token indices into the per-video token grid
            if_padded_gazing: B, N — True where a position is padding, not a real gaze
        returns:
            mask: list of B * T * N_each_scale (one binary mask tensor per scale)
        """
        B, T = video.shape[:2]
        # One extra "dump" slot at the end so padded positions can be scattered
        # somewhere harmless and then sliced off.
        mask = torch.zeros(B, self.num_vision_tokens_each_frame * (T // self.frame_sampling_rate) + 1, device=video.device)  # +1 for the padded gazing positions
        tmp_gazing_pos = gazing_pos.clone()
        tmp_gazing_pos[if_padded_gazing] = mask.shape[1] - 1  # Set the padded gazing positions to the last position
        # Scatter 1s at every (batch, position) pair in a single advanced-indexing write.
        mask[torch.arange(B)[:, None], tmp_gazing_pos] = 1
        mask = mask[:, :-1]  # Remove the last position (padded gazing positions)
        mask = mask.reshape(B, T // self.frame_sampling_rate, self.num_vision_tokens_each_frame)
        # Split the per-frame token axis into contiguous per-scale segments.
        mask = [mask[:, :, sum(self.num_vision_tokens_each_scale_each_frame[:i]):sum(self.num_vision_tokens_each_scale_each_frame[:i+1])] for i in range(len(self.scales))]  # list of B * T * N_each_scale

        return mask
176
+
177
    def input_res_adapt(self, pixel_values, target_scales, target_patch_size):
        """
        Preprocess the input to adapt to the target scales and patch size.

        Pads the video so it tiles evenly, then splits it into tiles that are
        folded into the batch dimension; `recover_output_from_res_adapt`
        undoes this on the model outputs.

        inputs:
            pixel_values: B, T, C, H, W
            target_scales: per-scale resolutions of the downstream vision model
            target_patch_size: patch size of the downstream vision model
        returns:
            pixel_values: (B * num_tiles) , T, C, tile_H, tile_W
            res_adapt_info: dict, the information of resolution adaptation, for future recovery.
        """
        B, T, C, H, W = pixel_values.shape
        assert H == W == target_scales[-1], "Now we need the input video to be the same size as the largest scale of the vision model"  # FIXME: in the future we should use relative resize ratio as the scales, e.g., 0.125+0.25+0.5+1. In this way we can also support naflex ViT.
        assert len(self.scales) == len(target_scales), "The scales of the gaze model and the vision model must be the same"
        # Side length (in tokens) of one tile at each scale.
        tile_feature_map_size_each_scale = [int(self.num_vision_tokens_each_scale_each_frame[i] ** 0.5) for i in range(len(self.scales))]
        # Feature-map size (in tokens) of the full frame at each target scale.
        original_feature_map_height_each_scale = [target_scales[i] // target_patch_size for i in range(len(target_scales))]
        original_feature_map_width_each_scale = [target_scales[i] // target_patch_size for i in range(len(target_scales))]
        # Tile count is determined by the largest scale.
        num_tiles_height = math.ceil(original_feature_map_height_each_scale[-1] / tile_feature_map_size_each_scale[-1])
        num_tiles_width = math.ceil(original_feature_map_width_each_scale[-1] / tile_feature_map_size_each_scale[-1])
        # Zero-pad bottom/right so the frame divides evenly into tiles.
        pad_H = num_tiles_height * tile_feature_map_size_each_scale[-1] * target_patch_size - H
        pad_W = num_tiles_width * tile_feature_map_size_each_scale[-1] * target_patch_size - W
        pixel_values = F.pad(pixel_values, (0, pad_W, 0, pad_H))
        # Fold tiles into the batch dimension so each tile is processed independently.
        pixel_values = rearrange(pixel_values, 'b t c (nh sh) (nw sw) -> (b nh nw) t c sh sw', nh=num_tiles_height, nw=num_tiles_width)
        res_adapt_info = {
            'tile_feature_map_size_each_scale': tile_feature_map_size_each_scale,
            'original_feature_map_height_each_scale': original_feature_map_height_each_scale,
            'original_feature_map_width_each_scale': original_feature_map_width_each_scale,
            'num_tiles_height': num_tiles_height,
            'num_tiles_width': num_tiles_width,
            'pad_H': pad_H,
            'pad_W': pad_W,
        }

        return pixel_values, res_adapt_info
210
+
211
    def recover_output_from_res_adapt(self, gaze_outputs, res_adapt_info):
        """
        Postprocess the output to recover from resolution adaptation.

        Unfolds the per-tile gazing masks back into full-frame masks, drops
        gazes that landed in the padded border, and re-derives the gazing
        positions / padding flags from the merged masks.

        inputs:
            gaze_outputs: dict, the outputs of the gazing model.
            res_adapt_info: dict, the information of resolution adaptation
                (as produced by `input_res_adapt`).
        returns:
            gaze_outputs: dict, the outputs of the gazing model, updated in place.
        """
        num_tiles_height = res_adapt_info['num_tiles_height']
        num_tiles_width = res_adapt_info['num_tiles_width']
        tile_feature_map_size_each_scale = res_adapt_info['tile_feature_map_size_each_scale']
        original_feature_map_height_each_scale = res_adapt_info['original_feature_map_height_each_scale']
        original_feature_map_width_each_scale = res_adapt_info['original_feature_map_width_each_scale']

        # Recover the gazing mask. Remove the gazing for the padded regions.
        new_gazing_mask = []
        for scale_idx in range(len(gaze_outputs['scales'])):
            cur_gazing_mask = gaze_outputs['gazing_mask'][scale_idx]
            # Unfold tiles out of the batch dim back into spatial position.
            cur_gazing_mask = rearrange(cur_gazing_mask, '(b nh nw) t (sh sw) -> b t (nh sh) (nw sw)', nh=num_tiles_height, nw=num_tiles_width, sh=tile_feature_map_size_each_scale[scale_idx], sw=tile_feature_map_size_each_scale[scale_idx])
            # Crop off the padded border added by input_res_adapt.
            cur_gazing_mask = cur_gazing_mask[:, :, :original_feature_map_height_each_scale[scale_idx], :original_feature_map_width_each_scale[scale_idx]]
            cur_gazing_mask = cur_gazing_mask.flatten(-2, -1)  # (b t (nh sh) (nw sw)) -> (b t (nh sh * nw sw))
            new_gazing_mask.append(cur_gazing_mask)

        # Recover the num_gazing_each_frame and num_vision_tokens_each_frame
        new_num_vision_tokens_each_frame = sum([mask.shape[-1] for mask in new_gazing_mask])

        # Recover the gazing pos, if_padded_gazing, and num_gazing_each_frame, by inferring from the gazing mask. Note this will lose the original order of the gazing!
        new_gazing_mask_all_scales = torch.cat(new_gazing_mask, dim=-1)  # B, T, N
        B, T = new_gazing_mask_all_scales.shape[:2]
        new_gazing_pos, new_if_padded_gazing = get_gazing_pos_from_gazing_mask(new_gazing_mask_all_scales.flatten(0, 1))
        new_gazing_pos, new_if_padded_gazing = rearrange(new_gazing_pos, '(b t) n -> b t n', b=B, t=T), rearrange(new_if_padded_gazing, '(b t) n -> b t n', b=B, t=T)
        # Per frame: the largest non-padded gaze count across the batch.
        max_num_gazing_each_frame = (~new_if_padded_gazing).sum(dim=-1).max(dim=0)[0]
        assert all([torch.all(new_if_padded_gazing[:, t, num:] == True) for t, num in enumerate(max_num_gazing_each_frame)]), "The removed gazing should all be padded."
        # Truncate each frame to its max gaze count, then flatten frames back
        # into one sequence with frame-offset positions.
        new_gazing_pos = [new_gazing_pos[:, t, :num] for t, num in enumerate(max_num_gazing_each_frame)]
        new_if_padded_gazing = [new_if_padded_gazing[:, t, :num] for t, num in enumerate(max_num_gazing_each_frame)]
        new_gazing_pos = [gazing_pos + new_num_vision_tokens_each_frame * t for t, gazing_pos in enumerate(new_gazing_pos)]
        new_gazing_pos, new_if_padded_gazing = torch.cat(new_gazing_pos, dim=1), torch.cat(new_if_padded_gazing, dim=1)
        new_num_gazing_each_frame = max_num_gazing_each_frame

        # Update the outputs
        gaze_outputs['gazing_pos'] = new_gazing_pos
        gaze_outputs['gazing_mask'] = new_gazing_mask
        # NOTE(review): self-assignment is a no-op; presumably left over from
        # an earlier recomputation of frame_sampling_rate.
        gaze_outputs['frame_sampling_rate'] = gaze_outputs['frame_sampling_rate']
        gaze_outputs['num_vision_tokens_each_frame'] = new_num_vision_tokens_each_frame
        gaze_outputs['num_gazing_each_frame'] = new_num_gazing_each_frame
        gaze_outputs['if_padded_gazing'] = new_if_padded_gazing

        # Currently we haven't reordered action probs and task loss prediction based on the new gazing pos, so delete it for now for safety.
        del(gaze_outputs['log_action_probs'])
        del(gaze_outputs['task_loss_prediction'])

        return gaze_outputs
265
+
266
+ #FIXME: separate forward and generate functions
267
+ def forward(
268
+ self,
269
+ inputs,
270
+ target_scales=None,
271
+ target_patch_size=None,
272
+ target_image_mean=None,
273
+ target_image_std=None,
274
+ gazing_info=None,
275
+ temperature=1,
276
+ gazing_ratio=None,
277
+ task_loss_requirement=None,
278
+ generate_only=False,
279
+ use_cache=False,
280
+ past_key_values=None,
281
+ past_inputs_embeds=None,
282
+ past_attention_mask=None,
283
+ past_conv_values=None,
284
+ ):
285
+ """
286
+ inputs:
287
+ video: B, T, C, H, W
288
+ target_scales: list of scales for downstream vision model. If None, then use the scales in the gaze model.
289
+ target_patch_size: patch size for downstream vision model. If None, then use the patch size in the gaze model.
290
+ target_image_mean: image mean for downstream vision model. If None, then use the image mean in the gaze model.
291
+ target_image_std: image std for downstream vision model. If None, then use the image std in the gaze model.
292
+ gazing_info: dict, the ground truth gazing information for NTP pre-training. If None, then run the gazing model to predict gazing positions.
293
+ temperature: temperature for generating gazing.
294
+ gazing_ratio: gazing ratio for the gazing model. If None, then sample the gazing ratio according to the config.
295
+ task_loss_requirement: task loss requirement for the gazing model. If None, then sample the task loss requirement according to the config.
296
+ generate_only: whether to only generate the gazing positions, or to also calculate the probability of taking such gaze.
297
+ use_cache: whether to use the cache for the gazing model.
298
+ past_key_values: the past key values for the gazing model.
299
+ past_inputs_embeds: the past inputs embeds for the gazing model.
300
+ past_attention_mask: the past attention mask for the gazing model.
301
+ past_conv_values: the past conv values for the gazing model.
302
+ returns:
303
+ to_return: dict, the outputs of the gazing model.
304
+ """
305
+ if not generate_only:
306
+ assert past_key_values is None and past_inputs_embeds is None and past_attention_mask is None and past_conv_values is None, \
307
+ "If not in generate-only mode, we don't support past_key_values, past_inputs_embeds, past_attention_mask, and past_conv_values yet."
308
+
309
+ video = inputs['video']
310
+
311
+ # Preprocess the input to fix the image mean and std
312
+ if target_image_mean is not None and target_image_std is not None:
313
+ video = rearrange(video, 'b t c h w -> b t h w c')
314
+ video = video * torch.tensor(target_image_std, device=video.device, dtype=video.dtype) + torch.tensor(target_image_mean, device=video.device, dtype=video.dtype)
315
+ video = video * 2 - 1 # Vivit preprocessor has a rescaling factor of 1/127.5 instead of 1/255, and it has an offset of -1.
316
+ video = (video - torch.tensor(self.image_mean, device=video.device, dtype=video.dtype)) / torch.tensor(self.image_std, device=video.device, dtype=video.dtype)
317
+ video = rearrange(video, 'b t h w c -> b t c h w')
318
+
319
+ # Preprocess the input for resolution adaptation
320
+ if target_scales is not None and target_patch_size is not None:
321
+ if not (target_scales == self.scales and [(scale // target_patch_size) ** 2 for scale in target_scales] == self.num_vision_tokens_each_scale_each_frame):
322
+ video, res_adapt_info = self.input_res_adapt(video, target_scales, target_patch_size)
323
+
324
+ B, T = video.shape[:2]
325
+
326
+ # If gazing_pos is already provided, then directly calculate the probability of taking such gaze. Usually in the cases of calculating pi(a|s) in PPO/GRPO/etc.
327
+ # Otherwise, run the gazing model first to predict gazing positions.
328
+ if gazing_info is None or len(gazing_info) == 0:
329
+ with torch.autocast("cuda", dtype=torch.bfloat16) if self.attn_mode == "flash_attention_2" else nullcontext():
330
+
331
+ if gazing_ratio is not None and task_loss_requirement is not None:
332
+ # If the user specifies the gazing ratio and task loss requirement, then use gazing ratio as the max gazing ratio and use task loss requirement to control when to stop
333
+ if isinstance(gazing_ratio, list):
334
+ assert len(gazing_ratio) == T // self.frame_sampling_rate, "The number of gazing ratios must be equal to the number of frames"
335
+ gazing_ratio = torch.tensor(gazing_ratio)
336
+ gazing_ratio_each_frame = torch.ones(T // self.frame_sampling_rate) * gazing_ratio
337
+ num_gaze_tokens_each_frame = (gazing_ratio_each_frame * self.num_vision_tokens_each_frame).to(torch.long).clamp(min=1)
338
+ task_loss_requirement = torch.ones(B, T // self.frame_sampling_rate, device=video.device) * task_loss_requirement
339
+ elif gazing_ratio is not None:
340
+ # If the user specifies the gazing ratio, then turn off the task loss requirement
341
+ if isinstance(gazing_ratio, list):
342
+ assert len(gazing_ratio) == T // self.frame_sampling_rate, "The number of gazing ratios must be equal to the number of frames"
343
+ gazing_ratio = torch.tensor(gazing_ratio)
344
+ gazing_ratio_each_frame = torch.ones(T // self.frame_sampling_rate) * gazing_ratio
345
+ num_gaze_tokens_each_frame = (gazing_ratio_each_frame * self.num_vision_tokens_each_frame).to(torch.long).clamp(min=1)
346
+ task_loss_requirement = None
347
+ elif task_loss_requirement is not None:
348
+ # If the user specifies the task loss requirement, then turn off the gazing ratio limit
349
+ gazing_ratio = 1
350
+ gazing_ratio_each_frame = torch.ones(T // self.frame_sampling_rate) * gazing_ratio
351
+ num_gaze_tokens_each_frame = (gazing_ratio_each_frame * self.num_vision_tokens_each_frame).to(torch.long).clamp(min=1)
352
+ task_loss_requirement = torch.ones(B, T // self.frame_sampling_rate, device=video.device) * task_loss_requirement
353
+ else:
354
+ gazing_ratio = self.get_gazing_ratio()
355
+ gazing_ratio_each_frame = self.get_gazing_ratio_each_frame(inputs, video, gazing_ratio, T // self.frame_sampling_rate, temperature, use_cache)
356
+ num_gaze_tokens_each_frame = (gazing_ratio_each_frame * self.num_vision_tokens_each_frame).to(torch.long).clamp(min=1)
357
+ task_loss_requirement = self.get_task_loss_requirement(video)
358
+
359
+ if self.training:
360
+ gazing_info = self.gazing_model.generate(
361
+ video,
362
+ max_gaze_tokens_each_frame=num_gaze_tokens_each_frame,
363
+ task_loss_requirement=task_loss_requirement,
364
+ do_sample=True,
365
+ temperature=temperature,
366
+ use_cache=use_cache,
367
+ past_key_values=past_key_values,
368
+ past_inputs_embeds=past_inputs_embeds,
369
+ past_attention_mask=past_attention_mask,
370
+ past_conv_values=past_conv_values,
371
+ )
372
+ else:
373
+ gazing_info = self.gazing_model.generate(
374
+ video,
375
+ max_gaze_tokens_each_frame=num_gaze_tokens_each_frame,
376
+ task_loss_requirement=task_loss_requirement,
377
+ do_sample=False,
378
+ use_cache=use_cache,
379
+ past_key_values=past_key_values,
380
+ past_inputs_embeds=past_inputs_embeds,
381
+ past_attention_mask=past_attention_mask,
382
+ past_conv_values=past_conv_values,
383
+ )
384
+
385
+ # Unpack gazing_info
386
+ gazing_pos = gazing_info["gazing_pos"]
387
+ num_gazing_each_frame = gazing_info["num_gazing_each_frame"]
388
+ if_padded_gazing = gazing_info["if_padded_gazing"]
389
+ task_loss_requirement = gazing_info.get("task_loss_requirement", None)
390
+ new_past_key_values = gazing_info.get("past_key_values", None)
391
+ new_past_inputs_embeds = gazing_info.get("past_inputs_embeds", None)
392
+ new_past_attention_mask = gazing_info.get("past_attention_mask", None)
393
+ new_past_conv_values = gazing_info.get("past_conv_values", None)
394
+
395
+ # Get the log probablity of taking such gaze (log_action_probs)
396
+ if not generate_only:
397
+ with torch.autocast("cuda", dtype=torch.bfloat16):
398
+ forward_outputs = self.gazing_model(video, gazing_info) # B * N
399
+ action_probs = forward_outputs.gaze_probs
400
+ task_loss_prediction = forward_outputs.task_loss_prediction
401
+ log_action_probs = torch.log(action_probs + 1e-8) # B * N
402
+ else:
403
+ log_action_probs = None
404
+ task_loss_prediction = None
405
+
406
+ # Generate (multi-scale) gazing masks for ease of visualization
407
+ mask = self.get_mask_from_gazing_pos(video, gazing_pos, if_padded_gazing)
408
+
409
+ to_return = {
410
+ 'gazing_pos': gazing_pos,
411
+ 'log_action_probs': log_action_probs,
412
+ 'gazing_mask': mask,
413
+ "scales": self.scales,
414
+ "frame_sampling_rate": self.frame_sampling_rate,
415
+ "num_vision_tokens_each_frame": self.num_vision_tokens_each_frame,
416
+ "num_gazing_each_frame": num_gazing_each_frame,
417
+ "if_padded_gazing": if_padded_gazing,
418
+ "task_loss_prediction": task_loss_prediction,
419
+ "has_task_loss_requirement": task_loss_requirement is not None,
420
+ "task_loss_requirement": task_loss_requirement,
421
+ "past_key_values": new_past_key_values if use_cache else None,
422
+ "past_inputs_embeds": new_past_inputs_embeds if use_cache else None,
423
+ "past_attention_mask": new_past_attention_mask if use_cache else None,
424
+ "past_conv_values": new_past_conv_values if use_cache else None,
425
+ }
426
+
427
+ # Postprocess the output to recover from resolution adaptation
428
+ if target_scales is not None and target_patch_size is not None:
429
+ if not (target_scales == self.scales and [(scale // target_patch_size) ** 2 for scale in target_scales] == self.num_vision_tokens_each_scale_each_frame):
430
+ to_return.update(self.recover_output_from_res_adapt(to_return, res_adapt_info))
431
+
432
+ return to_return
autogaze/models/autogaze/configuration_autogaze.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """AutoGaze model configuration"""
3
+
4
+ from transformers.configuration_utils import PretrainedConfig
5
+ from transformers.utils import logging
6
+ from omegaconf import OmegaConf
7
+ from transformers.configuration_utils import PretrainedConfig
8
+ from transformers.modeling_rope_utils import rope_config_validation
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
+
14
class GazeDecoderConfig(PretrainedConfig):
    r"""
    Configuration for the gaze decoder. Based on `LlamaConfig` from transformers.

    Accepts the standard LLaMA arguments, plus two AutoGaze-specific ones:

    Args:
        attn_mode (`str`, *optional*, defaults to `"sdpa"`):
            Attention implementation to use; stored as `_attn_implementation`
            (e.g. `"sdpa"` or `"flash_attention_2"`).
        num_multi_token_pred (`int`, *optional*, defaults to `1`):
            Number of tokens predicted in parallel at each decoding step.
    """
    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `LlamaModel`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        head_dim=None,
        attn_mode="sdpa",
        num_multi_token_pred=1,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility: MHA unless grouped-query attention is requested
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
        self.num_multi_token_pred = num_multi_token_pred
        self._attn_implementation = attn_mode
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, copy it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
102
+
103
+
104
class VisionModelConfig(PretrainedConfig):
    r"""
    Configuration for the vision model component of AutoGaze.

    Args:
        hidden_dim (`int`, *optional*, defaults to `192`):
            Hidden dimension of the vision model.
        out_dim (`int`, *optional*, defaults to `192`):
            Output dimension of the vision model.
        depth (`int`, *optional*, defaults to `1`):
            Depth of the vision model.
        kernel_size (`int`, *optional*, defaults to `16`):
            Kernel size for spatial convolution.
        temporal_patch_size (`int`, *optional*, defaults to `1`):
            Temporal patch size for video processing.
        trunk_temporal_kernel_size (`int`, *optional*, defaults to `3`):
            Temporal kernel size for trunk blocks.
        trunk_spatial_kernel_size (`int`, *optional*, defaults to `3`):
            Spatial kernel size for trunk blocks.
    """

    def __init__(
        self,
        hidden_dim=192,
        out_dim=192,
        depth=1,
        kernel_size=16,
        temporal_patch_size=1,
        trunk_temporal_kernel_size=3,
        trunk_spatial_kernel_size=3,
        **kwargs,
    ):
        # Pure value-holder: every architecture knob is recorded verbatim on
        # the instance before delegating remaining kwargs to PretrainedConfig.
        for attr_name, attr_value in (
            ("hidden_dim", hidden_dim),
            ("out_dim", out_dim),
            ("depth", depth),
            ("kernel_size", kernel_size),
            ("temporal_patch_size", temporal_patch_size),
            ("trunk_temporal_kernel_size", trunk_temporal_kernel_size),
            ("trunk_spatial_kernel_size", trunk_spatial_kernel_size),
        ):
            setattr(self, attr_name, attr_value)

        super().__init__(**kwargs)
144
+
145
+
146
class ConnectorConfig(PretrainedConfig):
    r"""
    Configuration for the connector component between vision encoder and gaze model.

    Args:
        hidden_dim (`int`, *optional*, defaults to `192`):
            Hidden dimension of the connector.
        num_tokens (`int`, *optional*, defaults to `196`):
            Number of tokens the connector handles per frame. `GazeModelConfig`
            overrides this to match the spatial patch grid of the vision model.
    """

    def __init__(
        self,
        hidden_dim=192,
        num_tokens=196,
        **kwargs,
    ):
        self.hidden_dim = hidden_dim
        self.num_tokens = num_tokens

        super().__init__(**kwargs)
165
+
166
+
167
class GazeModelConfig(PretrainedConfig):
    r"""
    Configuration for the gaze model, containing vision model, connector, and decoder configs.

    Args:
        input_img_size (`int`, *optional*, defaults to `224`):
            Input image size.
        vision_model_config (`dict`, *optional*):
            Keyword arguments for `VisionModelConfig`.
        connector_config (`dict`, *optional*):
            Keyword arguments for `ConnectorConfig`. Its `num_tokens` is always
            overridden to match the spatial patch grid.
        gaze_decoder_config (`dict`, *optional*):
            Keyword arguments for `GazeDecoderConfig` (LLaMA-based). Its
            `vocab_size`, `eos_token_id`, and `attn_mode` are always overridden.
        num_vision_tokens_each_frame (`int`, *optional*, defaults to `196`):
            Number of vision tokens per frame; also fixes the decoder vocabulary
            (one id per position plus one EOS id).
        attn_mode (`str`, *optional*, defaults to `"sdpa"`):
            Attention implementation forwarded to the gaze decoder.
    """

    def __init__(
        self,
        input_img_size=224,
        vision_model_config=None,
        connector_config=None,
        gaze_decoder_config=None,
        num_vision_tokens_each_frame=196,
        attn_mode="sdpa",
        **kwargs,
    ):
        # Copy the incoming dicts before the in-place `update`s below.
        # The previous version used mutable `{}` defaults and mutated them,
        # which leaked the overrides across instantiations and back into the
        # caller's dict. `None` defaults keep the call signature compatible.
        vision_model_config = dict(vision_model_config) if vision_model_config else {}
        connector_config = dict(connector_config) if connector_config else {}
        gaze_decoder_config = dict(gaze_decoder_config) if gaze_decoder_config else {}

        self.input_img_size = input_img_size
        self.vision_model_config = VisionModelConfig(**vision_model_config)

        # The connector must emit exactly one token per spatial patch.
        connector_config.update({
            "num_tokens": (input_img_size // self.vision_model_config.kernel_size) ** 2,
        })
        self.connector_config = ConnectorConfig(**connector_config)

        # Decoder vocabulary = one id per vision-token position + 1 EOS id.
        gaze_decoder_config.update({
            "vocab_size": num_vision_tokens_each_frame + 1,
            "eos_token_id": num_vision_tokens_each_frame,
            "attn_mode": attn_mode,
        })
        self.gaze_decoder_config = GazeDecoderConfig(**gaze_decoder_config)

        self.num_vision_tokens_each_frame = num_vision_tokens_each_frame

        super().__init__(**kwargs)
212
+
213
+
214
class AutoGazeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`AutoGaze`] model. It is used to instantiate an
    AutoGaze model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        gazing_ratio_config (`dict`, *optional*):
            Configuration for sampling gazing ratio during training and inference.
        scales (`str` or `int`, *optional*, defaults to `"224"`):
            Scales for the vision model. Can be a single scale or multiple scales separated by '+'.
        num_vision_tokens_each_frame (`int`, *optional*, defaults to `196`):
            Number of vision tokens per frame.
        gaze_model_config (`dict`, *optional*):
            Keyword arguments for `GazeModelConfig`, including vision_model_config,
            connector_config, and gaze_decoder_config.
        gazing_ratio_each_frame_config (`dict`, *optional*):
            Configuration for sampling gazing ratio for each frame.
        has_task_loss_requirement_during_training (`bool`, *optional*, defaults to `False`):
            Whether to use task loss requirement during training.
        has_task_loss_requirement_during_inference (`bool`, *optional*, defaults to `False`):
            Whether to use task loss requirement during inference.
        task_loss_requirement_config (`dict`, *optional*):
            Configuration for task loss requirement sampling.
        image_mean (`list`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
            Image mean for normalization.
        image_std (`list`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
            Image std for normalization.
        use_flash_attn (`bool`, *optional*, defaults to `True`):
            Whether to use flash attention.
        max_batch_size (`int`, *optional*):
            Maximum batch size.

    ```python
    >>> from autogaze.models.autogaze import AutoGaze, AutoGazeConfig

    >>> # Initializing an AutoGaze configuration
    >>> configuration = AutoGazeConfig()

    >>> # Initializing a model from the configuration
    >>> model = AutoGaze(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "autogaze"

    def __init__(
        self,
        gazing_ratio_config=None,
        scales="224",
        num_vision_tokens_each_frame=196,
        gaze_model_config=None,
        gazing_ratio_each_frame_config=None,
        has_task_loss_requirement_during_training=False,
        has_task_loss_requirement_during_inference=False,
        task_loss_requirement_config=None,
        image_mean=None,
        image_std=None,
        use_flash_attn=True,
        max_batch_size=None,
        **kwargs,
    ):
        self.gazing_ratio_config = gazing_ratio_config or {
            "sample_strategy_during_training": "fixed",
            "sample_strategy_during_inference": "fixed",
            "fixed": {"gazing_ratio": 0.5},
            "uniform": {"gazing_ratio_min": 0, "gazing_ratio_max": 1},
            "exponential": {"gazing_ratio_min": 0, "gazing_ratio_max": 1, "lambda": 10},
        }
        self.scales = scales
        self.num_vision_tokens_each_frame = num_vision_tokens_each_frame
        self.attn_mode = "flash_attention_2" if use_flash_attn else "sdpa"

        # Copy before the in-place update: the previous version used a mutable
        # `{}` default and mutated it, leaking the overrides across
        # instantiations and back into the caller's dict.
        gaze_model_config = dict(gaze_model_config) if gaze_model_config else {}
        gaze_model_config.update({
            "num_vision_tokens_each_frame": num_vision_tokens_each_frame,
            "attn_mode": self.attn_mode,
        })
        self.gaze_model_config = GazeModelConfig(**gaze_model_config)

        self.gazing_ratio_each_frame_config = gazing_ratio_each_frame_config or {
            "sample_strategy_during_training": "uniform",
            "sample_strategy_during_inference": "uniform",
            "uniform": {},
            "dirichlet": {"alpha": 0.5},
            "self": {},
        }
        self.has_task_loss_requirement_during_training = has_task_loss_requirement_during_training
        self.has_task_loss_requirement_during_inference = has_task_loss_requirement_during_inference
        self.task_loss_requirement_config = task_loss_requirement_config or {
            "sample_strategy_during_training": "fixed",
            "sample_strategy_during_inference": "fixed",
            "fixed": {"task_loss_requirement": 0.7},
            "uniform": {"task_loss_requirement_min": 0.6, "task_loss_requirement_max": 0.9},
        }
        # ImageNet normalization statistics by default; copy so instances never
        # share one mutable list (the previous defaults were shared lists).
        self.image_mean = list(image_mean) if image_mean is not None else [0.485, 0.456, 0.406]
        self.image_std = list(image_std) if image_std is not None else [0.229, 0.224, 0.225]
        self.use_flash_attn = use_flash_attn
        self.max_batch_size = max_batch_size

        super().__init__(**kwargs)
317
+
318
+
319
# Public API of this configuration module.
__all__ = [
    "AutoGazeConfig",
    "GazeModelConfig",
    "VisionModelConfig",
    "ConnectorConfig",
    "GazeDecoderConfig",
]
326
+
autogaze/models/autogaze/modeling_autogaze.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from typing import Optional, Tuple
3
+ from dataclasses import dataclass
4
+ from einops import rearrange
5
+
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from timm.models.convnext import ConvNeXtBlock
11
+ from timm.layers import LayerNorm2d
12
+
13
+ from transformers.modeling_outputs import ModelOutput
14
+ from transformers import LogitsProcessor, LogitsProcessorList
15
+
16
+ from .configuration_autogaze import GazeModelConfig, VisionModelConfig, ConnectorConfig
17
+ from .modeling_llama_multi_token_pred import LlamaForCausalLM_MultiTokenPred
18
+
19
+
20
@dataclass
class AutoGazeOutput(ModelOutput):
    """Output container for the gaze model (a `ModelOutput` subclass, so fields
    support both attribute and dict-style access)."""

    # Raw logits over gaze positions.
    gaze_logits: Optional[torch.FloatTensor] = None
    # Probability of each generated gaze token (read by the caller as pi(a|s)
    # when computing log action probabilities).
    gaze_probs: Optional[torch.FloatTensor] = None
    # Optional training loss.
    loss: Optional[torch.FloatTensor] = None
    # Generic logits slot inherited by ModelOutput conventions.
    logits: torch.FloatTensor = None
    # KV cache from the decoder, for streaming/incremental generation.
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    # Per-layer hidden states, when requested.
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Per-layer attention maps, when requested.
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Predicted task loss, used against the task-loss-requirement stopping rule.
    task_loss_prediction: Optional[torch.FloatTensor] = None
30
+
31
+
32
class NoRepeatTokensLogitsProcessor(LogitsProcessor):
    """Forbid re-selecting any token already present in `input_ids`.

    Each gaze position may be attended at most once per frame, so every
    previously generated id is masked to -inf in the score tensor (in place).
    """

    def __init__(self):
        super().__init__()

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # input_ids: (batch_size, sequence_length)
        # scores: (batch_size, vocab_size) or (batch_size, num_multi_token_pred, vocab_size)
        batch_rows = torch.arange(scores.shape[0]).unsqueeze(-1)  # (B, 1), broadcasts against input_ids
        neg_inf = -float("inf")
        if scores.ndim == 3:
            # Mask the repeated ids across every multi-token prediction head.
            scores[batch_rows, :, input_ids] = neg_inf
        else:
            scores[batch_rows, input_ids] = neg_inf
        return scores
44
+
45
+
46
class NoEosTokenLogitsProcessor(LogitsProcessor):
    """Disallow the EOS token (last vocabulary index) during gazing.

    Mutates `scores` in place so the final vocab entry can never be sampled.
    """

    def __init__(self):
        super().__init__()

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # input_ids: (batch_size, sequence_length)
        # scores: (batch_size, vocab_size) or (batch_size, num_multi_token_pred, vocab_size)
        # EOS sits at the last vocab index in either layout, so `...` covers both.
        scores[..., -1] = -float("inf")
        return scores
55
+
56
+
57
+ class AutoGazeModel(nn.Module):
58
    def __init__(self, gaze_model_config: GazeModelConfig):
        """Build the gaze model: a shallow video encoder, a connector, and a
        LLaMA-based autoregressive decoder over gaze-position ids.

        Args:
            gaze_model_config: `GazeModelConfig` holding the sub-configs for the
                vision model, connector, and gaze decoder.
        """
        super().__init__()

        # Cache the scalar hyperparameters this module reads repeatedly.
        self.num_vision_tokens_each_frame = gaze_model_config.num_vision_tokens_each_frame
        self.input_img_size = gaze_model_config.input_img_size
        # One decoder step covers `temporal_patch_size` raw video frames.
        self.frame_sampling_rate = gaze_model_config.vision_model_config.temporal_patch_size
        self.num_multi_token_pred = gaze_model_config.gaze_decoder_config.num_multi_token_pred
        self.gaze_decoder_config = gaze_model_config.gaze_decoder_config  # Store for reference

        # Create the vision model, connector, and gaze decoder
        self.vision_model = ShallowVideoConvNet(gaze_model_config.vision_model_config)
        self.connector = Connector(gaze_model_config.connector_config)
        self.gaze_decoder = LlamaForCausalLM_MultiTokenPred(gaze_model_config.gaze_decoder_config)

        # Add logits processors to prevent the model from repeating the same token and generating eos token during gazing.
        self.logits_processor = LogitsProcessorList()
        self.logits_processor.append(NoRepeatTokensLogitsProcessor())  # don't allow repeated gazing
        self.logits_processor.append(NoEosTokenLogitsProcessor())  # don't allow generating eos token during gazing
76
+
77
    def embed(self, video=None, gaze_pos_ids=None, use_cache=False, past_conv_values=None):
        """
        Embed video frames and/or gaze position ids into one interleaved sequence.

        inputs:
            video: (B x T x C x H x W).
            gaze_pos_ids: list of (B, N), N is the number of gazing positions in each frame. The length of the list is T // frame_sampling_rate.
        returns:
            embeds: a list of interleaved vision and gaze embeddings. list of (B, N, C)
            gaze_token_mask: a list of masks that indicate if the current embedding is a gaze embedding. (1 is gaze embedding, 0 is vision embedding). list of (N, )
            gaze_pred_source_relative: a list of (relative) source index of where the gaze prediction is coming from. For example, if the gaze prediction is coming from two tokens before it, the source index is -2. list of (N, ).
                For vision embeddings, there's no source prediction, so the source index is -1.
            attention_mask: a list of (B, N) that indicates if the current embedding should be masked out (for EOS token). 1 is not masked, 0 is masked.

        NOTE(review): despite the `video=None` default and the `video is None`
        guards below, the next line unpacks `video.shape` unconditionally, so
        calling with `video=None` would raise AttributeError — confirm whether
        that path is ever exercised.
        """
        B, T = video.shape[:2]
        assert (video is None or gaze_pos_ids is None) or video.shape[1] // self.frame_sampling_rate == len(gaze_pos_ids), \
            "The number of frames in the video (after subsampling) and in gaze position IDs must be the same, but got {} and {}".format(video.shape[1] // self.frame_sampling_rate, len(gaze_pos_ids))

        if video is not None:
            # Encode frames, then flatten the spatial grid into a token axis.
            vision_features, new_past_conv_values = self.vision_model(video, use_cache=use_cache, past_conv_values=past_conv_values)
            vision_features = vision_features.transpose(1, 2)
            vision_features = rearrange(vision_features, 'b t c h w -> b t (h w) c')
            vision_features = self.connector(vision_features)
            # Vision tokens are never padded, so their attention mask is all ones.
            vision_attention_mask = [torch.ones(B, vision_features.shape[2], device=vision_features.device).long() for _ in range(vision_features.shape[1])]

        if gaze_pos_ids is not None:
            # Concatenate the per-frame id lists so they can be embedded in one
            # pass, then split back into per-frame chunks.
            num_gazing_each_frame = [gaze_pos_ids[t].shape[1] for t in range(len(gaze_pos_ids))]
            gaze_pos_ids = torch.cat(gaze_pos_ids, dim=1)
            # EOS-valued positions are padding: mask them out of attention.
            gaze_attention_mask = (gaze_pos_ids != self.gaze_decoder_config.eos_token_id).to(torch.long)
            gaze_embeds = self.gaze_decoder.model.embed_tokens(gaze_pos_ids)
            gaze_embeds = list(gaze_embeds.split(num_gazing_each_frame, dim=1))
            gaze_attention_mask = list(gaze_attention_mask.split(num_gazing_each_frame, dim=1))

        # Interleave per (subsampled) frame: vision tokens first, then that
        # frame's gaze tokens.
        embeds = []
        gaze_token_mask = []
        gaze_pred_source_relative = []
        attention_mask = []
        for t in range(T // self.frame_sampling_rate):
            if video is not None:
                embeds.append(vision_features[:, t, :, :])
                gaze_token_mask.append(torch.zeros(vision_features.shape[2], device=vision_features.device).long())
                # Vision tokens carry no prediction source: sentinel -1.
                gaze_pred_source_relative.append(torch.zeros(vision_features.shape[2], device=vision_features.device).long() - 1)
                attention_mask.append(vision_attention_mask[t])
            if gaze_pos_ids is not None:
                embeds.append(gaze_embeds[t])
                gaze_token_mask.append(torch.ones(gaze_embeds[t].shape[1], device=gaze_embeds[t].device).long())
                # With multi-token prediction, token k in a group of size
                # `num_multi_token_pred` was predicted (k % group + 1) steps back.
                gaze_pred_source_relative.append(-(torch.arange(gaze_embeds[t].shape[1], device=gaze_embeds[t].device) % self.num_multi_token_pred + 1))
                attention_mask.append(gaze_attention_mask[t])
        return embeds, gaze_token_mask, gaze_pred_source_relative, attention_mask, new_past_conv_values if video is not None else None
124
+
125
    @torch.no_grad()
    def generate(
        self,
        video,
        max_gaze_tokens_each_frame=100,
        task_loss_requirement=None,
        use_cache=False,
        past_key_values=None,
        past_inputs_embeds=None,
        past_attention_mask=None,
        past_conv_values=None,
        **generation_kwargs,
    ):
        """
        Autoregressively generate gaze-position ids, frame by frame.

        Inputs:
            video: (B, T, C, H, W)
            max_gaze_tokens_each_frame: int or (T, ). Indicating the max gazing length for each frame. If is int, then all frames have the same max gazing length.
            task_loss_requirement (optional): (B, T). Indicating the task loss requirement for each frame.
            past_key_values (optional): The past key values for the gaze model. Can be used for streaming generation.
            past_inputs_embeds (optional): The past inputs embeds for the gaze model. Can be used for streaming generation.
            past_attention_mask (optional): The past attention mask for the gaze model. Can be used for streaming generation.

        Returns a dict with the concatenated gaze ids, per-frame counts, the
        padding mask, and (when use_cache) the streaming state for the next call.
        """
        # Streaming state is all-or-nothing: partial state would desynchronize
        # the embeds/mask/KV bookkeeping below.
        if past_key_values is not None or past_inputs_embeds is not None or past_attention_mask is not None or past_conv_values is not None:
            assert past_key_values is not None and past_inputs_embeds is not None and past_attention_mask is not None and past_conv_values is not None, \
                "If past_key_values, past_inputs_embeds, past_attention_mask, or past_conv_values is provided, then all four must be provided!"

        # Subsample frames and resize
        B, T = video.shape[:2]
        video = rearrange(video, 'b t c h w -> (b t) c h w')
        video = F.interpolate(video, size=(self.input_img_size, self.input_img_size), mode="bicubic", align_corners=False)
        video = rearrange(video, '(b t) c h w -> b t c h w', b=B)

        # Embed all the frames (only the embeds and conv cache are needed here).
        video_embeds, _, __, ___, past_conv_values = self.embed(video=video, use_cache=use_cache, past_conv_values=past_conv_values)

        # Generate gaze position IDs for each frame, resuming from any past
        # streaming state.
        gaze_pos_ids_list = []
        inputs_embeds = [] if past_inputs_embeds is None else past_inputs_embeds
        attention_mask = [] if past_attention_mask is None else past_attention_mask
        past_key_values = None if past_key_values is None else past_key_values  # no-op, kept for symmetry with the two lines above
        num_gazing_each_frame = []
        if_padded_gazing = []
        for t in range(len(video_embeds)):

            # Update inputs_embeds and attention mask for the new frame
            inputs_embeds.append(video_embeds[t])
            attention_mask.append(torch.ones(video_embeds[t].shape[0], video_embeds[t].shape[1], device=video_embeds[t].device).long())

            # Put task loss requirement into generation config
            generation_config = self.gaze_decoder.generation_config
            generation_config.task_loss_requirement = task_loss_requirement[:, t] if task_loss_requirement is not None else None

            # Get the max gazing length for the current frame
            assert isinstance(max_gaze_tokens_each_frame, int) or len(max_gaze_tokens_each_frame) == len(video_embeds), \
                "max_gaze_tokens_each_frame must be an int or a tensor of the same length as the video embeddings, but got {} and {}".format(max_gaze_tokens_each_frame, len(video_embeds))
            max_gaze_tokens = max_gaze_tokens_each_frame if isinstance(max_gaze_tokens_each_frame, int) else max_gaze_tokens_each_frame[t]

            # Generate gaze position IDs for the current frame. Gradient
            # checkpointing is incompatible with HF generate, so toggle it off
            # around the call and restore it after.
            is_gradient_checkpointing = self.gaze_decoder.is_gradient_checkpointing
            if is_gradient_checkpointing:
                self.gaze_decoder.gradient_checkpointing_disable()
            gaze_outputs = self.gaze_decoder.generate(
                inputs_embeds=torch.cat(inputs_embeds, dim=1),  # We need to pass the whole sequence of inputs_embeds (both current and past) to the model even when we use use_cache=True!!!
                attention_mask=torch.cat(attention_mask, dim=1),
                position_ids=torch.cat(attention_mask, dim=1).cumsum(dim=-1) - 1,  # positions skip masked (padded) tokens
                max_new_tokens=max_gaze_tokens,
                logits_processor=self.logits_processor,
                pad_token_id=self.gaze_decoder_config.eos_token_id,
                eos_token_id=self.gaze_decoder_config.eos_token_id,
                past_key_values=past_key_values,
                use_cache=True,
                return_dict_in_generate=True,
                generation_config=generation_config,
                **generation_kwargs,
            )
            if is_gradient_checkpointing:
                self.gaze_decoder.gradient_checkpointing_enable()

            # Get the predicted gaze ids
            gaze_pos_ids = gaze_outputs.sequences  # B * N
            # Offset ids so each frame occupies its own id range in the output.
            gaze_pos_ids_list.append(gaze_pos_ids + self.num_vision_tokens_each_frame * t)

            # Update inputs_embeds for the next frame
            inputs_embeds.append(self.gaze_decoder.model.embed_tokens(gaze_pos_ids))

            # Update past_key_values for the next frame
            past_key_values = gaze_outputs.past_key_values

            # Update auxiliary information
            num_gazing_each_frame.append(gaze_pos_ids.shape[1])
            if_padded_gazing.append(gaze_pos_ids == self.gaze_decoder_config.eos_token_id)

            # Update attention mask (padded EOS positions are masked out)
            attention_mask.append((gaze_pos_ids != self.gaze_decoder_config.eos_token_id).to(torch.long))

        # Concatenate gaze position IDs from all frames
        gaze_pos_ids = torch.cat(gaze_pos_ids_list, dim=1)

        # Get auxiliary information
        num_gazing_each_frame = torch.tensor(num_gazing_each_frame, device=gaze_pos_ids.device).to(torch.long)
        if_padded_gazing = torch.cat(if_padded_gazing, dim=1)

        to_return = {
            "gazing_pos": gaze_pos_ids,  # In gaze_pos_ids, the padded gazing positions are not necessarily eos_token_id, so one needs to use if_padded_gazing to determine if the gazing position is padded!!!
            "num_gazing_each_frame": num_gazing_each_frame,
            "if_padded_gazing": if_padded_gazing,
            "task_loss_requirement": task_loss_requirement,
            "past_inputs_embeds": inputs_embeds if use_cache else None,
            "past_attention_mask": attention_mask if use_cache else None,
            "past_key_values": past_key_values if use_cache else None,
            "past_conv_values": past_conv_values if use_cache else None,
        }
        return to_return
238
+
239
    def forward(self, video, gazing_info, **kwargs):
        """Teacher-forced forward pass: embed the video together with the
        (already generated) gaze tokens, run the gaze decoder once over the
        whole sequence, and gather the probability the model assigned to each
        ground-truth gaze position.

        Args:
            video: video tensor of shape (B, T, C, H, W) — established by the
                `rearrange('b t c h w -> (b t) c h w')` call below.
            gazing_info: dict produced by the generation step, with keys
                "gazing_pos" (B x total gaze tokens, frame-offset position ids),
                "num_gazing_each_frame" (per-frame token counts used to split),
                and "if_padded_gazing" (bool mask of padded positions).
            **kwargs: forwarded verbatim to `self.gaze_decoder`.

        Returns:
            AutoGazeOutput with `gaze_probs` (B x N, probability of each
            ground-truth gaze token) and `task_loss_prediction` (B x N),
            plus the raw decoder outputs.
        """
        # Unpack gazing_info
        gaze_pos_ids = gazing_info["gazing_pos"]
        num_gazing_each_frame = gazing_info["num_gazing_each_frame"]
        if_padded_gazing = gazing_info["if_padded_gazing"]

        # Subsample frames and resize
        B, T = video.shape[:2]
        video = rearrange(video, 'b t c h w -> (b t) c h w')
        video = F.interpolate(video, size=(self.input_img_size, self.input_img_size), mode="bicubic", align_corners=False)
        video = rearrange(video, '(b t) c h w -> b t c h w', b=B)

        # Split the gaze frame-wise.  Position ids were globally offset by
        # `num_vision_tokens_each_frame * t` at generation time; undo that here
        # so each frame's ids are local again.
        gaze_pos_ids_split = list(gaze_pos_ids.split(num_gazing_each_frame.tolist(), dim=1))
        gaze_pos_ids_split = [gaze_pos_ids_split[t] - self.num_vision_tokens_each_frame * t for t in range(len(gaze_pos_ids_split))]
        if_padded_gazing_split = list(if_padded_gazing.split(num_gazing_each_frame.tolist(), dim=1))

        # Fill the padded gazing positions with eos_token_id
        # (padded slots in "gazing_pos" are NOT guaranteed to already hold eos).
        gaze_pos_ids_split = [gaze_pos * (~padded) + self.gaze_decoder_config.eos_token_id * padded for gaze_pos, padded in zip(gaze_pos_ids_split, if_padded_gazing_split)]

        # Embed the video and gaze position IDs
        inputs_embeds, gaze_token_mask, gaze_pred_source_relative, attention_mask, _ = self.embed(video=video, gaze_pos_ids=gaze_pos_ids_split)
        inputs_embeds = torch.cat(inputs_embeds, dim=1)  # B * N * C
        gaze_token_mask = torch.cat(gaze_token_mask, dim=0)  # N
        gaze_pred_source_relative = torch.cat(gaze_pred_source_relative, dim=0)  # N
        attention_mask = torch.cat(attention_mask, dim=1)  # B * N

        # Run model forward.  Position ids are derived from the attention mask
        # so that masked (padded) tokens do not advance the position counter.
        outputs = self.gaze_decoder(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=attention_mask.cumsum(dim=-1) - 1,
            **kwargs,
        )

        # Get gaze logits and probs.  The decoder head emits K = num_multi_token_pred
        # vocab distributions per position, packed along the channel dim.
        logits_multi_token_pred = outputs.logits
        task_loss_prediction_multi_token_pred = outputs.task_loss_prediction  # B * N * num_multi_token_pred
        logits_multi_token_pred = rearrange(logits_multi_token_pred, 'b n (k c) -> b n k c', k=self.num_multi_token_pred)
        gaze_probs_all_multi_token_pred = F.softmax(logits_multi_token_pred, dim=-1)

        # Align each k-step-ahead prediction with the position it predicts:
        # head i at position p predicts position p + i + 1, so shift right by
        # i + 1 (probs) / i (task-loss head), zero-filling the front.
        shifted_probs = []
        shifted_task_loss_prediction = []
        for i in range(self.num_multi_token_pred):
            shifted_probs.append(F.pad(gaze_probs_all_multi_token_pred[:, :-(i + 1), i, :], (0, 0, i + 1, 0), value=0))
            shifted_task_loss_prediction.append(F.pad(task_loss_prediction_multi_token_pred[:, :task_loss_prediction_multi_token_pred.shape[1] - i, i], (i, 0), value=0))
        shifted_probs = torch.stack(shifted_probs, dim=2)  # B, N, K, C
        shifted_task_loss_prediction = torch.stack(shifted_task_loss_prediction, dim=2)  # B, N, K

        # For every position, pick the head whose source token actually produced
        # it (gaze_pred_source_relative = relative offset back to the source).
        gaze_probs_all = shifted_probs[:, torch.arange(logits_multi_token_pred.shape[1]), -gaze_pred_source_relative - 1]
        task_loss_prediction = shifted_task_loss_prediction[:, torch.arange(logits_multi_token_pred.shape[1]), (-gaze_pred_source_relative) % self.num_multi_token_pred]  # B, N

        # Keep only positions that are gaze tokens (drop vision tokens).
        gaze_input_token_pos = torch.nonzero(gaze_token_mask, as_tuple=True)[0]
        gaze_probs_all = gaze_probs_all[:, gaze_input_token_pos, :]
        task_loss_prediction = task_loss_prediction[:, gaze_input_token_pos]
        # NOTE(review): B is rebound here (same batch size, new N = #gaze tokens).
        B, N = gaze_probs_all.shape[:2]
        # Gather, per gaze token, the probability assigned to its ground-truth id.
        gaze_probs = gaze_probs_all.reshape(B * N, -1)[torch.arange(B * N), torch.cat(gaze_pos_ids_split, dim=1).flatten()].reshape(B, N)  # [B, T]


        outputs = AutoGazeOutput(
            gaze_probs=gaze_probs,
            loss=outputs.loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            task_loss_prediction=task_loss_prediction,
        )
        return outputs
308
+
309
+
310
+ ################ Shallow Vision Encoder #################
311
+
312
class Conv3dBlockForStreaming(nn.Module):
    """Conv3d + ReLU block with manual temporal padding so it can run either on
    a full clip or frame-by-frame (streaming) with a small temporal cache.

    The Conv3d itself applies no temporal padding; `forward` either zero-pads
    the past (fresh start) or prepends the cached last
    ``temporal_patch_size - 1`` frames from the previous call, so streaming
    outputs match full-clip outputs exactly.
    """

    def __init__(self, hidden_dim, temporal_patch_size, spatial_kernel_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.temporal_patch_size = temporal_patch_size
        self.spatial_kernel_size = spatial_kernel_size

        self.conv3d = nn.Conv3d(
            hidden_dim, hidden_dim,
            kernel_size=(temporal_patch_size, spatial_kernel_size, spatial_kernel_size),
            # "Same" spatial padding (odd kernels); the temporal dimension is
            # padded manually in forward() to support streaming.
            padding=(0, (spatial_kernel_size - 1) // 2, (spatial_kernel_size - 1) // 2),
            bias=True,
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x, use_cache=False, past_conv_values=None):
        """Run the block over ``x`` of shape (B, C, T, H, W).

        Args:
            x: input features, (B, C, T, H, W).
            use_cache: if True and ``past_conv_values`` is given, prepend the
                cached frames instead of zero-padding (streaming mode).
            past_conv_values: cache of shape (B, C, temporal_patch_size - 1, H, W)
                returned by the previous call, or None on the first call.

        Returns:
            Tuple ``(out, new_past_conv_values)`` where the cache holds the last
            ``temporal_patch_size - 1`` (pre-conv) frames for the next call.
        """
        cache_len = self.temporal_patch_size - 1
        if use_cache and past_conv_values is not None:
            x = torch.cat([past_conv_values, x], dim=2)
        else:
            # Fresh start: zero-pad the past so output length equals input length.
            x = F.pad(x, (0, 0, 0, 0, cache_len, 0), value=0)
        # Bug fix: with temporal_patch_size == 1 the original slice
        # `x[:, :, -(temporal_patch_size - 1):]` is `[..., -0:]`, which returns
        # the WHOLE tensor — an ever-growing "cache" that corrupts the concat
        # path on the next streaming step.  An empty cache is correct there,
        # since a temporal kernel of 1 needs no past context.
        new_past_conv_values = x[:, :, x.shape[2] - cache_len:]

        x = self.conv3d(x)

        x = self.relu(x)

        return x, new_past_conv_values
339
+
340
+
341
class ShallowVideoConvNet(nn.Module):
    """
    A shallow video convolutional network for video gaze modeling, inspired by ViViT's patch embedding approach.
    Expects input of shape (B, T, C, H, W) or (B*T, C, H, W).
    """
    def __init__(self, config: VisionModelConfig):
        super().__init__()
        hidden_dim = config.hidden_dim
        out_dim = config.out_dim
        depth = config.depth
        kernel_size = config.kernel_size
        # Optional on the config; defaults to 1 (no temporal downsampling).
        self.temporal_patch_size = getattr(config, "temporal_patch_size", 1)

        # For video, first merge temporal and batch if needed, then apply 3D conv for temporal patching.
        # Stride == kernel size, so this is a non-overlapping patch embedding:
        # T' = T // temporal_patch_size, H'/W' = H/W // kernel_size.
        self.temporal_conv = nn.Conv3d(
            in_channels=3,  # RGB
            out_channels=hidden_dim,
            kernel_size=(self.temporal_patch_size, kernel_size, kernel_size),
            stride=(self.temporal_patch_size, kernel_size, kernel_size),
            bias=True,
        )
        self.norm = nn.LayerNorm(hidden_dim)

        self.trunk_temporal_kernel_size = config.trunk_temporal_kernel_size
        self.trunk_spatial_kernel_size = config.trunk_spatial_kernel_size
        # Streaming-capable trunk: each block keeps its own temporal cache.
        blocks = []
        for i in range(depth):
            blocks.append(
                Conv3dBlockForStreaming(
                    hidden_dim=hidden_dim,
                    temporal_patch_size=self.trunk_temporal_kernel_size,
                    spatial_kernel_size=self.trunk_spatial_kernel_size,
                )
            )
        self.blocks = nn.ModuleList(blocks)

        # 1x1x1 projection to the output channel dimension.
        self.out_proj = nn.Conv3d(
            hidden_dim, out_dim, kernel_size=1, stride=1, bias=True
        )

    def forward(self, x, use_cache=False, past_conv_values=None):
        """Encode a video clip.

        Args:
            x: (B, T, C, H, W) video, or (B*T, C, H, W) which is treated as
                B*T single-frame clips.
            use_cache: streaming mode — feed each block its cached past frames.
            past_conv_values: list of per-block caches from the previous call
                (one entry per trunk block), or None.

        Returns:
            Tuple of (features of shape (B, out_dim, T', H', W'),
            list of new per-block caches).
        """
        # x: (B, T, C, H, W) or (B*T, C, H, W)
        if x.dim() == 5:
            # (B, T, C, H, W) -> (B, C, T, H, W)
            x = x.permute(0, 2, 1, 3, 4)
        elif x.dim() == 4:
            # (B*T, C, H, W) -> (B*T, C, 1, H, W)
            x = x.unsqueeze(2)
        else:
            raise ValueError("Input must be 4D or 5D tensor")
        x = self.temporal_conv(x)  # (B, hidden_dim, T', H', W')
        # Collapse temporal dimension into batch for normalization and blocks
        B, C, T, H, W = x.shape
        x = x.permute(0, 2, 1, 3, 4).contiguous().view(B * T, C, H, W)  # (B*T, C, H, W)
        # Flatten spatial dims for norm: (B*T, C, H*W)
        x = x.view(B * T, C, -1).permute(0, 2, 1)  # (B*T, H*W, C)
        # LayerNorm over the channel dimension (last after the permute).
        x = self.norm(x)
        x = x.permute(0, 2, 1).contiguous().view(B * T, C, H, W)  # (B*T, C, H, W)
        # Reshape back to (B, C, T, H, W)
        x = x.view(B, T, C, H, W).permute(0, 2, 1, 3, 4)
        # Main trunk: thread the per-block streaming caches through.
        new_past_conv_values = []
        for i, block in enumerate(self.blocks):
            x, new_past_conv_values_i = block(
                x,
                use_cache=use_cache,
                past_conv_values=past_conv_values[i] if use_cache and past_conv_values is not None else None
            )
            new_past_conv_values.append(new_past_conv_values_i)
        x = self.out_proj(x)
        # Output shape: (B, out_dim, T', H', W')
        return x, new_past_conv_values
413
+
414
+
415
+ ################ Connector Between Vision Encoder and Gaze Model #################
416
+
417
class Connector(nn.Module):
    """Bridges the vision encoder and the gaze model by adding a learned
    positional embedding to each of the ``num_tokens`` per-frame tokens."""

    def __init__(self, config: "ConnectorConfig"):
        super().__init__()
        self.hidden_dim = config.hidden_dim
        self.num_tokens = config.num_tokens
        # One learned embedding per token position, shared across batch and time.
        self.pos_embed = nn.Parameter(torch.randn(self.num_tokens, self.hidden_dim))

    def forward(self, x):
        """Add positional embeddings to ``x`` of shape (B, T, N, C) and return
        the result (same shape); pos_embed broadcasts over batch and time."""
        return x + self.pos_embed[None, None]
autogaze/models/autogaze/modeling_llama_multi_token_pred.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ import os
21
+ from dataclasses import dataclass
22
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
23
+ from copy import deepcopy
24
+ from einops import rearrange
25
+ from importlib.metadata import version
26
+ from packaging.version import Version
27
+ import torch
28
+ import torch.utils.checkpoint
29
+ from torch import nn
30
+
31
+ from transformers.cache_utils import Cache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.modeling_outputs import (
34
+ BaseModelOutputWithPast,
35
+ CausalLMOutputWithPast,
36
+ )
37
+ from transformers.utils import (
38
+ can_return_tuple,
39
+ )
40
+ from transformers.utils.deprecation import deprecate_kwarg
41
+ from transformers.models.llama.modeling_llama import (
42
+ LlamaModel,
43
+ LlamaPreTrainedModel,
44
+ )
45
+
46
+ from transformers.cache_utils import (
47
+ Cache,
48
+ )
49
+ from transformers.utils import (
50
+ ModelOutput,
51
+ )
52
+ from transformers.generation.configuration_utils import (
53
+ GenerationConfig,
54
+ )
55
+ from transformers.generation.logits_process import (
56
+ LogitsProcessorList,
57
+ )
58
+ from transformers.generation.stopping_criteria import (
59
+ StoppingCriteriaList,
60
+ )
61
+ from transformers.generation.utils import (
62
+ GenerateNonBeamOutput,
63
+ GenerateDecoderOnlyOutput,
64
+ GenerateEncoderDecoderOutput,
65
+ ALL_CACHE_NAMES,
66
+ )
67
+
68
+ if TYPE_CHECKING:
69
+ from transformers.modeling_utils import PreTrainedModel
70
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
71
+ from transformers.generation.streamers import BaseStreamer
72
+
73
# transformers < 4.52 has a different `_get_initial_cache_position` signature;
# `_sample` below branches on this flag to stay compatible with both.
LOW_TRANSFORMERS_VERSION = Version(version("transformers")) < Version("4.52.0")
74
+
75
+
76
# NOTE: this intentionally shadows `transformers.modeling_outputs.CausalLMOutputWithPast`
# (imported above) in order to add the extra `task_loss_prediction` field.
@dataclass
class CausalLMOutputWithPast(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs, extended
    with a per-position task-loss prediction for multi-token gaze decoding.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        task_loss_prediction (`torch.FloatTensor`, *optional*):
            Output of the model's `task_loss_prediction_head`; predicted task
            loss for each of the `num_multi_token_pred` look-ahead positions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    task_loss_prediction: Optional[torch.FloatTensor] = None
111
+
112
+
113
+ class LlamaForCausalLM_MultiTokenPred(LlamaPreTrainedModel, GenerationMixin):
114
+ _tp_plan = {"lm_head": "colwise_rep"}
115
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
116
+
117
    def __init__(self, config):
        """Build a Llama causal LM whose head predicts `config.num_multi_token_pred`
        future tokens at once instead of one."""
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        # The LM head emits num_multi_token_pred vocab distributions per
        # position, packed along the output channel dimension (unpacked with
        # rearrange in the callers).
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size * config.num_multi_token_pred, bias=False)
        # Predicts a scalar task loss for each of the look-ahead positions.
        self.task_loss_prediction_head = nn.Linear(config.hidden_size, config.num_multi_token_pred, bias=False)

        # Initialize weights and apply final processing
        self.post_init()
126
+
127
    # Standard HuggingFace accessor boilerplate (used by resize_token_embeddings,
    # weight tying, model surgery, etc.).

    def get_input_embeddings(self):
        # Token-id -> embedding table of the underlying LlamaModel.
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        # NOTE: the output "embedding" here is the multi-token head
        # (hidden_size -> vocab_size * num_multi_token_pred), so it is NOT
        # tie-able with the input embeddings.
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model
144
+
145
+
146
    @can_return_tuple
    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """
        This function is copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.forward,
        with two changes: logits come from the multi-token head
        (vocab_size * num_multi_token_pred channels per position), and the
        output additionally carries `task_loss_prediction`.
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        task_loss_prediction = self.task_loss_prediction_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            # NOTE(review): `logits` has vocab_size * num_multi_token_pred channels
            # but `loss_function` is called with the single-head `vocab_size` —
            # confirm this labels path is actually exercised, or that the loss
            # function handles the packed layout upstream.
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            task_loss_prediction=task_loss_prediction,
        )
204
+
205
    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
        num_new_tokens: int = 1,
    ) -> Dict[str, Any]:
        """Override of GenerationMixin._update_model_kwargs_for_generation that
        extends masks and cache positions by `num_new_tokens` at once (the
        multi-token head appends several tokens per decoding step)."""
        # update past_key_values keeping its naming used in model code
        for possible_cache_name in ALL_CACHE_NAMES:
            if possible_cache_name in outputs:
                # TODO (joao): remove output/input mismatch when these old models (xlnet, reformer) are deprecated
                if possible_cache_name in ("past_buckets_states", "mems"):
                    cache_name = "past_key_values"
                else:
                    cache_name = possible_cache_name
                model_kwargs[cache_name] = getattr(outputs, possible_cache_name)
                break

        # update token_type_ids with last value, repeated once per new token
        if "token_type_ids" in model_kwargs:
            token_type_ids = model_kwargs["token_type_ids"]
            assert token_type_ids.dim() == 2
            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1).repeat(1, num_new_tokens)], dim=-1)

        if not is_encoder_decoder:
            # update attention mask: the new tokens are all attended to
            if "attention_mask" in model_kwargs:
                attention_mask = model_kwargs["attention_mask"]
                model_kwargs["attention_mask"] = torch.cat(
                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], num_new_tokens))], dim=-1
                )
        else:
            # update decoder attention mask
            if "decoder_attention_mask" in model_kwargs:
                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
                model_kwargs["decoder_attention_mask"] = torch.cat(
                    [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], num_new_tokens))],
                    dim=-1,
                )

        # Since we generate multiple tokens at once, the number of new tokens > 1 and all those tokens need to
        # be cached later.
        if model_kwargs.get("use_cache", True):
            # With cache: cache_position holds only the positions of the new tokens.
            model_kwargs["cache_position"] = torch.arange(
                model_kwargs["cache_position"][-1] + 1, model_kwargs["cache_position"][-1] + num_new_tokens + 1, dtype=model_kwargs["cache_position"].dtype
            ).to(model_kwargs["cache_position"].device)
        else:
            # Without cache: positions accumulate (full sequence is re-fed each step).
            past_positions = model_kwargs.pop("cache_position")
            new_positions = torch.arange(
                past_positions[-1] + 1, past_positions[-1] + num_new_tokens + 1, dtype=past_positions.dtype
            ).to(past_positions.device)
            model_kwargs["cache_position"] = torch.cat((past_positions, new_positions))
        return model_kwargs
258
+
259
    def _sample(
        self,
        input_ids: torch.LongTensor,
        logits_processor: LogitsProcessorList,
        stopping_criteria: StoppingCriteriaList,
        generation_config: GenerationConfig,
        synced_gpus: bool,
        streamer: Optional["BaseStreamer"] = None,
        **model_kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        """
        This function is copied from transformers.generation.utils.GenerationMixin._sample,
        modified so each decoding step emits up to `config.num_multi_token_pred`
        tokens at once from the multi-token head, with extra stopping logic:
        a per-sequence `task_loss_requirement` threshold, the max-new-tokens
        budget, and a no-repeat constraint within one multi-token step.
        """
        # init values
        pad_token_id = generation_config._pad_token_tensor
        output_attentions = generation_config.output_attentions
        output_hidden_states = generation_config.output_hidden_states
        output_scores = generation_config.output_scores
        output_logits = generation_config.output_logits
        return_dict_in_generate = generation_config.return_dict_in_generate
        do_sample = generation_config.do_sample
        # Custom field carried on the generation config: per-sequence task-loss
        # threshold under which generation stops (or None to disable).
        task_loss_requirement = generation_config.task_loss_requirement

        # init attention / hidden states / scores tuples
        scores = () if (return_dict_in_generate and output_scores) else None
        raw_logits = () if (return_dict_in_generate and output_logits) else None
        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
        if return_dict_in_generate and self.config.is_encoder_decoder:
            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
            encoder_hidden_states = (
                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
            )

        # keep track of which sequences are already finished
        batch_size, cur_len = input_ids.shape
        this_peer_finished = False
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
        # `_get_initial_cache_position` changed signature in transformers 4.52.
        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) if LOW_TRANSFORMERS_VERSION else self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)

        model_forward = self.__call__
        if isinstance(model_kwargs.get("past_key_values"), Cache):
            is_compileable = model_kwargs["past_key_values"].is_compileable and self._supports_static_cache
            if getattr(self, "hf_quantizer", None) is not None:
                is_compileable &= self.hf_quantizer.is_compileable
            is_compileable = is_compileable and not generation_config.disable_compile
            if is_compileable and (
                self.device.type == "cuda" or generation_config.compile_config._compile_all_devices
            ):
                os.environ["TOKENIZERS_PARALLELISM"] = "0"
                model_forward = self.get_compiled_call(generation_config.compile_config)

        if generation_config.prefill_chunk_size is not None:
            model_kwargs = self._prefill_chunking(input_ids, generation_config, **model_kwargs)
            is_prefill = False
        else:
            is_prefill = True

        is_first_token = True
        final_past_key_values = None
        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
            # prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            # prepare variable output controls (note: some models won't accept all output controls)
            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})

            if is_prefill:
                outputs = self(**model_inputs, return_dict=True)
                is_prefill = False
            else:
                outputs = model_forward(**model_inputs, return_dict=True)

            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs,
                model_kwargs,
                is_encoder_decoder=self.config.is_encoder_decoder,
                num_new_tokens=self.config.num_multi_token_pred,
            )

            if synced_gpus and this_peer_finished:
                continue

            # Deepcopy the to-return past_key_values, such that it won't change during the dummy model forwards when this peer is finished but peers from other devices are still generating.
            # NOTE(review): deepcopy of the full KV cache every step is costly for long contexts.
            final_past_key_values = deepcopy(model_kwargs.get("past_key_values"))

            # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
            # (the clone itself is always small)
            next_token_logits_all = outputs.logits[:, -1, :].to(copy=True, dtype=torch.float32, device=input_ids.device)
            task_loss_prediction_all = outputs.task_loss_prediction[:, -1, :].to(copy=True, dtype=torch.float32, device=input_ids.device)

            # Process all new tokens at once: unpack the packed multi-token head
            # into (batch, num_multi_token_pred, vocab).
            next_token_logits_all = rearrange(next_token_logits_all, 'b (n v) -> b n v', n=self.config.num_multi_token_pred, v=self.config.vocab_size)

            # pre-process distribution
            next_token_scores_all = logits_processor(input_ids, next_token_logits_all)

            # token selection: pick one token per look-ahead head, sequentially,
            # so later heads can be masked against earlier picks.
            next_tokens_all = []
            early_stopped = False
            for i in range(self.config.num_multi_token_pred):
                next_token_scores_i = next_token_scores_all[:, i, :]

                # exit early if meeting max number of token or not token left to choose
                if len(next_tokens_all) + input_ids.shape[1] >= generation_config.max_new_tokens:
                    early_stopped = True
                    break
                if torch.all(next_token_scores_i == -float("inf")):
                    early_stopped = True
                    break

                if do_sample:
                    probs = nn.functional.softmax(next_token_scores_i, dim=-1)
                    # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
                    next_tokens_i = torch.multinomial(probs, num_samples=1).squeeze(1)
                else:
                    next_tokens_i = torch.argmax(next_token_scores_i, dim=-1)

                next_tokens_all.append(next_tokens_i)

                # avoid repeating gazing: mask the chosen token out of all later heads
                next_token_scores_all[torch.arange(next_tokens_i.shape[0]), i + 1:, next_tokens_i] = -float("inf")

            next_tokens_all = torch.stack(next_tokens_all, dim=1)

            # already finished sentences should have their next token be a padding token
            next_tokens_all = next_tokens_all * unfinished_sequences[..., None] + pad_token_id * (1 - unfinished_sequences[..., None])

            # Mark finished if task loss requirement is met (the predicted task
            # loss has dropped below the requested threshold).
            meet_task_loss_requirement = torch.zeros_like(next_tokens_all, dtype=torch.bool)
            if task_loss_requirement is not None:
                meet_task_loss_requirement = task_loss_prediction_all[:, :next_tokens_all.shape[1]] <= task_loss_requirement[..., None]
                if is_first_token:
                    # Always emit at least one token on the very first step.
                    meet_task_loss_requirement[:, 0] = False
                next_tokens_all = next_tokens_all * (~meet_task_loss_requirement) + pad_token_id * meet_task_loss_requirement

            # Truncate the next tokens to the max new tokens
            meet_max_new_tokens = False
            if next_tokens_all.shape[1] + input_ids.shape[1] >= generation_config.max_new_tokens:
                next_tokens_all = next_tokens_all[:, :generation_config.max_new_tokens - input_ids.shape[1]]
                meet_max_new_tokens = True

            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens_all], dim=-1)
            if streamer is not None:
                for i in range(next_tokens_all.shape[1]):
                    streamer.put(next_tokens_all[:, i].cpu())

            # Update the finishing flags.
            # NOTE(review): `meet_max_new_tokens`/`early_stopped` are Python bools;
            # `tensor & ~bool` works here only because the values are 0/1
            # (1 & ~True == 1 & -2 == 0) — fragile but correct for this dtype.
            unfinished_sequences = unfinished_sequences & ~torch.any(meet_task_loss_requirement, dim=-1) & ~meet_max_new_tokens & ~early_stopped
            this_peer_finished = unfinished_sequences.max() == 0
            # NOTE(review): cur_len advances by 1 per step even though several
            # tokens may have been appended; it is unused after this point.
            cur_len += 1

            is_first_token = False

            # Store scores, attentions and hidden_states when required
            if return_dict_in_generate:
                if output_scores:
                    scores += (next_token_scores_all,)
                if output_logits:
                    raw_logits += (next_token_logits_all,)
                if output_attentions:
                    decoder_attentions += (
                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
                    )
                    if self.config.is_encoder_decoder:
                        cross_attentions += (outputs.cross_attentions,)

                if output_hidden_states:
                    decoder_hidden_states += (
                        (outputs.decoder_hidden_states,)
                        if self.config.is_encoder_decoder
                        else (outputs.hidden_states,)
                    )



            # This is needed to properly delete outputs.logits which may be very large for first iteration
            # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
            del outputs

        if streamer is not None:
            streamer.end()

        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                return GenerateEncoderDecoderOutput(
                    sequences=input_ids,
                    scores=scores,
                    logits=raw_logits,
                    encoder_attentions=encoder_attentions,
                    encoder_hidden_states=encoder_hidden_states,
                    decoder_attentions=decoder_attentions,
                    cross_attentions=cross_attentions,
                    decoder_hidden_states=decoder_hidden_states,
                    past_key_values=final_past_key_values,
                )
            else:
                return GenerateDecoderOnlyOutput(
                    sequences=input_ids,
                    scores=scores,
                    logits=raw_logits,
                    attentions=decoder_attentions,
                    hidden_states=decoder_hidden_states,
                    past_key_values=final_past_key_values,
                )
        else:
            return input_ids
autogaze/tasks/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """AutoGaze tasks."""
autogaze/tasks/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (176 Bytes). View file
 
autogaze/tasks/video_mae_reconstruction/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .task_video_mae_reconstruction import VideoMAEReconstruction
autogaze/tasks/video_mae_reconstruction/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (250 Bytes). View file
 
autogaze/tasks/video_mae_reconstruction/__pycache__/configuration_video_mae.cpython-310.pyc ADDED
Binary file (5.86 kB). View file
 
autogaze/tasks/video_mae_reconstruction/__pycache__/modeling_video_mae.cpython-310.pyc ADDED
Binary file (43.9 kB). View file
 
autogaze/tasks/video_mae_reconstruction/__pycache__/task_video_mae_reconstruction.cpython-310.pyc ADDED
Binary file (6.91 kB). View file
 
autogaze/tasks/video_mae_reconstruction/__pycache__/visualize_video_mae_reconstruction.cpython-310.pyc ADDED
Binary file (3.44 kB). View file
 
autogaze/tasks/video_mae_reconstruction/configuration_video_mae.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ViT MAE model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class ViTMAEConfig(PretrainedConfig):
25
+ r"""
26
+ This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT
27
+ MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with
28
+ the defaults will yield a similar configuration to that of the ViT
29
+ [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture.
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+
35
+ Args:
36
+ hidden_size (`int`, *optional*, defaults to 768):
37
+ Dimensionality of the encoder layers and the pooler layer.
38
+ num_hidden_layers (`int`, *optional*, defaults to 12):
39
+ Number of hidden layers in the Transformer encoder.
40
+ num_attention_heads (`int`, *optional*, defaults to 12):
41
+ Number of attention heads for each attention layer in the Transformer encoder.
42
+ intermediate_size (`int`, *optional*, defaults to 3072):
43
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
45
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
46
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
47
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
48
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
49
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
50
+ The dropout ratio for the attention probabilities.
51
+ initializer_range (`float`, *optional*, defaults to 0.02):
52
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
53
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
54
+ The epsilon used by the layer normalization layers.
55
+ image_size (`int`, *optional*, defaults to 224):
56
+ The size (resolution) of each image.
57
+ patch_size (`int`, *optional*, defaults to 16):
58
+ The size (resolution) of each patch.
59
+ num_channels (`int`, *optional*, defaults to 3):
60
+ The number of input channels.
61
+ qkv_bias (`bool`, *optional*, defaults to `True`):
62
+ Whether to add a bias to the queries, keys and values.
63
+ decoder_num_attention_heads (`int`, *optional*, defaults to 16):
64
+ Number of attention heads for each attention layer in the decoder.
65
+ decoder_hidden_size (`int`, *optional*, defaults to 512):
66
+ Dimensionality of the decoder.
67
+ decoder_num_hidden_layers (`int`, *optional*, defaults to 8):
68
+ Number of hidden layers in the decoder.
69
+ decoder_intermediate_size (`int`, *optional*, defaults to 2048):
70
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
71
+ mask_ratio (`float`, *optional*, defaults to 0.75):
72
+ The ratio of the number of masked tokens in the input sequence.
73
+ norm_pix_loss (`bool`, *optional*, defaults to `False`):
74
+ Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved
75
+ representation quality in the experiments of the authors.
76
+
77
+ Example:
78
+
79
+ ```python
80
+ >>> from transformers import ViTMAEConfig, ViTMAEModel
81
+
82
+ >>> # Initializing a ViT MAE vit-mae-base style configuration
83
+ >>> configuration = ViTMAEConfig()
84
+
85
+ >>> # Initializing a model (with random weights) from the vit-mae-base style configuration
86
+ >>> model = ViTMAEModel(configuration)
87
+
88
+ >>> # Accessing the model configuration
89
+ >>> configuration = model.config
90
+ ```"""
91
+
92
+ model_type = "vit_mae"
93
+
94
+ def __init__(
95
+ self,
96
+ hidden_size=768,
97
+ num_hidden_layers=12,
98
+ num_attention_heads=12,
99
+ intermediate_size=3072,
100
+ hidden_act="gelu",
101
+ hidden_dropout_prob=0.0,
102
+ attention_probs_dropout_prob=0.0,
103
+ initializer_range=0.02,
104
+ layer_norm_eps=1e-12,
105
+ image_size=224,
106
+ patch_size=16,
107
+ num_channels=3,
108
+ qkv_bias=True,
109
+ decoder_num_attention_heads=16,
110
+ decoder_hidden_size=512,
111
+ decoder_num_hidden_layers=8,
112
+ decoder_intermediate_size=2048,
113
+ mask_ratio=0.75,
114
+ norm_pix_loss=False,
115
+ scales='224',
116
+ loss_type='l1',
117
+ loss_weights='1',
118
+ l1_loss_config=None,
119
+ dinov2_reg_loss_config=None,
120
+ siglip2_loss_config=None,
121
+ scale_embed=True,
122
+ max_num_frames=256,
123
+ time_embed=True,
124
+ causal=True,
125
+ **kwargs,
126
+ ):
127
+ super().__init__(**kwargs)
128
+
129
+ self.hidden_size = hidden_size
130
+ self.num_hidden_layers = num_hidden_layers
131
+ self.num_attention_heads = num_attention_heads
132
+ self.intermediate_size = intermediate_size
133
+ self.hidden_act = hidden_act
134
+ self.hidden_dropout_prob = hidden_dropout_prob
135
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
136
+ self.initializer_range = initializer_range
137
+ self.layer_norm_eps = layer_norm_eps
138
+ self.image_size = image_size
139
+ self.patch_size = patch_size
140
+ self.num_channels = num_channels
141
+ self.qkv_bias = qkv_bias
142
+ self.decoder_num_attention_heads = decoder_num_attention_heads
143
+ self.decoder_hidden_size = decoder_hidden_size
144
+ self.decoder_num_hidden_layers = decoder_num_hidden_layers
145
+ self.decoder_intermediate_size = decoder_intermediate_size
146
+ self.mask_ratio = mask_ratio
147
+ self.norm_pix_loss = norm_pix_loss
148
+ self.scales = scales
149
+ self.loss_type = loss_type
150
+ self.loss_weights = loss_weights
151
+ self.l1_loss_config = l1_loss_config
152
+ self.dinov2_reg_loss_config = dinov2_reg_loss_config
153
+ self.siglip2_loss_config = siglip2_loss_config
154
+ self.scale_embed = scale_embed
155
+ self.max_num_frames = max_num_frames
156
+ self.time_embed = time_embed
157
+ self.causal = causal
158
+
159
+ __all__ = ["ViTMAEConfig"]
autogaze/tasks/video_mae_reconstruction/modeling_video_mae.py ADDED
@@ -0,0 +1,1412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch ViT MAE (masked autoencoder) model."""
16
+
17
+ import collections.abc
18
+ from copy import deepcopy
19
+ from dataclasses import dataclass
20
+ from typing import Callable, Optional, Set, Tuple, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ import torch.nn.functional as F
27
+ from einops import rearrange
28
+ from transformers.activations import ACT2FN
29
+ from transformers.modeling_outputs import BaseModelOutput
30
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
31
+ from transformers.pytorch_utils import prune_linear_layer
32
+ from transformers.utils import (
33
+ ModelOutput,
34
+ add_start_docstrings,
35
+ add_start_docstrings_to_model_forward,
36
+ logging,
37
+ replace_return_docstrings,
38
+ torch_int,
39
+ )
40
+ from .configuration_video_mae import ViTMAEConfig
41
+
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+
46
+ def find_pruneable_heads_and_indices(heads, n_heads, head_size, already_pruned_heads):
47
+ """
48
+ Finds the heads and their indices taking `already_pruned_heads` into account.
49
+
50
+ Args:
51
+ heads (`Set[int]`): A set of head indices we want to prune.
52
+ n_heads (`int`): The number of heads in the model.
53
+ head_size (`int`): The size of each head.
54
+ already_pruned_heads (`Set[int]`): A set of already pruned heads.
55
+
56
+ Returns:
57
+ `Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices.
58
+ """
59
+ mask = torch.ones(n_heads, head_size)
60
+ heads = set(heads) - already_pruned_heads
61
+ for head in heads:
62
+ head = head - sum(1 if h < head else 0 for h in already_pruned_heads)
63
+ mask[head] = 0
64
+ mask = mask.view(-1).contiguous().eq(1)
65
+ index = torch.arange(len(mask))[mask].long()
66
+ return heads, index
67
+
68
+
69
+ @dataclass
70
+ class ViTMAEModelOutput(ModelOutput):
71
+ """
72
+ Class for ViTMAEModel's outputs, with potential hidden states and attentions.
73
+
74
+ Args:
75
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
76
+ Sequence of hidden-states at the output of the last layer of the model.
77
+ mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
78
+ Tensor indicating which patches are masked (1) and which are not (0).
79
+ ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
80
+ Tensor containing the original index of the (shuffled) masked patches.
81
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
82
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
83
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
84
+ plus the initial embedding outputs.
85
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
86
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
87
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
88
+ the self-attention heads.
89
+ """
90
+
91
+ last_hidden_state: Optional[torch.FloatTensor] = None
92
+ mask: Optional[torch.LongTensor] = None
93
+ ids_restore: Optional[torch.LongTensor] = None
94
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
95
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
96
+
97
+
98
+ @dataclass
99
+ class ViTMAEDecoderOutput(ModelOutput):
100
+ """
101
+ Class for ViTMAEDecoder's outputs, with potential hidden states and attentions.
102
+
103
+ Args:
104
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
105
+ Pixel reconstruction logits.
106
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
107
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
108
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
109
+ plus the initial embedding outputs.
110
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
111
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
112
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
113
+ the self-attention heads.
114
+ """
115
+
116
+ logits: Optional[torch.FloatTensor] = None
117
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
118
+ num_decoded_tokens_each_frame: Optional[torch.LongTensor] = None
119
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
120
+
121
+
122
+ @dataclass
123
+ class ViTMAEForPreTrainingOutput(ModelOutput):
124
+ """
125
+ Class for ViTMAEForPreTraining's outputs, with potential hidden states and attentions.
126
+
127
+ Args:
128
+ loss_each_reconstruction_frame (`torch.FloatTensor` of shape `(batch_size, num_selected_frames)`):
129
+ Pixel reconstruction loss for each reconstruction frame.
130
+ loss_mean (`torch.FloatTensor` of shape `(1,)`):
131
+ Mean of the pixel reconstruction loss for each reconstruction frame.
132
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
133
+ Pixel reconstruction logits.
134
+ mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
135
+ Tensor indicating which patches are masked (1) and which are not (0).
136
+ ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
137
+ Tensor containing the original index of the (shuffled) masked patches.
138
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
139
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
140
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
141
+ plus the initial embedding outputs.
142
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
143
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
144
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
145
+ the self-attention heads.
146
+ """
147
+
148
+ loss_each_reconstruction_frame: Optional[torch.FloatTensor] = None
149
+ loss_mean: Optional[torch.FloatTensor] = None
150
+ reconstruction: Optional[torch.FloatTensor] = None
151
+ logits: Optional[torch.FloatTensor] = None
152
+ mask: Optional[torch.LongTensor] = None
153
+ ids_restore: Optional[torch.LongTensor] = None
154
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
155
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
156
+
157
+
158
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False):
159
+ """
160
+ Create 2D sin/cos positional embeddings.
161
+
162
+ Args:
163
+ embed_dim (`int`):
164
+ Embedding dimension.
165
+ grid_size (`int`):
166
+ The grid height and width.
167
+ add_cls_token (`bool`, *optional*, defaults to `False`):
168
+ Whether or not to add a classification (CLS) token.
169
+
170
+ Returns:
171
+ (`torch.FloatTensor` of shape (grid_size*grid_size, embed_dim) or (1+grid_size*grid_size, embed_dim): the
172
+ position embeddings (with or without classification token)
173
+ """
174
+ grid_h = np.arange(grid_size, dtype=np.float32)
175
+ grid_w = np.arange(grid_size, dtype=np.float32)
176
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
177
+ grid = np.stack(grid, axis=0)
178
+
179
+ grid = grid.reshape([2, 1, grid_size, grid_size])
180
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
181
+ if add_cls_token:
182
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
183
+ return pos_embed
184
+
185
+
186
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
187
+ if embed_dim % 2 != 0:
188
+ raise ValueError("embed_dim must be even")
189
+
190
+ # use half of dimensions to encode grid_h
191
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
192
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
193
+
194
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
195
+ return emb
196
+
197
+
198
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
199
+ """
200
+ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
201
+ """
202
+ if embed_dim % 2 != 0:
203
+ raise ValueError("embed_dim must be even")
204
+
205
+ omega = np.arange(embed_dim // 2, dtype=float)
206
+ omega /= embed_dim / 2.0
207
+ omega = 1.0 / 10000**omega # (D/2,)
208
+
209
+ pos = pos.reshape(-1) # (M,)
210
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
211
+
212
+ emb_sin = np.sin(out) # (M, D/2)
213
+ emb_cos = np.cos(out) # (M, D/2)
214
+
215
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
216
+ return emb
217
+
218
+
219
+ class ViTMAEEmbeddings(nn.Module):
220
+ """
221
+ Construct the CLS token, position and patch embeddings.
222
+
223
+ """
224
+
225
+ def __init__(self, config):
226
+ super().__init__()
227
+
228
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
229
+ self.patch_embeddings = ViTMAEPatchEmbeddings(config)
230
+ self.num_patches = self.patch_embeddings.num_patches
231
+ # fixed sin-cos embedding
232
+ self.position_embeddings = nn.Parameter(
233
+ torch.zeros(1, self.num_patches + 1, config.hidden_size), requires_grad=False
234
+ )
235
+ self.patch_size = config.patch_size
236
+ self.config = config
237
+
238
+ # multi-scale setting
239
+ self.scales = sorted([int(scale) for scale in config.scales.split('+')])
240
+ self.num_patch_each_scale = [(scale // config.patch_size)**2 for scale in self.scales]
241
+ if config.scale_embed:
242
+ self.scale_embed = nn.Parameter(torch.randn(len(self.scales), config.hidden_size) * 0)
243
+
244
+ # time embedding
245
+ if config.time_embed:
246
+ self.time_embed = nn.Parameter(torch.randn(config.max_num_frames, config.hidden_size) * 0)
247
+
248
+ def initialize_weights(self):
249
+ # initialize (and freeze) position embeddings by sin-cos embedding
250
+ pos_embed = get_2d_sincos_pos_embed(
251
+ self.position_embeddings.shape[-1], int(self.patch_embeddings.num_patches**0.5), add_cls_token=True
252
+ )
253
+ self.position_embeddings.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
254
+
255
+ # initialize patch_embeddings like nn.Linear (instead of nn.Conv2d)
256
+ w = self.patch_embeddings.projection.weight.data
257
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
258
+
259
+ # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
260
+ torch.nn.init.normal_(self.cls_token, std=self.config.initializer_range)
261
+
262
+ # initialize scale embed
263
+ if self.config.scale_embed:
264
+ torch.nn.init.normal_(self.scale_embed, std=self.config.initializer_range)
265
+
266
+ # initialize time embed
267
+ if self.config.time_embed:
268
+ torch.nn.init.normal_(self.time_embed, std=self.config.initializer_range)
269
+
270
+ # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
271
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
272
+ """
273
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
274
+ images. This method is also adapted to support torch.jit tracing.
275
+
276
+ Adapted from:
277
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
278
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
279
+ """
280
+
281
+ num_patches = embeddings.shape[1] - 1
282
+ num_positions = self.position_embeddings.shape[1] - 1
283
+
284
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
285
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
286
+ return self.position_embeddings
287
+
288
+ class_pos_embed = self.position_embeddings[:, :1]
289
+ patch_pos_embed = self.position_embeddings[:, 1:]
290
+
291
+ dim = embeddings.shape[-1]
292
+
293
+ new_height = height // self.patch_size
294
+ new_width = width // self.patch_size
295
+
296
+ sqrt_num_positions = torch_int(num_positions**0.5)
297
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
298
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
299
+
300
+ patch_pos_embed = nn.functional.interpolate(
301
+ patch_pos_embed,
302
+ size=(new_height, new_width),
303
+ mode="bicubic",
304
+ align_corners=False,
305
+ )
306
+
307
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
308
+
309
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
310
+
311
+ def mask_with_gazing(self, sequence, gazing_info):
312
+ """
313
+ Mask the sequence with the gazing information.
314
+ For the padded gazing, we select a dummy token to fill in the positions (the dummy token is currently the first token in each sequence).
315
+
316
+ Args:
317
+ sequence: The sequence to mask.
318
+ gazing_info:
319
+ gazing_pos: The gazing positions of each whole sequence. (B, N)
320
+ num_gazing_each_frame: The number of gazing positions for each frame, including the padded gazing. (T, )
321
+ if_padded_gazing: Whether the gazing is padded. (B, N)
322
+ """
323
+ gazing_pos = gazing_info['gazing_pos'].clone()
324
+ num_gazing_each_frame = gazing_info['num_gazing_each_frame'].clone()
325
+ if_padded_gazing = gazing_info['if_padded_gazing'].clone()
326
+
327
+ B, seq_length, dim = sequence.shape
328
+ gaze_length = gazing_pos.shape[1]
329
+ assert gaze_length == num_gazing_each_frame.sum()
330
+
331
+ # Record the original sequence length into gazing_info
332
+ gazing_info['original_seq_length'] = seq_length
333
+
334
+ # Pad the sequence with an additional token for padded gazing to select
335
+ sequence = torch.cat([sequence, sequence[:, :1]], dim=1)
336
+
337
+ # Change all the padded gazing id to the last token id
338
+ gazing_pos = gazing_pos.flatten()
339
+ gazing_pos[if_padded_gazing.flatten()] = seq_length
340
+ gazing_pos = gazing_pos.view(B, -1)
341
+
342
+ # Get the unmasked part of the sequence for MAE encoding
343
+ sequence_unmasked = sequence[torch.arange(B)[..., None], gazing_pos]
344
+
345
+ return sequence_unmasked
346
+
347
+ def forward(self, pixel_values, gazing_info=None, noise=None, interpolate_pos_encoding: bool = False):
348
+ """
349
+ pixel_values: (B, T, C, H, W)
350
+ """
351
+ B, T = pixel_values.shape[:2]
352
+ pixel_values = rearrange(pixel_values, 'b t c h w -> (b t) c h w')
353
+
354
+ embeddings = []
355
+ for i, scale in enumerate(self.scales):
356
+ pixel_values_cur_scale = F.interpolate(pixel_values, size=(scale, scale), mode="bicubic", align_corners=False)
357
+ embeddings_cur_scale = self.patch_embeddings(pixel_values_cur_scale, interpolate_pos_encoding=interpolate_pos_encoding)
358
+ if interpolate_pos_encoding:
359
+ position_embeddings_cur_scale = self.interpolate_pos_encoding(embeddings_cur_scale, scale, scale)
360
+ else:
361
+ position_embeddings_cur_scale = self.position_embeddings
362
+
363
+ # add position embeddings w/o cls token
364
+ embeddings_cur_scale = embeddings_cur_scale + position_embeddings_cur_scale[:, 1:, :]
365
+
366
+ # add scale embedding
367
+ if self.config.scale_embed:
368
+ scale_embeddings_cur_scale = self.scale_embed[i][None, None]
369
+ embeddings_cur_scale = embeddings_cur_scale + scale_embeddings_cur_scale
370
+
371
+ embeddings.append(embeddings_cur_scale)
372
+ embeddings = torch.cat(embeddings, dim=1) # (B * T) * N * C
373
+
374
+ # add time embedding
375
+ embeddings = rearrange(embeddings, '(b t) n c -> b t n c', b=B, t=T) # B * T * N * C
376
+ if self.config.time_embed:
377
+ time_embeddings = self.time_embed[None, :T, None, :] # 1 * T * 1 * C
378
+ embeddings = embeddings + time_embeddings
379
+
380
+ embeddings = rearrange(embeddings, 'b t n c -> b (t n) c') # B * (T * N) * C
381
+
382
+ # masking: length -> length * config.mask_ratio
383
+ embeddings = self.mask_with_gazing(embeddings, gazing_info)
384
+
385
+ # append cls token
386
+ cls_token = self.cls_token + self.position_embeddings[:, :1, :]
387
+ cls_tokens = cls_token.expand(embeddings.shape[0], -1, -1)
388
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
389
+
390
+ return embeddings
391
+
392
+
393
+ class ViTMAEPatchEmbeddings(nn.Module):
394
+ """
395
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
396
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
397
+ Transformer.
398
+ """
399
+
400
+ def __init__(self, config):
401
+ super().__init__()
402
+ image_size, patch_size = config.image_size, config.patch_size
403
+ num_channels, hidden_size = config.num_channels, config.hidden_size
404
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
405
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
406
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
407
+ self.image_size = image_size
408
+ self.patch_size = patch_size
409
+ self.num_channels = num_channels
410
+ self.num_patches = num_patches
411
+
412
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
413
+
414
+ def forward(self, pixel_values, interpolate_pos_encoding: bool = False):
415
+ batch_size, num_channels, height, width = pixel_values.shape
416
+ if num_channels != self.num_channels:
417
+ raise ValueError(
418
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
419
+ )
420
+
421
+ if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]):
422
+ raise ValueError(
423
+ f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
424
+ )
425
+ x = self.projection(pixel_values).flatten(2).transpose(1, 2)
426
+ return x
427
+
428
+
429
+ # Copied from transformers.models.vit.modeling_vit.eager_attention_forward
430
+ def eager_attention_forward(
431
+ module: nn.Module,
432
+ query: torch.Tensor,
433
+ key: torch.Tensor,
434
+ value: torch.Tensor,
435
+ attention_mask: Optional[torch.Tensor],
436
+ scaling: float,
437
+ dropout: float = 0.0,
438
+ **kwargs,
439
+ ):
440
+ # Take the dot product between "query" and "key" to get the raw attention scores.
441
+ attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
442
+
443
+ # Normalize the attention scores to probabilities.
444
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
445
+
446
+ # This is actually dropping out entire tokens to attend to, which might
447
+ # seem a bit unusual, but is taken from the original Transformer paper.
448
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
449
+
450
+ # Mask heads if we want to
451
+ if attention_mask is not None:
452
+ attn_weights = attn_weights * attention_mask
453
+
454
+ attn_output = torch.matmul(attn_weights, value)
455
+ attn_output = attn_output.transpose(1, 2).contiguous()
456
+
457
+ return attn_output, attn_weights
458
+
459
+
460
+ # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention ViT->ViTMAE
461
# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention ViT->ViTMAE
class ViTMAESelfAttention(nn.Module):
    """Multi-head self-attention with a pluggable attention backend (eager / sdpa / flash / flex)."""

    def __init__(self, config: ViTMAEConfig) -> None:
        super().__init__()
        # A config carrying `embedding_size` is exempt from the divisibility check,
        # mirroring the upstream ViT implementation.
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5  # 1/sqrt(d_head)
        self.is_causal = False  # ViT attention is bidirectional

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape (batch, seq, hidden) -> (batch, heads, seq, head_dim)."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """Run attention; returns `(context,)` or `(context, attention_probs)` when `output_attentions`."""
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        # Select the attention backend configured on the model; default is the eager path.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
                # NOTE(review): despite the warning text promising an eager fallback, this path
                # aborts instead; `assert` is also stripped under `python -O`. Confirm whether a
                # real fallback (or an explicit raise) is intended.
                assert False, "SDPA doesn't support output_attentions=True. If falling back to eager, please change the attention mask implementation."
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        # Merge heads back: (batch, seq, heads, head_dim) -> (batch, seq, hidden).
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
522
+
523
+
524
+ # Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTMAE
525
# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTMAE
class ViTMAESelfOutput(nn.Module):
    """
    Output projection applied after self-attention. The residual connection lives in
    ViTMAELayer rather than here (unlike most models), because the block uses
    pre-layernorm ordering.
    """

    def __init__(self, config: ViTMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # `input_tensor` is accepted only for interface parity; the residual add
        # happens in the caller (ViTMAELayer).
        return self.dropout(self.dense(hidden_states))
541
+
542
+
543
+ # Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTMAE
544
# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTMAE
class ViTMAEAttention(nn.Module):
    """Self-attention plus output projection, with support for pruning attention heads."""

    def __init__(self, config: ViTMAEConfig) -> None:
        super().__init__()
        self.attention = ViTMAESelfAttention(config)
        self.output = ViTMAESelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        """Remove the given heads from the q/k/v and output projections and update bookkeeping."""
        if not heads:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Shrink the linear projections along the pruned-head dimension.
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep derived sizes in sync and remember which heads are gone.
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads | set(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        attn_results = self.attention(hidden_states, head_mask, output_attentions)
        projected = self.output(attn_results[0], hidden_states)
        # Re-attach the attention probabilities when they were requested.
        return (projected,) + attn_results[1:]
581
+
582
+
583
+ # Copied from transformers.models.vit.modeling_vit.ViTIntermediate ViT->ViTMAE
584
# Copied from transformers.models.vit.modeling_vit.ViTIntermediate ViT->ViTMAE
class ViTMAEIntermediate(nn.Module):
    """MLP up-projection followed by the configured activation function."""

    def __init__(self, config: ViTMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # The config may name the activation (looked up in ACT2FN) or supply a callable directly.
        act = config.hidden_act
        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.intermediate_act_fn(self.dense(hidden_states))
598
+
599
+
600
+ # Copied from transformers.models.vit.modeling_vit.ViTOutput ViT->ViTMAE
601
# Copied from transformers.models.vit.modeling_vit.ViTOutput ViT->ViTMAE
class ViTMAEOutput(nn.Module):
    """MLP down-projection with dropout; also applies the block's second residual connection."""

    def __init__(self, config: ViTMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection back to the block input.
        return projected + input_tensor
614
+
615
+
616
+ # Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTMAE,VIT->VITMAE
617
# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTMAE,VIT->VITMAE
class ViTMAELayer(nn.Module):
    """One pre-norm transformer block (attention + MLP); corresponds to timm's Block class."""

    def __init__(self, config: ViTMAEConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViTMAEAttention(config)
        self.intermediate = ViTMAEIntermediate(config)
        self.output = ViTMAEOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        # Pre-norm attention with residual: in ViTMAE, layernorm comes before self-attention.
        normed = self.layernorm_before(hidden_states)
        attn_out = self.attention(normed, head_mask, output_attentions=output_attentions)
        residual = attn_out[0] + hidden_states

        # Pre-norm MLP; the second residual connection is applied inside self.output.
        mlp_out = self.output(self.intermediate(self.layernorm_after(residual)), residual)

        # Append the attention probabilities when they were requested.
        return (mlp_out,) + attn_out[1:]
657
+
658
+
659
+ # Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTMAE
660
# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTMAE
class ViTMAEEncoder(nn.Module):
    """Stack of ViTMAELayer blocks with optional gradient checkpointing."""

    def __init__(self, config: ViTMAEConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViTMAELayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        """Run every block in sequence, optionally collecting hidden states / attentions."""
        hidden_trace = () if output_hidden_states else None
        attn_trace = () if output_attentions else None

        for block in self.layer:
            if output_hidden_states:
                hidden_trace = hidden_trace + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Trade compute for memory while training.
                block_outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    head_mask,
                    output_attentions,
                )
            else:
                block_outputs = block(hidden_states, head_mask, output_attentions)

            hidden_states = block_outputs[0]
            if output_attentions:
                attn_trace = attn_trace + (block_outputs[1],)

        if output_hidden_states:
            hidden_trace = hidden_trace + (hidden_states,)

        if not return_dict:
            return tuple(v for v in (hidden_states, hidden_trace, attn_trace) if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=hidden_trace,
            attentions=attn_trace,
        )
707
+
708
+
709
class ViTMAEPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ViTMAEConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    # Advertise the attention backends this model supports to the loading machinery.
    _supports_sdpa = True
    _supports_flash_attn_2 = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, ViTMAEEmbeddings):
            # The embedding module owns its own (sin-cos / token) initialization.
            module.initialize_weights()
        elif isinstance(module, ViTMAEDecoder):
            # Decoder specials: mask token and (frozen) positional table start at zero;
            # the optional scale/time embeddings get a normal init when enabled in the config.
            module.mask_token.data.zero_()
            module.decoder_pos_embed.data.zero_()
            if self.config.scale_embed:
                torch.nn.init.normal_(module.decoder_scale_embed, std=self.config.initializer_range)
            if self.config.time_embed:
                torch.nn.init.normal_(module.time_embed, std=self.config.initializer_range)
743
+
744
+
745
class ViTMAEModel(ViTMAEPreTrainedModel):
    """Bare ViT-MAE encoder: (gaze-aware) embeddings -> transformer encoder -> final layernorm."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = ViTMAEEmbeddings(config)
        self.encoder = ViTMAEEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the patch-embedding module used to embed pixel inputs."""
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        gazing_info: Optional[dict] = None,
        noise: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Union[Tuple, ViTMAEModelOutput]:
        """Encode `pixel_values` into a sequence of hidden states.

        Args:
            pixel_values: input images; required (raises ValueError when None).
            gazing_info: extra gaze metadata forwarded to the embedding module.
            noise: optional noise forwarded to the embedding module.
            head_mask: multiplicative attention head mask (1.0 keeps a head), or None.

        Returns:
            `ViTMAEModelOutput`, or a plain tuple when `return_dict=False`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # head_mask is multiplied into the attention probabilities (1.0 keeps a head).
        # Fix: the previous unconditional `.to(self.dtype)` crashed with AttributeError
        # on the documented default `head_mask=None`; downstream attention handles None.
        if head_mask is not None:
            head_mask = head_mask.to(self.dtype)

        embedding_output = self.embeddings(
            pixel_values, gazing_info=gazing_info, noise=noise, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            # Fix: the tensor must be wrapped in a tuple before concatenation;
            # `tensor + tuple` raises TypeError.
            return (sequence_output,) + encoder_outputs[1:]

        return ViTMAEModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
819
+
820
+
821
class ViTMAEDecoder(ViTMAEPreTrainedModel):
    """Multi-scale MAE decoder that scatters encoded (gazed) tokens back into the full
    token grid, fills the rest with a learned mask token, and reconstructs pixel patches
    for selected frames."""

    def __init__(self, config, num_patches):
        super().__init__(config)
        self.decoder_embed = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
        self.decoder_pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + 1, config.decoder_hidden_size), requires_grad=False
        )  # fixed sin-cos embedding

        # Decoder reuses the encoder block definition with its own (smaller) dimensions.
        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [ViTMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        )

        self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
        self.decoder_pred = nn.Linear(
            config.decoder_hidden_size, config.patch_size**2 * config.num_channels, bias=True
        )  # encoder to decoder
        self.gradient_checkpointing = False
        self.config = config

        # multi-scale setting: `config.scales` is a '+'-separated list of image scales;
        # each scale contributes (scale/patch_size)^2 patch tokens per frame.
        self.scales = sorted([int(scale) for scale in config.scales.split('+')])
        self.num_patch_each_frame_each_scale = [(scale // config.patch_size)**2 for scale in self.scales]
        if self.config.scale_embed:
            # Initialized to zeros here; _init_weights gives it a normal init.
            self.decoder_scale_embed = nn.Parameter(torch.randn(len(self.scales), config.decoder_hidden_size) * 0)

        # time embed
        if self.config.time_embed:
            self.time_embed = nn.Parameter(torch.randn(config.max_num_frames, config.decoder_hidden_size) * 0)

        self.num_token_each_frame = sum(self.num_patch_each_frame_each_scale)

        self.initialize_weights(num_patches)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor) -> torch.Tensor:
        """
        This method is a modified version of the interpolation function for ViT-mae model at the decoder, that
        allows to interpolate the pre-trained decoder position encodings, to be able to use the model on higher
        resolution images.

        Adapted from:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        # -1 removes the class dimension since we later append it without interpolation
        embeddings_positions = embeddings.shape[1] - 1

        # Separation of class token and patch tokens
        class_pos_embed = self.decoder_pos_embed[:, :1]
        patch_pos_embed = self.decoder_pos_embed[:, 1:]

        # To retain the final 3d tensor with the required dimensions
        dim = self.decoder_pos_embed.shape[-1]

        # Increasing a dimension to enable bicubic interpolation
        patch_pos_embed = patch_pos_embed.reshape(1, 1, -1, dim)

        # permute to bring the dimension to be interpolated, to the last
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        # Interpolating the decoder position embeddings shape wrt embeddings shape i.e (x).
        # we keep the second last dimension constant
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(patch_pos_embed.shape[-2], embeddings_positions),
            mode="bicubic",
            align_corners=False,
        )

        # Converting back to the original shape
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        # Adding the class token back
        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def initialize_weights(self, num_patches):
        """Fill the frozen positional table with 2D sin-cos values and init the mask token."""
        # initialize (and freeze) position embeddings by sin-cos embedding
        decoder_pos_embed = get_2d_sincos_pos_embed(
            self.decoder_pos_embed.shape[-1], int(num_patches**0.5), add_cls_token=True
        )
        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        torch.nn.init.normal_(self.mask_token, std=self.config.initializer_range)

    def forward(
        self,
        hidden_states,
        gazing_info=None,
        frame_idx_to_reconstruct=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        interpolate_pos_encoding: bool = False,
    ):
        """Decode encoded gaze tokens into per-patch pixel predictions.

        Args:
            hidden_states: encoder output, (B, 1 + num_gazed, hidden); position 0 is the CLS token.
            gazing_info: dict with 'gazing_pos' (flat token indices of gazed tokens),
                'num_gazing_each_frame', 'if_padded_gazing' (bool mask of padded entries)
                and 'original_seq_length' (total tokens over all frames). Required despite
                the None default — the first line indexes into it.
            frame_idx_to_reconstruct: iterable of frame indices whose full token grid is decoded.
            head_mask: per-head attention mask.
                NOTE(review): `head_mask.to(self.dtype)` below fails with AttributeError when
                head_mask is None despite the None default — confirm callers always pass one,
                or add a None guard.

        Returns:
            `ViTMAEDecoderOutput` with per-token patch logits (CLS removed), or a tuple
            when `return_dict=False`.
        """
        gazing_pos = gazing_info['gazing_pos']
        num_gazing_each_frame = gazing_info['num_gazing_each_frame']
        if_padded_gazing = gazing_info['if_padded_gazing']
        original_seq_length = gazing_info['original_seq_length']

        B = hidden_states.shape[0]
        gaze_length = gazing_pos.shape[1]
        assert gaze_length == num_gazing_each_frame.sum()
        T = len(num_gazing_each_frame)
        original_seq_length_each_frame = original_seq_length // T

        # embed tokens
        x = self.decoder_embed(hidden_states)

        # Take out cls token
        x_ = x[:, 1:, :]
        cls_token = x[:, :1, :]

        # Change all the padded gazing id to the last token id
        gazing_pos = gazing_pos.flatten()
        gazing_pos[if_padded_gazing.flatten()] = original_seq_length
        gazing_pos = gazing_pos.view(B, -1)

        # add mask tokens back to the sequence (temporarily append an additional token for padded gazing to select)
        full_seq = self.mask_token.repeat(x.shape[0], original_seq_length + 1, 1).to(x.dtype)
        full_seq[torch.arange(B)[..., None], gazing_pos] = x_
        full_seq = full_seq[:, :-1, :]

        # add pos embed and scale embed
        full_seq = rearrange(full_seq, 'b (t n) c -> (b t) n c', t=T)
        decoder_pos_embed = []
        decoder_scale_embed = []
        for i, scale in enumerate(self.scales):
            # Tokens of scale i occupy a contiguous slice within each frame.
            x_cur_scale = full_seq[:, sum(self.num_patch_each_frame_each_scale[:i]):sum(self.num_patch_each_frame_each_scale[:i+1])]
            if interpolate_pos_encoding:
                # Pad a dummy CLS slot so interpolate_pos_encoding sees the expected layout.
                decoder_pos_embed_cur_scale = self.interpolate_pos_encoding(F.pad(x_cur_scale, (0, 0, 1, 0)))[:, 1:]
            else:
                decoder_pos_embed_cur_scale = self.decoder_pos_embed
            decoder_pos_embed.append(decoder_pos_embed_cur_scale)
            if self.config.scale_embed:
                decoder_scale_embed.append(self.decoder_scale_embed[i][None, None].repeat(1, decoder_pos_embed_cur_scale.shape[1], 1))
        decoder_pos_embed = torch.cat(decoder_pos_embed, dim=1)
        decoder_scale_embed = torch.cat(decoder_scale_embed, dim=1) if self.config.scale_embed else 0
        full_seq = full_seq + decoder_pos_embed + decoder_scale_embed
        full_seq = rearrange(full_seq, '(b t) n c -> b (t n) c', t=T)

        # add time embed
        if self.config.time_embed:
            time_embed = self.time_embed[None, :T, None, :]
            full_seq = rearrange(full_seq, 'b (t n) c -> b t n c', t=T)
            full_seq = full_seq + time_embed
            full_seq = rearrange(full_seq, 'b t n c -> b (t n) c', t=T)

        # Get the index of tokens to feed into decoder (encoded tokens + mask tokens for selected frames)
        idx_to_decode = gazing_pos.clone()
        idx_to_decode = list(idx_to_decode.split(num_gazing_each_frame.tolist(), dim=-1))
        for frame_idx in frame_idx_to_reconstruct:
            # For reconstructed frames, decode every token of the frame, not just gazed ones.
            idx_to_decode[frame_idx] = torch.arange(original_seq_length_each_frame, device=gazing_pos.device)[None].repeat(B, 1) + original_seq_length_each_frame * frame_idx
        idx_to_decode = torch.cat(idx_to_decode, dim=-1)

        # Get the tokens to decode
        # (append one extra slot so indices equal to original_seq_length — the padded-gaze
        # sentinel — select a harmless token)
        full_seq = torch.cat([full_seq, full_seq[:, :1]], dim=1)
        hidden_states = full_seq[torch.arange(B)[..., None], idx_to_decode]

        # add cls token
        cls_token = cls_token + self.decoder_pos_embed[:, :1]
        hidden_states = torch.cat([cls_token, hidden_states], dim=1)

        # apply Transformer layers (blocks)
        head_mask = head_mask.to(self.dtype)
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.decoder_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, head_mask=head_mask, output_attentions=output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.decoder_norm(hidden_states)

        # predictor projection
        logits = self.decoder_pred(hidden_states)

        # remove cls token
        logits = logits[:, 1:, :]

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
        return ViTMAEDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
1030
+
1031
+
1032
+ class ViTMAEForPreTraining(ViTMAEPreTrainedModel):
1033
def __init__(self, config):
    """Build encoder + decoder and register the configured reconstruction losses."""
    super().__init__(config)
    self.config = config

    self.vit = ViTMAEModel(config)
    self.decoder = ViTMAEDecoder(config, num_patches=self.vit.embeddings.num_patches)

    # multi-scale setting: '+'-separated scales; each contributes (scale/patch)^2 patches.
    self.scales = sorted(int(part) for part in config.scales.split('+'))
    self.num_patch_each_scale = [(s // config.patch_size) ** 2 for s in self.scales]
    self.num_token_each_frame = sum(self.num_patch_each_scale)

    # loss setting: parallel '+'-separated lists of loss names and their weights.
    self.loss_type = [str(part) for part in config.loss_type.split('+')]
    self.loss_weights = [float(part) for part in config.loss_weights.split('+')]
    self.transform = None  # will be initialized in the outer
    self.loss_fns = []
    for loss in self.loss_type:
        if loss == 'l1':
            self.loss_fns.append(self.l1_loss)
        elif loss == 'dinov2_reg':
            self.dinov2_reg = None  # will be initialized in the outer
            self.dinov2_reg_transform = None  # will be initialized in the outer
            self.loss_fns.append(self.dinov2_reg_loss)
        elif loss == 'siglip2':
            self.siglip2 = None  # will be initialized in the outer
            self.siglip2_transform = None  # will be initialized in the outer
            self.loss_fns.append(self.siglip2_loss)
        else:
            raise ValueError(f"Loss type {loss} not supported")

    # Initialize weights and apply final processing
    self.post_init()
1066
+
1067
def get_input_embeddings(self):
    """Expose the encoder's patch-embedding module (HF accessor convention)."""
    vit_embeddings = self.vit.embeddings
    return vit_embeddings.patch_embeddings
1069
+
1070
def _prune_heads(self, heads_to_prune):
    """
    Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
    class PreTrainedModel
    """
    for layer, heads in heads_to_prune.items():
        # Fix: this class stores the encoder on `self.vit` (see __init__) and has no
        # `self.encoder` attribute — the old `self.encoder.layer[...]` raised AttributeError.
        self.vit.encoder.layer[layer].attention.prune_heads(heads)
1077
+
1078
def patchify(self, pixel_values, interpolate_pos_encoding: bool = False):
    """
    Split images into flattened non-overlapping patches.

    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        interpolate_pos_encoding (`bool`, *optional*, default `False`):
            interpolation flag passed during the forward pass; when set, the
            square-size check is skipped.

    Returns:
        `torch.FloatTensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`:
            Patchified pixel values.
    """
    patch_size, num_channels = self.config.patch_size, self.config.num_channels
    # sanity checks
    if not interpolate_pos_encoding and (
        pixel_values.shape[2] != pixel_values.shape[3] or pixel_values.shape[2] % patch_size != 0
    ):
        raise ValueError("Make sure the pixel values have a squared size that is divisible by the patch size")
    if pixel_values.shape[1] != num_channels:
        raise ValueError(
            "Make sure the number of channels of the pixel values is equal to the one set in the configuration"
        )

    batch_size, _, height, width = pixel_values.shape
    grid_h, grid_w = height // patch_size, width // patch_size

    # (n, c, h*p, w*q) -> (n, c, h, p, w, q) -> (n, h, w, p, q, c) -> (n, h*w, p*q*c)
    patches = pixel_values.reshape(batch_size, num_channels, grid_h, patch_size, grid_w, patch_size)
    patches = patches.permute(0, 2, 4, 3, 5, 1)
    return patches.reshape(batch_size, grid_h * grid_w, patch_size**2 * num_channels)
1113
+
1114
def unpatchify(self, patchified_pixel_values, original_image_size: Optional[Tuple[int, int]] = None):
    """
    Reassemble flattened patches back into images (inverse of `patchify`).

    Args:
        patchified_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`:
            Patchified pixel values.
        original_image_size (`Tuple[int, int]`, *optional*):
            Original image size; defaults to the square configured `image_size`.

    Returns:
        `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`:
            Pixel values.
    """
    patch_size, num_channels = self.config.patch_size, self.config.num_channels
    if original_image_size is None:
        original_image_size = (self.config.image_size, self.config.image_size)
    original_height, original_width = original_image_size
    grid_h = original_height // patch_size
    grid_w = original_width // patch_size
    # sanity check
    if grid_h * grid_w != patchified_pixel_values.shape[1]:
        raise ValueError(
            f"The number of patches in the patchified pixel values {patchified_pixel_values.shape[1]}, does not match the number of patches on original image {grid_h}*{grid_w}"
        )

    batch_size = patchified_pixel_values.shape[0]
    # (n, h*w, p*q*c) -> (n, h, w, p, q, c) -> (n, c, h, p, w, q) -> (n, c, h*p, w*q)
    patches = patchified_pixel_values.reshape(batch_size, grid_h, grid_w, patch_size, patch_size, num_channels)
    patches = patches.permute(0, 5, 1, 3, 2, 4)
    return patches.reshape(batch_size, num_channels, grid_h * patch_size, grid_w * patch_size)
1159
+
1160
def retransform(self, image, source_transform, target_transform):
    """Undo `source_transform`'s rescale/normalize and apply `target_transform`'s instead.

    image: (B, C, H, W). The transforms are image-processor-like objects exposing
    `do_normalize`, `do_rescale`, `image_mean`, `image_std`, `rescale_factor` and
    optionally `offset`. Channel statistics are applied in channels-last layout.
    """
    def _mean_std(transform, ref):
        mean = torch.tensor(transform.image_mean, device=ref.device, dtype=ref.dtype)
        std = torch.tensor(transform.image_std, device=ref.device, dtype=ref.dtype)
        return mean, std

    # Revert the source transform (normalize first, then rescale, mirroring its inverse).
    image = rearrange(image, 'b c h w -> b h w c')
    if source_transform.do_normalize:
        mean, std = _mean_std(source_transform, image)
        image = image * std + mean
    if source_transform.do_rescale:
        if getattr(source_transform, 'offset', False):
            image = image + 1
        image = image / source_transform.rescale_factor
    image = rearrange(image, 'b h w c -> b c h w')

    # Apply the target transform (rescale, then normalize).
    image = rearrange(image, 'b c h w -> b h w c')
    if target_transform.do_rescale:
        image = image * target_transform.rescale_factor
        if getattr(target_transform, 'offset', False):
            image = image - 1
    if target_transform.do_normalize:
        mean, std = _mean_std(target_transform, image)
        image = (image - mean) / std
    image = rearrange(image, 'b h w c -> b c h w')

    return image
1182
+
1183
def l1_loss(self, pred, target):
    """Per-sample mean absolute error.

    pred, target: (B, C, H, W); returns a tensor of shape (B,).
    """
    abs_err = torch.abs(pred - target)
    return abs_err.mean(dim=(-1, -2, -3))
1188
+
1189
def dinov2_reg_loss(self, pred, target):
    """Perceptual loss in DINOv2-with-registers feature space.

    pred, target: (B, C, H, W); returns a per-sample loss of shape (B,).
    """
    def _extract(images):
        # Move images into the DINOv2 preprocessing space, then gather the
        # patch-token features of the last four hidden layers.
        images = self.retransform(images, self.transform, self.dinov2_reg_transform)
        hidden = self.dinov2_reg(images, output_hidden_states=True).hidden_states
        skip = self.dinov2_reg.config.num_register_tokens + 1  # drop CLS + register tokens
        return torch.cat([layer[:, skip:] for layer in hidden[-4:]], dim=-1)

    # Average squared distance between the two feature stacks, per sample.
    return (_extract(pred) - _extract(target)).pow(2).mean(dim=(-1, -2))
1206
+
1207
def siglip2_loss(self, pred, target):
    """Perceptual loss in SigLIP2 feature space.

    pred, target: (B, C, H, W); returns a per-sample loss of shape (B,).
    """
    def _extract(images):
        # Re-normalize into the SigLIP2 preprocessing space and concatenate
        # the last four hidden layers' features.
        images = self.retransform(images, self.transform, self.siglip2_transform)
        hidden = self.siglip2(images, output_hidden_states=True).hidden_states
        return torch.cat(hidden[-4:], dim=-1)

    # Average squared distance between the two feature stacks, per sample.
    return (_extract(pred) - _extract(target)).pow(2).mean(dim=(-1, -2))
1224
+
1225
def forward_loss(self, pixel_values, pred, interpolate_pos_encoding: bool = False):
    """
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, T, num_channels, height, width)`):
            Pixel values.
        pred (`torch.FloatTensor` of shape `(batch_size, T, num_patches, patch_size**2 * num_channels)`:
            Predicted pixel values.
        interpolate_pos_encoding (`bool`, *optional*, default `False`):
            interpolation flag passed during the forward pass.
            NOTE(review): currently unused in this method — confirm before removing.

    Returns:
        Tuple `(loss, mean_loss)`:
            loss: `(batch_size, T)` reconstruction loss per frame.
            mean_loss: `(batch_size,)` loss averaged over frames.
    """
    B, T = pixel_values.shape[:2]
    pixel_values = pixel_values.flatten(0, 1) # (B * T), C, H, W
    pred = pred.flatten(0, 1) # (B * T), N, C

    # Turn patch-level predictions back into full-resolution images.
    pred = self.unpatchify(pred, original_image_size=(pixel_values.shape[2], pixel_values.shape[3]))

    # Weighted sum of the configured per-sample loss functions
    # (e.g. l1_loss / dinov2_reg_loss / siglip2_loss), each returning (B * T,).
    loss = 0
    for loss_fn, loss_weight in zip(self.loss_fns, self.loss_weights):
        loss += loss_weight * loss_fn(pred, pixel_values)

    loss = rearrange(loss, '(b t) -> b t', b=B, t=T)
    mean_loss = loss.mean(dim=-1)

    return loss, mean_loss
1252
+
1253
def get_reconstructed_image(self, pixel_values, pred, interpolate_pos_encoding: bool = False):
    """
    Convert patch predictions into full reconstructed frames.

    pixel_values: (B, T, C, H, W) — only used for its spatial size.
    pred: (B, T, N, C) patch predictions.

    Returns:
        (B, T, C, H, W) reconstructed frames.
    """
    B, T = pixel_values.shape[:2]
    pixel_values = pixel_values.flatten(0, 1) # (B * T), C, H, W
    pred = pred.flatten(0, 1) # (B * T), N, C

    # Fold the patch dimension back into an image at the original resolution.
    pred = self.unpatchify(pred, original_image_size=(pixel_values.shape[2], pixel_values.shape[3]))

    # Restore the separate batch/time dimensions.
    pred = rearrange(pred, '(b t) c h w -> b t c h w', b=B, t=T)

    return pred
1267
+
1268
def get_causal_mask(self, num_tokens_each_frame, num_layers, batch_size, num_heads, token_mask=None, cls_token=True):
    """
    Assume a input of shape B * N * C, where N contains tokens from several frames.
    Each frame has num_tokens_each_frame[t] tokens.
    Create a block-causal attention mask such that each token can only attend to tokens from either previous frames or the same frame.
    Additionally, mask any tokens indicated by token_mask (e.g., the tokens at padded gazing positions)

    Inputs:
        num_tokens_each_frame: (T)
        num_layers: unused here; kept for interface compatibility with callers.
        token_mask: (B, N) — True marks tokens that must not be attended to.
        cls_token: whether to include the cls token in the mask
    Return:
        mask: batch x num_heads x seq_length x seq_length
    """
    T = len(num_tokens_each_frame)
    N = num_tokens_each_frame.sum()
    device = num_tokens_each_frame.device

    # Create a causal mask
    mask = torch.tril(torch.ones(batch_size, N, N, device=device))

    # Precompute frame boundaries once (the previous repeated sum() over the
    # prefix made this loop O(T^2)).
    boundaries = [0]
    for n in num_tokens_each_frame.tolist():
        boundaries.append(boundaries[-1] + n)

    # Make the tokens inside each frame attend to each other
    for t in range(T):
        start, end = boundaries[t], boundaries[t + 1]
        mask[:, start:end, start:end] = 1

    # Mask out tokens indicated by token_mask (zero the columns of masked
    # tokens for every query; broadcasting replaces the explicit repeat).
    if token_mask is not None:
        mask = mask * (~token_mask).float().unsqueeze(1)

    # Add mask for cls token
    if cls_token:
        mask_ = mask.clone()
        mask = torch.tril(torch.ones(batch_size, N + 1, N + 1, device=device))
        mask[:, 1:, 1:] = mask_

    # Each token must be able to attend to itself.
    # Bug fix: use the final mask size — the original indexed arange(N), which
    # with cls_token=True skipped the last content token (row/col N) and could
    # leave a fully-masked row for a padded token at the sequence end.
    total = mask.shape[-1]
    diag = torch.arange(total, device=device)
    mask[:, diag, diag] = 1

    # According to different attention implementations, the mask values are different.
    if self.config._attn_implementation == "flex_attention" or self.config._attn_implementation == "sdpa":
        # mask is a float tensor that will be added to the attention scores. This means the tokens to be attended should have mask value of 0, and the rest should have mask value of -inf.
        mask = torch.where(mask == 1, 0, -torch.inf)
    elif self.config._attn_implementation == "flash_attention_2":
        raise NotImplementedError("Flash attention 2 doesn't support custom attention mask. Please use attention_implementation='flex_attention'.")
    elif self.config._attn_implementation == "eager":
        # mask is a float tensor that will be multiplied to the attn prob after softmax. This means the tokens to be attended should have mask value of 1, and the rest should have mask value of 0.
        pass

    mask = mask.unsqueeze(1).repeat(1, num_heads, 1, 1)

    return mask.to(num_tokens_each_frame.device)
1320
+
1321
def forward(
    self,
    pixel_values: Optional[torch.FloatTensor] = None,
    gazing_info: Optional[dict] = None,
    frame_idx_to_reconstruct: Optional[torch.LongTensor] = None,
    noise: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    interpolate_pos_encoding: bool = False,
) -> Union[Tuple, ViTMAEForPreTrainingOutput]:
    """
    Encode gazed tokens causally, then decode and reconstruct selected frames.

    pixel_values: (B, T, C, H, W)
    gazing_info:
        gazing_pos: The gazing positions of each whole sequence. (B, N)
        num_gazing_each_frame: The number of gazing positions for each frame, including the padded gazing. (T, )
        if_padded_gazing: Whether the gazing is padded. (B, N)
    frame_idx_to_reconstruct: (num_selected_frames, )
    """
    B, T = pixel_values.shape[:2]

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Get the encoder attention mask (block-causal over frames; padded gazing
    # tokens are masked out).
    encoder_attn_mask = self.get_causal_mask(gazing_info['num_gazing_each_frame'], self.config.num_hidden_layers, B, self.config.num_attention_heads, token_mask=gazing_info['if_padded_gazing'], cls_token=True) if self.config.causal else None

    # Get the encoder outputs
    outputs = self.vit(
        pixel_values,
        gazing_info=gazing_info,
        noise=noise,
        head_mask=encoder_attn_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        interpolate_pos_encoding=interpolate_pos_encoding,
    )
    latent = outputs.last_hidden_state # B * N * C

    # Get the number of tokens to decode for each frame: frames selected for
    # reconstruction are decoded at full resolution (num_token_each_frame),
    # the rest keep only their gazing tokens.
    num_decoded_tokens_each_frame = gazing_info['num_gazing_each_frame'].clone()
    num_decoded_tokens_each_frame[frame_idx_to_reconstruct] = self.num_token_each_frame

    # Get the gazing padding mask for decoder: for reconstructed frames all
    # decoded tokens are real (not padded).
    if_padded_gazing_decoder = gazing_info['if_padded_gazing'].clone()
    if_padded_gazing_decoder = list(if_padded_gazing_decoder.split(gazing_info['num_gazing_each_frame'].tolist(), dim=-1))
    for frame_idx in frame_idx_to_reconstruct:
        if_padded_gazing_decoder[frame_idx] = torch.zeros(B, self.num_token_each_frame).to(gazing_info['if_padded_gazing'].device).to(torch.bool)
    if_padded_gazing_decoder = torch.cat(if_padded_gazing_decoder, dim=-1)

    # Get the decoder attention mask
    decoder_attn_mask = self.get_causal_mask(num_decoded_tokens_each_frame, self.config.decoder_num_hidden_layers, B, self.config.decoder_num_attention_heads, token_mask=if_padded_gazing_decoder, cls_token=True) if self.config.causal else None

    # Get the decoder outputs
    decoder_outputs = self.decoder(
        latent,
        gazing_info=gazing_info,
        frame_idx_to_reconstruct=frame_idx_to_reconstruct,
        head_mask=decoder_attn_mask,
        interpolate_pos_encoding=interpolate_pos_encoding,
    )
    logits = decoder_outputs.logits # shape (batch_size, num_patches, patch_size*patch_size*num_channels)

    # Only keep the predictions for the selected frames
    decoded_token_idx_to_keep = []
    for frame_idx in frame_idx_to_reconstruct:
        decoded_token_idx_to_keep.append(torch.arange(sum(num_decoded_tokens_each_frame[:frame_idx]), sum(num_decoded_tokens_each_frame[:frame_idx+1])))
    decoded_token_idx_to_keep = torch.cat(decoded_token_idx_to_keep, dim=0)
    logits = logits[:, decoded_token_idx_to_keep]
    logits = rearrange(logits, 'b (t n) c -> b t n c', t=len(frame_idx_to_reconstruct)) # B * num_selected_frames * N * C

    # throw away the reconstruction and masks for smaller scales
    # (only the largest scale's patches are reconstructed as pixels)
    logits = logits[:, :, sum(self.num_patch_each_scale[:-1]):, :]

    loss_each_reconstruction_frame, loss_mean = self.forward_loss(pixel_values[:, frame_idx_to_reconstruct], logits, interpolate_pos_encoding=interpolate_pos_encoding)
    reconstruction = self.get_reconstructed_image(pixel_values[:, frame_idx_to_reconstruct], logits, interpolate_pos_encoding=interpolate_pos_encoding) # B * num_selected_frames * C * H * W

    if not return_dict:
        output = (logits, reconstruction) + outputs[2:]
        return ((loss_each_reconstruction_frame, loss_mean) + output) if loss_each_reconstruction_frame is not None else output

    return ViTMAEForPreTrainingOutput(
        loss_each_reconstruction_frame=loss_each_reconstruction_frame,
        loss_mean=loss_mean,
        reconstruction=reconstruction,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
1410
+
1411
+
1412
+ __all__ = ["ViTMAEForPreTraining", "ViTMAELayer", "ViTMAEModel", "ViTMAEPreTrainedModel"]
autogaze/tasks/video_mae_reconstruction/task_video_mae_reconstruction.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from omegaconf import OmegaConf
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+ from transformers import AutoModel, AutoImageProcessor, VivitImageProcessor
6
+ from transformers.models.siglip.modeling_siglip import SiglipVisionModel
7
+ from transformers.models.siglip2.modeling_siglip2 import Siglip2VisionModel
8
+
9
+ from .modeling_video_mae import ViTMAEForPreTraining
10
+ from .visualize_video_mae_reconstruction import VisualizeReconstruction
11
+
12
+
13
class VideoMAEReconstruction(nn.Module):
    """Video-MAE reconstruction task conditioned on gaze positions.

    Wraps a ViTMAE reconstruction model plus optional frozen perceptual-loss
    encoders (DINOv2-with-registers, SigLIP2). Provides loss / reward / metric
    hooks used by the gaze-model training loop.
    """

    def __init__(self, recon_model, recon_model_config, scales, recon_sample_rate, attn_mode):
        super().__init__()

        # Create model
        self.scales = sorted([int(scale) for scale in str(scales).split("+")])
        self.transform = VivitImageProcessor.from_pretrained(recon_model, size=self.scales[-1]) # use mae image preprocessor config to initialize video preprocessor
        self.mae = ViTMAEForPreTraining.from_pretrained(recon_model, attn_implementation="sdpa", scales=str(scales), **OmegaConf.to_container(recon_model_config))
        self.mae.transform = self.transform
        # Frozen DINOv2-with-registers encoder for perceptual loss (if configured).
        if "dinov2_reg" in self.mae.loss_type:
            self.mae.dinov2_reg = AutoModel.from_pretrained(recon_model_config.dinov2_reg_loss_config.model, attn_implementation=attn_mode)
            self.mae.dinov2_reg_transform = AutoImageProcessor.from_pretrained(recon_model_config.dinov2_reg_loss_config.model)
            for param in self.mae.dinov2_reg.parameters():
                param.requires_grad = False
            self.mae.dinov2_reg.eval()
        # Frozen SigLIP2 encoder for perceptual loss (if configured).
        if "siglip2" in self.mae.loss_type:
            if "naflex" in recon_model_config.siglip2_loss_config.model:
                self.mae.siglip2 = Siglip2VisionModel.from_pretrained(recon_model_config.siglip2_loss_config.model, attn_implementation=attn_mode)
            else:
                self.mae.siglip2 = SiglipVisionModel.from_pretrained(recon_model_config.siglip2_loss_config.model, attn_implementation=attn_mode)
            self.mae.siglip2_transform = AutoImageProcessor.from_pretrained(recon_model_config.siglip2_loss_config.model)
            for param in self.mae.siglip2.parameters():
                param.requires_grad = False
            self.mae.siglip2.eval()

        # Sampling strategy for reconstruction
        self.recon_sample_rate = recon_sample_rate

        # Create visualization methods
        self.visualize_methods = [VisualizeReconstruction()]

        # kwargs for the gaze model input. Will be passed to the gaze model during training.
        self.gaze_model_kwargs = {
            "target_scales": self.scales,
            "target_patch_size": self.mae.config.patch_size,
            "target_image_mean": self.transform.image_mean,
            "target_image_std": self.transform.image_std,
        }

    @torch.autocast("cuda", dtype=torch.bfloat16)
    def forward_output(self, inputs, gaze_outputs, frame_idx_to_reconstruct=None):
        """
        Get all the outputs from the inputs
        """
        video = inputs['video']
        gazing_pos = gaze_outputs['gazing_pos']
        num_gazing_each_frame = gaze_outputs['num_gazing_each_frame']
        if_padded_gazing = gaze_outputs['if_padded_gazing']
        frame_sampling_rate = gaze_outputs['frame_sampling_rate']
        num_vision_tokens_each_frame = gaze_outputs['num_vision_tokens_each_frame']

        assert frame_sampling_rate == 1, "If frame_sampling_rate > 1, we can downsample the video here but ideally we don't want to do that"
        assert num_vision_tokens_each_frame == sum([(scale // self.mae.config.patch_size) ** 2 for scale in self.scales]), "The number of vision tokens in each frame is not consistent between gaze model and MAE model"

        # Frame sampling strategy for reconstruction: pick a random subset of
        # frames unless the caller fixed the selection.
        B, T = video.shape[:2]
        if frame_idx_to_reconstruct is None:
            frame_idx_to_reconstruct = torch.randperm(T)[:int(T * self.recon_sample_rate)].to(video.device)

        # Reconstruct the video
        gazing_info = {
            'gazing_pos': gazing_pos,
            'num_gazing_each_frame': num_gazing_each_frame,
            'if_padded_gazing': if_padded_gazing,
        }
        recon_output = self.mae(video, gazing_info=gazing_info, frame_idx_to_reconstruct=frame_idx_to_reconstruct, interpolate_pos_encoding=True)
        recon_loss_mean = recon_output.loss_mean
        recon_loss_each_reconstruction_frame = recon_output.loss_each_reconstruction_frame
        # Trajectory length (number of gazing tokens) seen before each
        # reconstructed frame — used as the per-reward trajectory length.
        num_gazing_before_each_reconstruction_frame = torch.stack([num_gazing_each_frame[:frame_idx+1].sum(dim=-1) for frame_idx in frame_idx_to_reconstruct], dim=0)
        num_non_padded_gazing_at_each_reconstruction_frame = [(~if_padded_gazing)[:, num_gazing_each_frame[:frame_idx].sum():num_gazing_each_frame[:frame_idx+1].sum()].sum(dim=-1) for frame_idx in frame_idx_to_reconstruct]
        num_non_padded_gazing_at_each_reconstruction_frame = torch.stack(num_non_padded_gazing_at_each_reconstruction_frame, dim=-1) # B * num_reconstruction_frames

        # Organize the recon loss at each gazing token: the frame loss is
        # broadcast onto that frame's gazing-token slots (shifted by one, with
        # the last slot always active), zero elsewhere.
        if_padded_gazing_each_frame = list(if_padded_gazing.split(num_gazing_each_frame.tolist(), dim=-1))
        reconstruction_loss_each_gazing_token = [torch.zeros(*if_padded_gazing_each_frame[t].shape, dtype=gazing_pos.dtype, device=gazing_pos.device) for t in range(len(num_gazing_each_frame))]
        reconstruction_loss_each_gazing_token_mask = [torch.zeros(*if_padded_gazing_each_frame[t].shape, dtype=gazing_pos.dtype, device=gazing_pos.device) for t in range(len(num_gazing_each_frame))]
        for i, frame_idx in enumerate(frame_idx_to_reconstruct):
            cur_mask = F.pad(if_padded_gazing_each_frame[frame_idx][:, 1:], (0, 1), value=True).to(torch.float)
            reconstruction_loss_each_gazing_token[frame_idx] = recon_loss_each_reconstruction_frame[:, i:i+1] * cur_mask
            reconstruction_loss_each_gazing_token_mask[frame_idx] = cur_mask
        reconstruction_loss_each_gazing_token = torch.cat(reconstruction_loss_each_gazing_token, dim=-1) # B * N
        reconstruction_loss_each_gazing_token_mask = torch.cat(reconstruction_loss_each_gazing_token_mask, dim=-1) # B * N

        outputs = {
            "reconstruction": recon_output.reconstruction,
            "reconstruction_loss": recon_loss_mean,
            "reconstruction_loss_each_reconstruction_frame": recon_loss_each_reconstruction_frame,
            "reconstruction_loss_each_gazing_token": reconstruction_loss_each_gazing_token,
            "reconstruction_loss_each_gazing_token_mask": reconstruction_loss_each_gazing_token_mask,
            "num_gazing_before_each_reconstruction_frame": num_gazing_before_each_reconstruction_frame,
            "num_non_padded_gazing_at_each_reconstruction_frame": num_non_padded_gazing_at_each_reconstruction_frame,
            "frame_idx_to_reconstruct": frame_idx_to_reconstruct,
            "image_mean": self.transform.image_mean,
            "image_std": self.transform.image_std,
            "rescale_factor": self.transform.rescale_factor,
            "scales": self.scales,
        }
        return outputs

    def loss(self, inputs, gaze_outputs, outputs):
        """
        Compute the loss of the outputs. Used for training the task itself.
        """
        reconstruction_loss = outputs['reconstruction_loss']
        reconstruction_loss_each_gazing_token = outputs['reconstruction_loss_each_gazing_token']
        reconstruction_loss_each_gazing_token_mask = outputs['reconstruction_loss_each_gazing_token_mask']
        return reconstruction_loss, reconstruction_loss_each_gazing_token, reconstruction_loss_each_gazing_token_mask

    def reward(self, inputs, gaze_outputs, outputs):
        """
        Compute the reward of the outputs. Used for training the gazing model.
        Reward is the negative (detached) reconstruction loss per frame.
        """
        reconstruction_loss_each_reconstruction_frame = outputs['reconstruction_loss_each_reconstruction_frame']
        rewards = -reconstruction_loss_each_reconstruction_frame.detach()

        # Gazing length before each reward
        traj_len_each_reward = outputs['num_gazing_before_each_reconstruction_frame']

        return rewards, traj_len_each_reward

    def metric(self, inputs, gaze_outputs, outputs):
        """
        Compute the metric used for recording during validation.
        """
        # Reconstruction loss
        reconstruction_loss, _, __ = self.loss(inputs, gaze_outputs, outputs)
        reconstruction_loss = reconstruction_loss.mean()

        # Average gazing ratio per frame (fraction of vision tokens gazed)
        bs, num_frames = inputs['video'].shape[:2]
        num_vision_tokens_each_frame = gaze_outputs['num_vision_tokens_each_frame']
        num_gazing_total = (~gaze_outputs['if_padded_gazing']).sum()
        avg_gazing_ratio = num_gazing_total / (bs *num_frames * num_vision_tokens_each_frame)

        metrics = {
            'reconstruction_loss': reconstruction_loss,
            'avg_gazing_ratio_per_frame': avg_gazing_ratio,
        }
        return metrics

    def visualize(self, inputs, gaze_outputs, task_outputs, rl_outputs=None):
        """
        Visualize the outputs.
        """
        for method in self.visualize_methods:
            method(inputs, gaze_outputs, task_outputs, rl_outputs)

    def forward(self, inputs, gaze_outputs):
        """
        Compute the outputs and the loss, reward, and metric of the outputs.
        inputs:
            image: B, C, H, W
        gaze_outputs:
            gazing_pos: B, N
        """
        outputs = self.forward_output(inputs, gaze_outputs)
        loss, reconstruction_loss_each_gazing_token, reconstruction_loss_each_gazing_token_mask = self.loss(inputs, gaze_outputs, outputs)
        reward, traj_len_each_reward = self.reward(inputs, gaze_outputs, outputs)
        metric = self.metric(inputs, gaze_outputs, outputs)

        to_return = {
            'outputs': outputs,
            'loss': loss,
            'reward': reward,
            'traj_len_each_reward': traj_len_each_reward,
            'task_losses': reconstruction_loss_each_gazing_token,
            'task_losses_mask': reconstruction_loss_each_gazing_token_mask,
            'metrics': metric,
        }
        return to_return
autogaze/tasks/video_mae_reconstruction/visualize_video_mae_reconstruction.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import wandb
5
+ import matplotlib.pyplot as plt
6
+ from autogaze.utils import UnNormalize
7
+
8
class VisualizeReconstruction:
    """Log a grid of original frames, per-scale gazing masks, and reconstructed
    frames to Weights & Biases (first instance of the batch only)."""

    def __init__(self, **kwargs):

        self.visualize_step = 0
        if wandb.run is not None:
            # define our custom x axis metric
            wandb.define_metric("visualize_gaze/visualize_step")
            # define which metrics will be plotted against it
            wandb.define_metric("visualize_gaze/*", step_metric="visualize_gaze/visualize_step")

    @torch.no_grad()
    def __call__(self, inputs, gaze_outputs, task_outputs, rl_outputs):
        # Get all information for visualization
        videos = inputs['video']
        gazing_mask = gaze_outputs['gazing_mask'] # containing multi-scale masks; list of B * T * N_each_scale
        frame_sampling_rate = gaze_outputs['frame_sampling_rate']
        scales = task_outputs['outputs']['scales']
        reconstruction = task_outputs['outputs']['reconstruction']
        frame_idx_to_reconstruct = task_outputs['outputs']['frame_idx_to_reconstruct']
        image_mean = task_outputs['outputs']['image_mean']
        image_std = task_outputs['outputs']['image_std']
        rescale_factor = task_outputs['outputs']['rescale_factor']
        num_scales = len(scales)

        # sample the frames to visualize
        videos = videos[:, ::frame_sampling_rate]
        assert videos.shape[1] == gazing_mask[0].shape[1]

        # only visualize the first instance
        video = videos[0]
        gazing_mask = [m[0] for m in gazing_mask]
        reconstruction = reconstruction[0]

        unnormalize = UnNormalize(image_mean, image_std, rescale_factor)

        video = unnormalize(video)
        reconstruction = unnormalize(reconstruction)
        video = video.cpu().float().numpy()
        reconstruction = reconstruction.cpu().float().numpy()

        # complete the reconstruction by filling the unselected frames
        reconstruction_full = np.zeros_like(video)
        reconstruction_full[frame_idx_to_reconstruct.cpu().numpy()] = reconstruction
        reconstruction = reconstruction_full

        # Create a figure with subplots: original video frames and one row for each scale's masked video frames
        T = video.shape[0] # Number of frames
        fig, axes = plt.subplots(num_scales + 2, T, figsize=(3 * T, 3 * (num_scales + 2)))

        # Plot original video frames
        for t in range(T):
            frame = video[t].transpose(1, 2, 0) # C * H * W -> H * W * C
            axes[0, t].imshow(frame)
            axes[0, t].set_title(f'Original Frame {t+1}')
            axes[0, t].axis('off')

        # Visualize masked video for each scale
        for scale_idx in range(num_scales):
            scale_mask = gazing_mask[scale_idx] # T * N

            for t in range(T):
                frame_mask = scale_mask[t] # N

                # Reshape if it's flattened
                if frame_mask.dim() == 1:
                    h = w = int(frame_mask.shape[0] ** 0.5)
                    frame_mask = frame_mask.reshape(h, w)

                # Resize mask to match current scale
                frame_mask = F.interpolate(
                    frame_mask.unsqueeze(0).unsqueeze(0),
                    size=(scales[scale_idx], scales[scale_idx]),
                    mode='nearest'
                ).squeeze()

                frame_mask = frame_mask.cpu().float().numpy()

                # Resize frame to match mask dimensions
                frame = video[t] # C * H * W
                scale_frame = F.interpolate(
                    torch.from_numpy(frame).unsqueeze(0),
                    size=(scales[scale_idx], scales[scale_idx]),
                    mode='bicubic',
                    align_corners=False
                ).squeeze().clamp(0, 1).numpy()

                # Dim non-gazed regions to 20% brightness.
                masked_frame = scale_frame * (0.8 * frame_mask[None, :, :] + 0.2)

                # Plot this frame's masked image
                axes[scale_idx + 1, t].imshow(masked_frame.transpose(1, 2, 0))

                # Add red borders around gazed patches
                original_mask = gazing_mask[scale_idx][t]
                if original_mask.dim() == 1:
                    patch_grid_size = int(original_mask.shape[0] ** 0.5)
                    original_mask = original_mask.reshape(patch_grid_size, patch_grid_size)
                else:
                    # Bug fix: patch_grid_size was previously only assigned in the
                    # 1-D branch, raising NameError for already-2-D masks.
                    patch_grid_size = original_mask.shape[0]

                patch_size = scales[scale_idx] // patch_grid_size
                for i in range(patch_grid_size):
                    for j in range(patch_grid_size):
                        if original_mask[i, j] > 0.5: # If this patch is gazed at
                            rect = plt.Rectangle((j * patch_size - 0.5, i * patch_size - 0.5),
                                                 patch_size, patch_size,
                                                 linewidth=1, edgecolor='red', facecolor='none')
                            axes[scale_idx + 1, t].add_patch(rect)

                axes[scale_idx + 1, t].set_title(f'Scale {scales[scale_idx]} Frame {t+1}')
                axes[scale_idx + 1, t].axis('off')

        # plot the reconstruction
        for t in range(T):
            frame = reconstruction[t].transpose(1, 2, 0) # C * H * W -> H * W * C
            axes[num_scales + 1, t].imshow(frame)
            axes[num_scales + 1, t].set_title(f'Reconstructed Frame {t+1}')
            axes[num_scales + 1, t].axis('off')

        # Adjust layout and log to wandb
        plt.tight_layout()
        wandb.log({
            "visualize_gaze/visualize_step": self.visualize_step,
            "visualize_gaze/visualize_gaze": wandb.Image(plt)
        })

        # Close the figure to free memory
        plt.close(fig)

        self.visualize_step += 1
autogaze/utils.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import builtins
2
+
3
+ from omegaconf import OmegaConf
4
+ from loguru import logger
5
+ import sys
6
+ import os
7
+ import torch
8
+ import numpy as np
9
+ import wandb
10
+ import random
11
+ from torch.nn.parallel import DistributedDataParallel as DDP
12
+
13
+
14
class UnNormalize(object):
    """Invert per-channel normalization (x * std + mean) and clamp to [0, 1].

    Accepts a (C, H, W) or (B, C, H, W) tensor. If a rescale factor is given
    and differs from the standard 1/255 (e.g. 1/127.5, which maps pixels to
    [-1, 1]), an extra /2 + 0.5 correction is applied.
    """

    def __init__(self, mean, std, rescale_factor=None):
        self.mean = mean
        self.std = std
        self.rescale_factor = rescale_factor

    def __call__(self, image):
        out = torch.clone(image)
        unbatched = len(out.shape) == 3
        if unbatched:
            out = out.unsqueeze(0)

        # Bring channels to the front so we can walk them alongside mean/std.
        out = out.permute(1, 0, 2, 3)
        for channel, m, s in zip(out, self.mean, self.std):
            channel.mul_(s).add_(m)
        out = out.permute(1, 0, 2, 3)

        if unbatched:
            out = out.squeeze(0)

        if self.rescale_factor is not None:
            # Processors using a factor like 1/127.5 produce [-1, 1] values;
            # map them back into [0, 1].
            if abs(self.rescale_factor - 1.0 / 255.0) > 1e-6:
                out = out / 2.0 + 0.5

        return torch.clamp(out, 0, 1)
39
+
40
+
41
class AverageScalarMeter(object):
    """Running mean of scalar values over a sliding window of at most
    `window_size` samples (approximated batch-wise)."""

    def __init__(self, window_size):
        self.window_size = window_size
        self.current_size = 0
        self.mean = 0

    def update(self, values):
        """Fold a 1-D tensor of new values into the windowed mean."""
        n_new = values.size()[0]
        if n_new == 0:
            return
        batch_mean = torch.mean(values.float(), dim=0).cpu().numpy().item()
        # One batch can at most fill the entire window.
        n_new = np.clip(n_new, 0, self.window_size)
        # Number of old samples that still fit beside the new ones.
        n_old = min(self.window_size - n_new, self.current_size)
        total = n_old + n_new
        self.current_size = total
        self.mean = (self.mean * n_old + batch_mean * n_new) / total

    def clear(self):
        self.current_size = 0
        self.mean = 0

    def __len__(self):
        return self.current_size

    def get_mean(self):
        return self.mean
67
+
68
+
69
def plot_grad_norms(named_parameters, name_prefix=''):
    """Log the L2 norm of each parameter's gradient to wandb.

    Args:
        named_parameters: iterable of (name, parameter) pairs, e.g.
            `model.named_parameters()`.
        name_prefix: optional prefix prepended to each logged metric name.
    """
    for name, param in named_parameters:
        # Parameters without gradients (frozen or unused) are skipped.
        if param.grad is not None:
            norm = torch.linalg.vector_norm(param.grad, 2.0).item()
            wandb.log({f'{name_prefix}{name}': norm})
74
+
75
+
76
def suppress_print():
    """Suppresses printing from the current process.

    Replaces ``builtins.print`` with a no-op accepting any arguments.
    Bug fix: the previous no-op declared keyword parameters named ``_sep``,
    ``_end``, ``_file`` and ``_flush``, so any real call using the builtin's
    keywords (e.g. ``print(x, sep=",")``) raised TypeError. A ``**kwargs``
    catch-all matches the builtin's interface.
    """
    def ignore(*args, **kwargs):
        pass
    builtins.print = ignore
81
+
82
+
83
def suppress_wandb():
    """Suppresses wandb logging from the current process.

    Replaces every public callable attribute of the wandb module with a no-op.
    """
    # Store original functions
    original_functions = {}
    for attr_name in dir(wandb):
        attr = getattr(wandb, attr_name)
        if callable(attr) and not attr_name.startswith('__'):
            original_functions[attr_name] = attr

    # Replace with no-op function
    def make_noop(name):
        def noop(*args, **kwargs):
            pass
        return noop

    # Bug fix: setattr was previously called once after the collection loop,
    # so only the last visited attribute was actually replaced. Replace every
    # collected callable.
    for attr_name in original_functions:
        setattr(wandb, attr_name, make_noop(attr_name))
99
+
100
+
101
def suppress_logging():
    """Suppresses loguru logging from the current process.

    All existing sinks are removed and replaced with a sink that discards
    every message.
    """
    logger.remove() # Remove all handlers
    logger.add(lambda _: None) # Add a no-op handler
105
+
106
+
107
def dump_cfg(cfg, logdir):
    """Serialize an OmegaConf config to ``<logdir>/config.yaml``.

    Args:
        cfg: OmegaConf configuration object.
        logdir: directory in which ``config.yaml`` will be written.
    """
    config_path = os.path.join(logdir, "config.yaml")
    with open(config_path, "w") as f:
        f.write(OmegaConf.to_yaml(cfg))
    print("Wrote config to: {}".format(config_path))
112
+
113
+
114
def get_scheduled_temperature(step, total_steps, temp_schedule_args):
    """Temperature at `step` under the configured schedule.

    In 'exp' mode the temperature moves geometrically from `temp_start` to
    `temp_end` as `step` goes from 0 to `total_steps`.

    Raises:
        ValueError: if the schedule mode is not recognized.
    """
    if temp_schedule_args['mode'] != 'exp':
        raise ValueError(f"Unknown temp_schedule_args: {temp_schedule_args}")
    exp_args = temp_schedule_args['exp']
    t_start = exp_args['temp_start']
    t_end = exp_args['temp_end']
    progress = step / total_steps
    return t_start * (t_end / t_start) ** progress
121
+
122
+
123
def seed_everything(seed: int):
    """Seed every RNG used by the project: Python, hash seed, NumPy, PyTorch
    (CPU, CUDA, MPS).

    Also forces deterministic cuDNN kernels and disables benchmarking, which
    may slow training but makes runs reproducible.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed all visible GPUs, not only the current device
        # (the previous `torch.cuda.manual_seed` covered a single device).
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    if hasattr(torch, 'mps') and torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)
134
+
135
+
136
def seed_worker(worker_id):
    """DataLoader `worker_init_fn`: derive per-worker RNG seeds from torch's
    base seed so dataloading is reproducible across workers."""
    base_seed = torch.initial_seed() % 2**32
    np.random.seed(base_seed)
    random.seed(base_seed)
    # Offset by worker_id so each worker's torch stream differs.
    torch.manual_seed(base_seed + worker_id)
141
+
142
+
143
def format_kwargs(cfg, optional_args):
    """Collect optional keyword arguments from a config object.

    Args:
        cfg: object with attribute sections (e.g. an OmegaConf config).
        optional_args: iterable of (arg_name, section, attr) triples.

    Returns:
        dict mapping arg_name -> cfg.<section>.<attr>, including only the
        entries whose section and attribute both exist on cfg.
    """
    kwargs = {}
    for arg_name, section, attr in optional_args:
        if not hasattr(cfg, section):
            continue
        section_cfg = getattr(cfg, section)
        if hasattr(section_cfg, attr):
            kwargs[arg_name] = getattr(section_cfg, attr)
    return kwargs
149
+
150
+
151
def move_inputs_to_cuda(inputs):
    """Recursively move every tensor in a (possibly nested) dict to the GPU.

    NOTE: mutates `inputs` in place and also returns it for convenience.
    Non-tensor, non-dict values are left untouched.
    """
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            inputs[k] = v.cuda()
        elif isinstance(v, dict):
            # Recurse into nested dicts of tensors.
            inputs[k] = move_inputs_to_cuda(v)
    return inputs
158
+
159
+
160
def unwrap_model(model):
    """Return the underlying module if `model` is DDP-wrapped, else `model`."""
    return model.module if isinstance(model, DDP) else model
165
+
166
+
167
def get_gazing_pos_from_gazing_mask(gazing_mask: torch.Tensor) -> tuple:
    """
    Get the gazing positions from the gazing mask.
    inputs:
        gazing_mask: (B, N). 1 means gazed, 0 means not gazed.
    outputs:
        gazing_pos: (B, K). K is the maximum number of gazed tokens per instance. If the instance has less than K gazed tokens, the remaining positions are padded with -1.
        if_padded_gazing: (B, K). True means padded, False means not padded.
    """
    # x: (B, N) with 0/1 values (float/bool/int all fine)
    gazing_mask = gazing_mask.to(torch.long)
    B, N = gazing_mask.shape

    # Indices per row
    idx = torch.arange(N, device=gazing_mask.device).expand(B, N)

    # Sort key: put ones first, keep original order among ones/zeros
    # - ones get key = idx (0..N-1)
    # - zeros get key = N + idx (pushed after all ones)
    key = (1 - gazing_mask) * N + idx
    order = key.argsort(dim=1, stable=True)  # (B, N)
    sorted_idx = idx.gather(1, order)  # ones first, then zeros

    # Max number of ones (K) and per-row counts
    counts = gazing_mask.sum(dim=1)  # (B,)
    K = int(counts.max().item()) if B > 0 else 0

    if K == 0:
        # Bug fix: previously returned a single tensor here, while all other
        # paths (and the docstring) promise a (gazing_pos, if_padded_gazing)
        # pair — tuple-unpacking callers would crash on an all-zero mask.
        empty = sorted_idx[:, :0]  # (B, 0)
        return empty, torch.zeros_like(empty, dtype=torch.bool)

    topk = sorted_idx[:, :K]  # (B, K)
    pos = torch.arange(K, device=gazing_mask.device).expand(B, K)
    mask = pos < counts.unsqueeze(1)  # True where a real "1" exists

    # Pad with -1 where the row has fewer than K ones
    gazing_pos = topk.masked_fill(~mask, -1)
    if_padded_gazing = (gazing_pos == -1)

    return gazing_pos, if_padded_gazing
demo_utils.py CHANGED
@@ -1,5 +1,10 @@
1
- import sys
2
- import os
 
 
 
 
 
3
  import torch
4
  import torch.nn.functional as F
5
  import numpy as np
@@ -9,19 +14,11 @@ from transformers import VivitImageProcessor
9
  from PIL import Image, ImageDraw, ImageFont
10
  from omegaconf import OmegaConf
11
  from einops import rearrange
12
-
13
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'gengaze'))
14
  from autogaze.models.autogaze import AutoGaze
15
  from autogaze.datasets.video_utils import read_video_pyav, transform_video_for_pytorch
16
  from autogaze.tasks.video_mae_reconstruction import VideoMAEReconstruction
17
  from autogaze.utils import UnNormalize
18
- from tqdm import trange
19
-
20
- try:
21
- import spaces
22
- ZEROGPU_AVAILABLE = True
23
- except ImportError:
24
- ZEROGPU_AVAILABLE = False
25
 
26
 
27
  def image_to_video(image_path, output_path, fps):
@@ -218,24 +215,29 @@ def process_video(video_path, setup, gazing_ratio=0.75, task_loss_requirement=0.
218
  progress_callback(0.1 + 0.4 * (batch_idx / num_spatial_batches), f"Gazing progress: {gazing_pct}%")
219
  yield None
220
 
221
- # Extract mini-batch from CPU and move to GPU: (batch_size, nt, 16, C, H, W)
222
  spatial_batch = video_chunks[start_idx:end_idx].to(device)
223
- # Flatten to (batch_size * nt, 16, C, H, W) for model
224
  spatial_batch = rearrange(spatial_batch, 'bs nt t c h w -> (bs nt) t c h w')
225
  print(f'Processing spatial batch {batch_idx+1}/{num_spatial_batches} with {batch_size} spatial locations x {nt} temporal = {spatial_batch.shape[0]} chunks')
226
 
227
  # Run AutoGaze on this mini-batch
228
  batch_gaze_output = model({"video": spatial_batch}, gazing_ratio=gazing_ratio, task_loss_requirement=task_loss_requirement)
229
 
 
 
 
230
  # Free GPU memory after forward pass
231
  del spatial_batch
232
 
233
  # Count gazing tokens for this batch
234
  if_padded = batch_gaze_output.get('if_padded_gazing')
235
  if if_padded is not None:
236
- total_gazing_tokens += (~if_padded).sum().item()
 
 
237
  else:
238
- total_gazing_tokens += (batch_gaze_output['gazing_pos'] < (196 * 16)).sum().item()
 
 
239
 
240
  # Store the output
241
  all_gaze_outputs.append(batch_gaze_output)
@@ -283,7 +285,7 @@ def process_video(video_path, setup, gazing_ratio=0.75, task_loss_requirement=0.
283
  # Clean up mini-batch outputs
284
  del all_gaze_outputs
285
 
286
- total_possible_tokens = 196 * 16 * num_chunks
287
 
288
  # Extract gazing masks for later visualization (already in batched form)
289
  gazing_masks_batched = gaze_output['gazing_mask'] # List of 4 scales, each (num_chunks, 16, num_patches)
 
1
+ # IMPORTANT: Import spaces first, before any CUDA-related packages (torch, etc.)
2
+ try:
3
+ import spaces
4
+ ZEROGPU_AVAILABLE = True
5
+ except ImportError:
6
+ ZEROGPU_AVAILABLE = False
7
+
8
  import torch
9
  import torch.nn.functional as F
10
  import numpy as np
 
14
  from PIL import Image, ImageDraw, ImageFont
15
  from omegaconf import OmegaConf
16
  from einops import rearrange
17
+ from tqdm import trange
 
18
  from autogaze.models.autogaze import AutoGaze
19
  from autogaze.datasets.video_utils import read_video_pyav, transform_video_for_pytorch
20
  from autogaze.tasks.video_mae_reconstruction import VideoMAEReconstruction
21
  from autogaze.utils import UnNormalize
 
 
 
 
 
 
 
22
 
23
 
24
  def image_to_video(image_path, output_path, fps):
 
215
  progress_callback(0.1 + 0.4 * (batch_idx / num_spatial_batches), f"Gazing progress: {gazing_pct}%")
216
  yield None
217
 
 
218
  spatial_batch = video_chunks[start_idx:end_idx].to(device)
 
219
  spatial_batch = rearrange(spatial_batch, 'bs nt t c h w -> (bs nt) t c h w')
220
  print(f'Processing spatial batch {batch_idx+1}/{num_spatial_batches} with {batch_size} spatial locations x {nt} temporal = {spatial_batch.shape[0]} chunks')
221
 
222
  # Run AutoGaze on this mini-batch
223
  batch_gaze_output = model({"video": spatial_batch}, gazing_ratio=gazing_ratio, task_loss_requirement=task_loss_requirement)
224
 
225
+ num_gazing_each_frame = batch_gaze_output['num_gazing_each_frame'][:T]
226
+ num_gazing_total = num_gazing_each_frame.sum().item()
227
+
228
  # Free GPU memory after forward pass
229
  del spatial_batch
230
 
231
  # Count gazing tokens for this batch
232
  if_padded = batch_gaze_output.get('if_padded_gazing')
233
  if if_padded is not None:
234
+ print(f'shape of if_padded: {if_padded.shape}')
235
+ if_padded = if_padded[:, :min(num_gazing_total, if_padded.shape[1])]
236
+ new_gazing_tokens = (~if_padded).sum().item()
237
  else:
238
+ new_gazing_tokens = (batch_gaze_output['gazing_pos'] < (196 * T)).sum().item()
239
+ total_gazing_tokens += new_gazing_tokens
240
+ print(f'Batch {batch_idx+1}: Gazing tokens = {new_gazing_tokens}, Total gazing tokens so far = {total_gazing_tokens}')
241
 
242
  # Store the output
243
  all_gaze_outputs.append(batch_gaze_output)
 
285
  # Clean up mini-batch outputs
286
  del all_gaze_outputs
287
 
288
+ total_possible_tokens = 196 * min(T, 16) * num_chunks
289
 
290
  # Extract gazing masks for later visualization (already in batched form)
291
  gazing_masks_batched = gaze_output['gazing_mask'] # List of 4 scales, each (num_chunks, 16, num_patches)
packages.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ git
2
+ git-lfs
3
+ ffmpeg
4
+ pkg-config
5
+ libavcodec-dev
6
+ libavformat-dev
7
+ libavutil-dev
8
+ libswscale-dev
9
+ libswresample-dev
10
+ libavdevice-dev
11
+ libavfilter-dev
12
+ libsm6
13
+ libxext6
14
+ cmake
15
+ rsync
16
+ libgl1
requirements.txt CHANGED
@@ -11,5 +11,5 @@ tqdm==4.67.1
11
  transformers==4.53.0
12
  omegaconf==2.3.0
13
  einops==0.8.1
14
- av==14.4.0
15
- imageio==2.37.0
 
11
  transformers==4.53.0
12
  omegaconf==2.3.0
13
  einops==0.8.1
14
+ av
15
+ imageio[ffmpeg]==2.37.0