VidMuse_CVPR

Browse files

Files changed (4) hide show

config.json +0 -8
modeling_vidmuse.py +0 -27
modeling_vidmuse_back.py +0 -51
video_processor.py +9 -4

config.json DELETED Viewed

@@ -1,8 +0,0 @@
-{
-    "model_type": "simple_processor",
-    "message": "Hello from SimpleProcessor!",
-    "auto_map": {
-      "AutoConfig": "processor.VidMuseConfig",
-      "AutoProcessor": "processor.VidMuseProcessor"
-    }
-  }

modeling_vidmuse.py DELETED Viewed

@@ -1,27 +0,0 @@
-# modeling_vidmuse.py
-from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
-import torch
-# Register the custom config and model (key step!)
-class VidMuseConfig(PretrainedConfig):
-    model_type = "vidmuse"
-    def __init__(self, compression_model=None, **kwargs):
-        super().__init__(**kwargs)
-        self.compression_model = compression_model
-class VidMuseModel(PreTrainedModel):
-    config_class = VidMuseConfig  # explicitly specify the associated config class
-    def __init__(self, config):
-        super().__init__(config)  # the parent-class initializer must be called
-        self.model_dir = os.path.dirname(os.path.abspath(__file__))
-        self.compression_model = self._load_submodel(config.compression_model)
-    def _load_submodel(self, relative_path):
-        full_path = os.path.join(self.model_dir, relative_path)
-        return torch.load(full_path)
-# Register with the Auto framework (must come after the class definitions!)
-AutoConfig.register("vidmuse", VidMuseConfig)
-AutoModel.register(VidMuseConfig, VidMuseModel)

modeling_vidmuse_back.py DELETED Viewed

@@ -1,51 +0,0 @@
-# modeling_vidmuse.py
-from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
-import torch
-import os
-from huggingface_hub import hf_hub_download
-from huggingface_hub import snapshot_download
-# Register the custom config and model (key step!)
-class VidMuseConfig(PretrainedConfig):
-    model_type = "vidmuse"
-    def __init__(self, compression_model=None, **kwargs):
-        super().__init__(**kwargs)
-        self.compression_model = compression_model
-class VidMuseModel(PreTrainedModel):
-    config_class = VidMuseConfig  # explicitly specify the associated config class
-    def __init__(self, config):
-        super().__init__(config)  # the parent-class initializer must be called
-        # self.model_dir = os.path.dirname(os.path.abspath(__file__))
-        self.hub_cache_dir = snapshot_download(
-            repo_id="Zeyue7/VidMuse",
-            revision=config._commit_hash  # use the commit hash from the config
-        )
-        self.compression_model = self._load_submodel(config.compression_model)
-        # import pdb; pdb.set_trace()
-    def _load_submodel(self, relative_path):
-        full_path = os.path.join(self.hub_cache_dir, relative_path)
-        return torch.load(full_path)
-    # @classmethod
-    # def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-    #     # proactively download the additional file
-    #     hf_hub_download(
-    #         repo_id=pretrained_model_name_or_path,
-    #         filename="compression_state_dict.bin",
-    #         force_download=True,
-    #         cache_dir=kwargs.get("cache_dir", None)
-    #     )
-    #     # continue with the normal loading flow
-    #     return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
-# Register with the Auto framework (must come after the class definitions!)
-AutoConfig.register("vidmuse", VidMuseConfig)
-AutoModel.register(VidMuseConfig, VidMuseModel)

video_processor.py CHANGED Viewed

@@ -24,13 +24,14 @@ class VideoProcessor:
         target_duration = duration * target_fps
         if current_duration > target_duration:
-            video_tensor = video_tensor[:, :target_duration]
         elif current_duration < target_duration:
             last_frame = video_tensor[:, -1:]
-            repeat_times = target_duration - current_duration
             video_tensor = torch.cat((video_tensor, last_frame.repeat(1, repeat_times, 1, 1)), dim=1)
         return video_tensor
     def video_read_global(self, filepath, seek_time=0., duration=-1, target_fps=2, global_mode='average', global_num_frames=32):
         vr = VideoReader(filepath, ctx=cpu(0))
         fps = vr.get_avg_fps()
@@ -40,7 +41,7 @@ class VideoProcessor:
             total_frames_to_read = target_fps * duration
             frame_interval = int(math.ceil(fps / target_fps))
             start_frame = int(seek_time * fps)
-            end_frame = start_frame + frame_interval * total_frames_to_read
             frame_ids = list(range(start_frame, min(end_frame, frame_count), frame_interval))
         else:
             frame_ids = list(range(0, frame_count, int(math.ceil(fps / target_fps))))
@@ -53,8 +54,9 @@ class VideoProcessor:
         local_video_tensor = einops.rearrange(local_video_tensor, 't c h w -> c t h w') # [T, C, H, W] -> [C, T, H, W]
         local_video_tensor = self.adjust_video_duration(local_video_tensor, duration, target_fps)
-        if global_mode == 'average':
             global_frame_ids = torch.linspace(0, frame_count - 1, global_num_frames).long()
             global_frames = vr.get_batch(global_frame_ids)
             global_frames = torch.from_numpy(global_frames.asnumpy()).permute(0, 3, 1, 2)  # [N, H, W, C] -> [N, C, H, W]
@@ -62,8 +64,11 @@ class VideoProcessor:
             global_video_tensor = torch.stack(global_frames)
             global_video_tensor = einops.rearrange(global_video_tensor, 't c h w -> c t h w') # [T, C, H, W] -> [C, T, H, W]
         return local_video_tensor, global_video_tensor
     def process(self, video_path, target_fps=2, global_mode='average', global_num_frames=32):
         duration = self.get_video_duration(video_path)
         if duration is None:

         target_duration = duration * target_fps
         if current_duration > target_duration:
+            video_tensor = video_tensor[:, :int(target_duration)]
         elif current_duration < target_duration:
             last_frame = video_tensor[:, -1:]
+            repeat_times = int(target_duration - current_duration)
             video_tensor = torch.cat((video_tensor, last_frame.repeat(1, repeat_times, 1, 1)), dim=1)
         return video_tensor
     def video_read_global(self, filepath, seek_time=0., duration=-1, target_fps=2, global_mode='average', global_num_frames=32):
         vr = VideoReader(filepath, ctx=cpu(0))
         fps = vr.get_avg_fps()
             total_frames_to_read = target_fps * duration
             frame_interval = int(math.ceil(fps / target_fps))
             start_frame = int(seek_time * fps)
+            end_frame = int(start_frame + frame_interval * total_frames_to_read)
             frame_ids = list(range(start_frame, min(end_frame, frame_count), frame_interval))
         else:
             frame_ids = list(range(0, frame_count, int(math.ceil(fps / target_fps))))
         local_video_tensor = einops.rearrange(local_video_tensor, 't c h w -> c t h w') # [T, C, H, W] -> [C, T, H, W]
         local_video_tensor = self.adjust_video_duration(local_video_tensor, duration, target_fps)
+        if global_mode=='average':
             global_frame_ids = torch.linspace(0, frame_count - 1, global_num_frames).long()
             global_frames = vr.get_batch(global_frame_ids)
             global_frames = torch.from_numpy(global_frames.asnumpy()).permute(0, 3, 1, 2)  # [N, H, W, C] -> [N, C, H, W]
             global_video_tensor = torch.stack(global_frames)
             global_video_tensor = einops.rearrange(global_video_tensor, 't c h w -> c t h w') # [T, C, H, W] -> [C, T, H, W]
+        assert global_video_tensor.shape[1] == global_num_frames, f"the shape of global_video_tensor is {global_video_tensor.shape}"
         return local_video_tensor, global_video_tensor
     def process(self, video_path, target_fps=2, global_mode='average', global_num_frames=32):
         duration = self.get_video_duration(video_path)
         if duration is None: