fix subtitle extraction code & update results

Files changed (3) hide show

README.md CHANGED Viewed

@@ -60,7 +60,7 @@ print(response)
 ### Video-MME Evaluation
-You are expected to reproduce the results of 48.2 (without subtitle) and 52.9 (with subtitle) by running the following command. By default, the results are saved as `output_w_sub.json` and `output_wo_sub.json` in local directory. We provide our results in `ref_results` directory.
 ```
 python eval.py

 ### Video-MME Evaluation
+You should be able to reproduce the results of 48.2 (without subtitles) and 51.7 (with subtitles) by running the following command. By default, the results are saved as `output_w_sub.json` and `output_wo_sub.json` in the local directory. We provide our results in the `ref_results` directory.
 ```
 python eval.py

eval.py CHANGED Viewed

@@ -81,7 +81,7 @@ def fps_indices(input_fps: float, total_frames: int, output_fps: float = None, m
     return indices
-def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image] | tuple[list[Image.Image], str]:
     """Load video using decord, optionally load subtitles
     Args:
@@ -120,7 +120,8 @@ def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs)
                 if sub.end < cur_time:
                     continue
                 elif sub.start < cur_time:
-                    sub_text.append(sub.text)
                 else:
                     break
             sub_text = ' '.join(sub_text)
@@ -209,7 +210,7 @@ if __name__ == '__main__':
     mllm.eval()
     dataset = VideoMMEDataset(
-        dataset_path='your/dataset/path',
         sample_config=dict(
             sample_type='uniform',
             num_frames=32
@@ -229,7 +230,6 @@ if __name__ == '__main__':
             )
             results = []
             for data in tqdm(dataloader):
-                print(data['text'][0])
                 response, pixel_values = mllm.generate(
                     texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
                     videos=data['video'],

     return indices
+def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image]:# | tuple[list[Image.Image], str]:
     """Load video using decord, optionally load subtitles
     Args:
                 if sub.end < cur_time:
                     continue
                 elif sub.start < cur_time:
+                    sub_text.append(sub.text.replace('\\N', ' '))
+                    break   # keep only the first subtitle overlapping cur_time, in accordance with the official benchmark
                 else:
                     break
             sub_text = ' '.join(sub_text)
     mllm.eval()
     dataset = VideoMMEDataset(
+        dataset_path='',
         sample_config=dict(
             sample_type='uniform',
             num_frames=32
             )
             results = []
             for data in tqdm(dataloader):
                 response, pixel_values = mllm.generate(
                     texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
                     videos=data['video'],

ref_results/output_w_sub.json CHANGED Viewed

The diff for this file is too large to render. See raw diff