jaronfei committed on
Commit
c9a55ab
·
1 Parent(s): d15ac76

fix subtitle extraction code & update results

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. eval.py +4 -4
  3. ref_results/output_w_sub.json +0 -0
README.md CHANGED
@@ -60,7 +60,7 @@ print(response)
60
 
61
  ### Video-MME Evaluation
62
 
63
- You are expected to reproduce the results of 48.2 (without subtitles) and 52.9 (with subtitles) by running the following command. By default, the results are saved as `output_w_sub.json` and `output_wo_sub.json` in the local directory. We provide our results in the `ref_results` directory.
64
 
65
  ```
66
  python eval.py
 
60
 
61
  ### Video-MME Evaluation
62
 
63
+ You are expected to reproduce the results of 48.2 (without subtitles) and 51.7 (with subtitles) by running the following command. By default, the results are saved as `output_w_sub.json` and `output_wo_sub.json` in the local directory. We provide our results in the `ref_results` directory.
64
 
65
  ```
66
  python eval.py
eval.py CHANGED
@@ -81,7 +81,7 @@ def fps_indices(input_fps: float, total_frames: int, output_fps: float = None, m
81
  return indices
82
 
83
 
84
- def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image] | tuple[list[Image.Image], str]:
85
  """Load video using decord, optionally load subtitles
86
 
87
  Args:
@@ -120,7 +120,8 @@ def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs)
120
  if sub.end < cur_time:
121
  continue
122
  elif sub.start < cur_time:
123
- sub_text.append(sub.text)
 
124
  else:
125
  break
126
  sub_text = ' '.join(sub_text)
@@ -209,7 +210,7 @@ if __name__ == '__main__':
209
  mllm.eval()
210
 
211
  dataset = VideoMMEDataset(
212
- dataset_path='your/dataset/path',
213
  sample_config=dict(
214
  sample_type='uniform',
215
  num_frames=32
@@ -229,7 +230,6 @@ if __name__ == '__main__':
229
  )
230
  results = []
231
  for data in tqdm(dataloader):
232
- print(data['text'][0])
233
  response, pixel_values = mllm.generate(
234
  texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
235
  videos=data['video'],
 
81
  return indices
82
 
83
 
84
+ def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image]:# | tuple[list[Image.Image], str]:
85
  """Load video using decord, optionally load subtitles
86
 
87
  Args:
 
120
  if sub.end < cur_time:
121
  continue
122
  elif sub.start < cur_time:
123
+ sub_text.append(sub.text.replace('\\N', ' '))
124
+ break # in accordance to the official benchmark
125
  else:
126
  break
127
  sub_text = ' '.join(sub_text)
 
210
  mllm.eval()
211
 
212
  dataset = VideoMMEDataset(
213
+ dataset_path='',
214
  sample_config=dict(
215
  sample_type='uniform',
216
  num_frames=32
 
230
  )
231
  results = []
232
  for data in tqdm(dataloader):
 
233
  response, pixel_values = mllm.generate(
234
  texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
235
  videos=data['video'],
ref_results/output_w_sub.json CHANGED
The diff for this file is too large to render. See raw diff