jaronfei committed on
Commit ·
c9a55ab
1
Parent(s): d15ac76
fix subtitle extraction code & update results
Browse files- README.md +1 -1
- eval.py +4 -4
- ref_results/output_w_sub.json +0 -0
README.md
CHANGED
|
@@ -60,7 +60,7 @@ print(response)
|
|
| 60 |
|
| 61 |
### Video-MME Evaluation
|
| 62 |
|
| 63 |
-
You are expected to reproduce the results of 48.2 (without subtitle) and
|
| 64 |
|
| 65 |
```
|
| 66 |
python eval.py
|
|
|
|
| 60 |
|
| 61 |
### Video-MME Evaluation
|
| 62 |
|
| 63 |
+
You are expected to reproduce the results of 48.2 (without subtitle) and 51.7 (with subtitle) by running the following command. By default, the results are saved as `output_w_sub.json` and `output_wo_sub.json` in the local directory. We provide our results in the `ref_results` directory.
|
| 64 |
|
| 65 |
```
|
| 66 |
python eval.py
|
eval.py
CHANGED
|
@@ -81,7 +81,7 @@ def fps_indices(input_fps: float, total_frames: int, output_fps: float = None, m
|
|
| 81 |
return indices
|
| 82 |
|
| 83 |
|
| 84 |
-
def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image] | tuple[list[Image.Image], str]:
|
| 85 |
"""Load video using decord, optionally load subtitles
|
| 86 |
|
| 87 |
Args:
|
|
@@ -120,7 +120,8 @@ def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs)
|
|
| 120 |
if sub.end < cur_time:
|
| 121 |
continue
|
| 122 |
elif sub.start < cur_time:
|
| 123 |
-
sub_text.append(sub.text)
|
|
|
|
| 124 |
else:
|
| 125 |
break
|
| 126 |
sub_text = ' '.join(sub_text)
|
|
@@ -209,7 +210,7 @@ if __name__ == '__main__':
|
|
| 209 |
mllm.eval()
|
| 210 |
|
| 211 |
dataset = VideoMMEDataset(
|
| 212 |
-
dataset_path='
|
| 213 |
sample_config=dict(
|
| 214 |
sample_type='uniform',
|
| 215 |
num_frames=32
|
|
@@ -229,7 +230,6 @@ if __name__ == '__main__':
|
|
| 229 |
)
|
| 230 |
results = []
|
| 231 |
for data in tqdm(dataloader):
|
| 232 |
-
print(data['text'][0])
|
| 233 |
response, pixel_values = mllm.generate(
|
| 234 |
texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
|
| 235 |
videos=data['video'],
|
|
|
|
| 81 |
return indices
|
| 82 |
|
| 83 |
|
| 84 |
+
def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image]:# | tuple[list[Image.Image], str]:
|
| 85 |
"""Load video using decord, optionally load subtitles
|
| 86 |
|
| 87 |
Args:
|
|
|
|
| 120 |
if sub.end < cur_time:
|
| 121 |
continue
|
| 122 |
elif sub.start < cur_time:
|
| 123 |
+
sub_text.append(sub.text.replace('\\N', ' '))
|
| 124 |
+
break # in accordance to the official benchmark
|
| 125 |
else:
|
| 126 |
break
|
| 127 |
sub_text = ' '.join(sub_text)
|
|
|
|
| 210 |
mllm.eval()
|
| 211 |
|
| 212 |
dataset = VideoMMEDataset(
|
| 213 |
+
dataset_path='',
|
| 214 |
sample_config=dict(
|
| 215 |
sample_type='uniform',
|
| 216 |
num_frames=32
|
|
|
|
| 230 |
)
|
| 231 |
results = []
|
| 232 |
for data in tqdm(dataloader):
|
|
|
|
| 233 |
response, pixel_values = mllm.generate(
|
| 234 |
texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
|
| 235 |
videos=data['video'],
|
ref_results/output_w_sub.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|