1f committed on
Commit
885ccec
·
verified ·
1 Parent(s): 81aa597

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py +240 -0
  2. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py +172 -0
  3. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py +75 -0
  4. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py +197 -0
  5. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py +904 -0
  6. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mt.py +128 -0
  7. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py +1475 -0
  8. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py +95 -0
  9. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py +328 -0
  10. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py +167 -0
  11. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py +455 -0
  12. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py +256 -0
  13. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py +69 -0
  14. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py +584 -0
  15. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py +446 -0
  16. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py +666 -0
  17. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/slidevqa.py +189 -0
  18. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py +639 -0
  19. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_base.py +88 -0
  20. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_mcq.py +123 -0
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import sympy as sp
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sympy import simplify, Eq, sympify, Pow, pi
7
+ from sympy.parsing.latex import parse_latex
8
+ import sys
9
+ import math
10
+ import os
11
+ import os.path as osp
12
+ import argparse
13
+
14
+ from .image_base import ImageBaseDataset
15
+ from .utils import build_judge
16
+ from ..utils import track_progress_rich
17
+ from ..smp import load, dump, d2df, toliststr
18
+
19
+
20
def preprocess(str1):
    """Extract the outermost ``{...}`` span from a model response and undo escaping.

    The DynaMath prompt asks for a JSON object; everything outside the first
    ``{`` .. last ``}`` pair is dropped before parsing.

    Args:
        str1: raw free-form model prediction.

    Returns:
        The trimmed string with escaped newlines restored and remaining
        backslashes removed, ready for ``json.loads``.
    """
    if 0 <= str1.find("{") < str1.rfind("}"):
        str1 = str1[str1.find("{"): str1.rfind("}") + 1]
    # BUGFIX: restore escaped newlines BEFORE stripping backslashes.  The
    # original did the "\\" removal first, which made the "\\n" replacement
    # dead code (no backslash could survive to that point).
    str2 = str1.replace("\\n", "\n")
    str2 = str2.replace("\\", "")
    return str2
26
+
27
+
28
def transfer(str1):
    """Convert a numeric answer string to a float, supporting a trailing pi.

    Examples: ``"2\u03c0"`` -> 2*pi, ``"\u03c0"`` -> pi, ``"1.5"`` -> 1.5.

    Args:
        str1: numeric string, optionally followed by the unicode pi symbol.

    Returns:
        The parsed float value.

    Raises:
        ValueError: if the coefficient is not a valid number.
    """
    if "\u03c0" in str1:
        coeff = str1.split("\u03c0")[0].strip()
        # Robustness fix: a bare "pi" (empty coefficient) means 1 * pi;
        # the original crashed on float("").
        return (float(coeff) if coeff else 1.0) * np.pi
    return float(str1)
35
+
36
+
37
def parse_answer(answer, answer_type="multiple choice"):
    """Normalize a raw short answer according to its expected type.

    Args:
        answer: raw short-answer string.
        answer_type: one of "float", "multiple choice", or anything else
            (treated as free-form).

    Returns:
        A ``(succeed, parsed)`` pair; ``parsed`` is ``None`` when parsing fails.
    """
    if answer_type == "float":
        if answer.isdigit():
            return True, float(answer)
        # Keep only the leading whitespace-delimited token, then try a
        # numeric conversion (which also understands a trailing pi symbol).
        head = answer.split(' ')[0]
        try:
            return True, transfer(head)
        except Exception:
            return False, None

    if answer_type == "multiple choice":
        upper = answer.upper()
        if len(answer) == 1:
            return True, upper
        # Accept only when exactly one option letter occurs in the text.
        hits = [ch for ch in 'ABCDE' if ch in upper]
        if len(hits) == 1:
            return True, hits[0]
        return False, None

    # Free-form answers are accepted verbatim.
    return True, answer
61
+
62
+
63
def DynaMath_auxeval(model, line):
    """Judge a single DynaMath prediction record.

    Args:
        model: judge LLM exposing ``generate(prompt) -> str``; only queried
            when the prediction cannot be parsed locally.
        line: record with keys ``prediction``, ``answer_type`` and ``answer``.

    Returns:
        ``dict(parse=bool, extracted=short_answer_or_None, correct=bool)``.
    """
    pred = line['prediction']
    pred = preprocess(pred)

    succeed, short_answer = None, None
    try:
        # Fast path: the model followed the requested JSON response format.
        dj = json.loads(pred, strict=False)
        short_answer = dj.get("short answer")
        assert short_answer is not None
        # BUGFIX: the key was misspelled 'anwser_type', which raised KeyError
        # on every record and forced everything through the fallback below.
        succeed, short_answer = parse_answer(short_answer, answer_type=line['answer_type'])
        assert succeed
    except Exception:
        # Failed to parse the JSON, use an auxiliary LLM to get the short answer
        if line['answer_type'] == 'multiple choice':
            # (typo fix: 'corresponing' -> 'corresponding' in the instruction)
            inst = "Output the corresponding choice option, such as 'A', 'B', 'C', 'D', in a single line."
        elif line['answer_type'] == 'float':
            inst = "Output a three-digit floating-point number in a single line."
        else:
            inst = (
                "Output a short answer in a single line. Any float numbers in the answer "
                "should be formatted as three-digit floating-point numbers."
            )

        prompt = f"Free-form answer: {pred}\nInstruction: {inst}"
        # Try to parse the raw prediction first to save a judge call.
        response = pred
        succeed, short_answer = parse_answer(response, line['answer_type'])
        if not succeed:
            response = model.generate(prompt)
            succeed, short_answer = parse_answer(response, line['answer_type'])

    if line['answer_type'] == 'float':
        if succeed:
            # Tolerance of 1e-3 matches the requested three-digit format.
            diff = float(short_answer) - float(line['answer'])
            correct = abs(diff) <= 0.001
            return dict(parse=True, extracted=short_answer, correct=correct)
        return dict(parse=False, extracted=None, correct=False)
    elif line['answer_type'] == 'multiple choice':
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer']))
        # Heuristic: give credit when the GT letter appears among the first
        # characters of the raw prediction (e.g. "A. because ...").
        if line['answer'] in pred[:3].upper():
            return dict(parse=False, extracted=None, correct=True)
        return dict(parse=False, extracted=None, correct=False)
    else:
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower()))
        # BUGFIX: the original evaluated short_answer.lower() here although
        # short_answer is None whenever parsing failed (AttributeError).
        return dict(parse=False, extracted=None, correct=False)
115
+
116
+
117
class Dynamath(ImageBaseDataset):
    """DynaMath VQA dataset: prompts for a JSON answer and scores with an LLM judge."""

    TYPE = 'VQA'
    # Remote TSV location and its expected checksum.
    DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'}
    DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'}
    # Instruction template appended to every question; {INST} is filled per answer type.
    GUIDE = """
## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \
to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \
detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST}

Example of expected JSON response format:

"""
    # Few-shot example of the expected JSON response, rendered once at class load.
    EXAMPLE = {
        "solution": "[Detailed step-by-step explanation]",
        "short answer": "[Concise Answer]"
    }
    TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4)

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Build the multi-modal message (images + instruction text) for one record.

        *line* may be an integer row index or a record; the instruction is
        specialized on the record's ``answer_type``.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        # meta_only datasets carry image paths instead of base64 payloads.
        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        prompt = f"## Question\n {line['question']}"
        if line['answer_type'] == 'multiple choice':
            # NOTE(review): 'corresponing' typo in this runtime prompt string;
            # left untouched here since prompt text affects model behavior.
            inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'."
        elif line['answer_type'] == 'float':
            inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key."
        else:
            inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers."

        prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions in *eval_file* with an LLM judge.

        Produces an ``_<judge>.xlsx`` file with per-record results, a ``.pkl``
        resume cache, and a ``_score.csv`` with Average and Worst-Case accuracy
        broken down by subject and knowledge level.  Returns the score frame.
        """
        judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        # Derived artifact paths share the eval_file stem plus the judge name.
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 6)  # noqa: F841

        # Resume from the cache, discarding failed (None) entries.
        res = load(tmp_file) if os.path.exists(tmp_file) else {}
        res = {k: v for k, v in res.items() if v is not None}

        model.system_prompt = """\
You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction.
"""
        if not osp.exists(storage):
            data = load(eval_file)
            lt = len(data)
            # Only judge records not already present in the resume cache.
            payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res]
            keys = [idx for idx in data['index'] if idx not in res]

            if len(keys):
                results = track_progress_rich(DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys)
                for k, r in zip(keys, results):
                    res[k] = r

            data['parse'] = [res[idx]['parse'] for idx in data['index']]
            data['extracted'] = [res[idx]['extracted'] for idx in data['index']]
            data['correct'] = [res[idx]['correct'] for idx in data['index']]
            dump(data, storage)

        data = load(storage)
        # Calculate Average Accuracy
        score_avg = {}
        score_avg['Overall'] = np.mean(data['correct'])

        subs = set(data['subject'])
        for sub in subs:
            data_sub = data[data['subject'] == sub]
            score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data['knowledge_level'])
        for lvl in lvls:
            data_lvl = data[data['knowledge_level'] == lvl]
            score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        # Calculate the Worst Case Accuracy
        # A question counts as correct only if ALL of its variants are correct;
        # varid == 1 rows represent the canonical variant of each question.
        score_worst = {}
        data_worst = data[data['varid'] == 1]
        # NOTE(review): this dict is keyed by data_worst['index'] but updated
        # below via item['qid'] — confirm that 'index' and 'qid' coincide for
        # varid == 1 rows; otherwise the update raises KeyError.
        qid2corr = {idx: True for idx in data_worst['index']}
        lt = len(data)
        for i in range(lt):
            item = data.iloc[i]
            # bool * bool acts as a logical AND accumulated over variants.
            qid2corr[item['qid']] *= item['correct']
        # NOTE(review): assigning into a boolean-indexed slice may trigger
        # pandas' SettingWithCopyWarning — verify intended.
        data_worst['correct'] = [qid2corr[idx] for idx in data_worst['qid']]
        score_worst['Overall'] = np.mean(data_worst['correct'])

        subs = set(data_worst['subject'])
        for sub in subs:
            data_sub = data_worst[data_worst['subject'] == sub]
            score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data_worst['knowledge_level'])
        for lvl in lvls:
            data_lvl = data_worst[data_worst['knowledge_level'] == lvl]
            score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        # Assemble the two settings into a single two-row score frame.
        d1 = {'Setting': 'Average'}
        d1.update(score_avg)
        d2 = {'Setting': 'Worst Case'}
        d2.update(score_worst)
        score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True)

        dump(score, score_file)
        return score
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from abc import abstractmethod
3
+ from ..smp import *
4
+
5
+
6
def img_root_map(dataset):
    """Return the image sub-directory name associated with *dataset*.

    Several dataset families share one image folder; anything unrecognized
    maps to its own name.
    """
    # Substring-triggered roots, checked in priority order.
    for needle, root in (('MM_NIAH', 'MMNIAH'), ('CRPE', 'CRPE'), ('OCRVQA', 'OCRVQA')):
        if needle in dataset:
            return root
    if dataset == 'COCO_VAL':
        return 'COCO'
    for needle in ('MMMU', 'QSpatial'):
        if needle in dataset:
            return needle

    # All MMBench variants share one of two image folders.
    mmbench_root_map = {
        'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
        'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
        'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
        'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
        'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
        'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
    }
    return mmbench_root_map.get(dataset, dataset)
31
+
32
+
33
class ImageBaseDataset:
    """Base class for image evaluation datasets backed by a remote TSV.

    Subclasses set DATASET_URL / DATASET_MD5 and typically override
    ``build_prompt`` and ``evaluate``.
    """

    MODALITY = 'IMAGE'
    # name -> TSV url / expected md5; filled in by subclasses.
    DATASET_URL = {}
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', skip_noimg=True):
        """Download/load the dataset TSV and normalize its image columns.

        Args:
            dataset: dataset name (key into DATASET_URL).
            skip_noimg: drop rows whose 'image' cell is NaN.
        """
        ROOT = LMUDataRoot()
        # You can override this variable to save image files to a different directory
        self.dataset_name = dataset
        self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))

        data = self.load_data(dataset)
        self.skip_noimg = skip_noimg
        if skip_noimg and 'image' in data:
            data = data[~pd.isna(data['image'])]

        # Indices are normalized to strings for map lookups below.
        data['index'] = [str(x) for x in data['index']]

        # meta_only means no inline image payloads, only image_path references.
        self.meta_only = True

        # The image field can store the base64 encoded image or another question index (for saving space)
        if 'image' in data:
            data['image'] = [str(x) for x in data['image']]
            image_map = {x: y for x, y in zip(data['index'], data['image'])}
            for k in image_map:
                # Short cells (<= 64 chars) are treated as references to
                # another row's index rather than base64 data.
                if len(image_map[k]) <= 64:
                    idx = image_map[k]
                    assert idx in image_map and len(image_map[idx]) > 64
                    image_map[k] = image_map[idx]

            images = [toliststr(image_map[k]) for k in data['index']]
            # Single-image rows hold a plain string; multi-image rows a list.
            data['image'] = [x[0] if len(x) == 1 else x for x in images]
            self.meta_only = False

        if 'image_path' in data:
            paths = [toliststr(x) for x in data['image_path']]
            data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]

        # Restore integer indices when every index is numeric.
        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record *idx* as a plain dict."""
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        """Download the TSV at *url* (if missing or checksum-stale) and load it.

        Files larger than 1 GB are converted once to a localized copy
        (images extracted to disk) and that copy is loaded instead.
        """
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            # Re-localize when forced via env var or after a fresh download.
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def dump_image(self, line):
        """Materialize the record's image(s) under ``self.img_root``.

        Decodes base64 payloads to files when needed; returns a list of
        file paths (or the raw image_path value for meta-only rows).
        """
        os.makedirs(self.img_root, exist_ok=True)

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    # Only decode when the file is absent or unreadable.
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        return tgt_path

    def display(self, line):
        """Render one record (by index or as a record) for inspection."""
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, pd.Series) or isinstance(line, dict)
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        url = self.DATASET_URL[dataset]
        file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
        return self.prepare_tsv(url, file_md5)

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        return msgs

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .image_base import ImageBaseDataset
2
+ from ..smp import *
3
+
4
+
5
class COCO_Caption_Scorer():
    """Thin wrapper over pycocoevalcap metrics (BLEU-4, ROUGE-L, CIDEr)."""

    def __init__(self, ref, gt):
        # Imported lazily so the module loads without pycocoevalcap installed.
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider

        self.ref = ref
        self.gt = gt
        print('setting up scorers...')
        # Each entry pairs a scorer with its metric name(s).
        self.scorers = [
            (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
            (Rouge(), 'ROUGE_L'),
            (Cider(), 'CIDEr'),
        ]

    def compute_scores(self):
        """Run every configured scorer and return a dict of percentage scores."""
        total_scores = {}
        for scorer, method in self.scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(self.gt, self.ref)
            if isinstance(method, list):
                # BLEU yields one corpus score per n-gram order.
                for metric_name, corpus_score in zip(method, score):
                    print('%s: %0.3f' % (metric_name, corpus_score * 100))
                total_scores['Bleu'] = [s * 100 for s in score]
            else:
                print('%s: %0.3f' % (method, score * 100))
                total_scores[method] = score * 100

        print('*****DONE*****')
        for key, value in total_scores.items():
            print('{}:{}'.format(key, value))
        return total_scores
37
+
38
+
39
class ImageCaptionDataset(ImageBaseDataset):
    """COCO-style image captioning dataset scored with BLEU / ROUGE-L / CIDEr."""

    TYPE = 'Caption'

    DATASET_URL = {
        'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
    }

    DATASET_MD5 = {
        'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
    }

    def load_data(self, dataset):
        """Load the TSV and inject a default captioning question when absent."""
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                'Please describe this image in general. Directly provide the description, '
                'do not include prefix like "This image depicts". '
            )] * len(data)
        return data

    # It returns a dictionary of scores
    @classmethod
    def evaluate(cls, eval_file, **kwargs):
        """Score the predictions in *eval_file* and dump a ``_score.json``.

        Args:
            eval_file: xlsx file with 'prediction' and 'answer' columns; the
                answer cell holds a stringified list of reference captions.

        Returns:
            dict of metric name -> percentage score.
        """
        # FIX: a @classmethod's first parameter was previously named 'self';
        # renamed to the conventional 'cls' (not part of the caller interface).
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        ref, gt = {}, {}
        for i, line in enumerate(lines):
            ref[str(i)] = [str(line['prediction'])]
            # NOTE(review): eval() executes arbitrary expressions from the
            # answer column — prefer ast.literal_eval if the TSV may be
            # untrusted.
            gt[str(i)] = eval(line['answer'])

        scorer = COCO_Caption_Scorer(ref, gt)
        coco_caption_score_dict = scorer.compute_scores()
        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(coco_caption_score_dict, score_pth)
        return coco_caption_score_dict
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import os
4
+ import re
5
+ import tempfile
6
+ from functools import partial
7
+ import pandas as pd
8
+
9
+ from .image_base import ImageBaseDataset
10
+ from ..smp import *
11
+
12
+ # should be the same as FAIL_MSG defined in vlmeval/inference.py
13
+ FAIL_MSG = 'Failed to obtain answer via API.'
14
+
15
+
16
+ class CCOCRDataset(ImageBaseDataset):
17
+ TYPE = 'VQA'
18
+ DATASET_URL_MODELSCOPE = {
19
+ "CCOCR_DocParsing_DocPhotoChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_photo_chn_75.tsv",
20
+ "CCOCR_DocParsing_DocPhotoEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_photo_eng_75.tsv",
21
+ "CCOCR_DocParsing_DocScanChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_scan_chn_75.tsv",
22
+ "CCOCR_DocParsing_DocScanEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_scan_eng_75.tsv",
23
+ "CCOCR_DocParsing_TablePhotoChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_photo_chn_75.tsv",
24
+ "CCOCR_DocParsing_TablePhotoEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_photo_eng_75.tsv",
25
+ "CCOCR_DocParsing_TableScanChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_scan_chn_75.tsv",
26
+ "CCOCR_DocParsing_TableScanEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_scan_eng_75.tsv",
27
+ "CCOCR_DocParsing_MolecularHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/molecular/molecular_handwriting_100.tsv",
28
+ "CCOCR_DocParsing_FormulaHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/formula/formula_handwriting_100.tsv",
29
+ "CCOCR_Kie_Sroie2019Word": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/sroie2019_word_347.tsv",
30
+ "CCOCR_Kie_Cord": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/CORD_100.tsv",
31
+ "CCOCR_Kie_EphoieScut": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/EPHOIE_SCUT_311.tsv",
32
+ "CCOCR_Kie_Poie": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/POIE_250.tsv",
33
+ "CCOCR_Kie_ColdSibr": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/open_category/COLD_SIBR_400.tsv",
34
+ "CCOCR_Kie_ColdCell": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/open_category/COLD_CELL_600.tsv",
35
+ "CCOCR_MultiLanOcr_Arabic": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Arabic/Arabic_150.tsv",
36
+ "CCOCR_MultiLanOcr_French": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/French/French_150.tsv",
37
+ "CCOCR_MultiLanOcr_German": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/German/German_150.tsv",
38
+ "CCOCR_MultiLanOcr_Italian": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Italian/Italian_150.tsv",
39
+ "CCOCR_MultiLanOcr_Japanese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Japanese/Japanese_150.tsv",
40
+ "CCOCR_MultiLanOcr_Korean": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Korean/Korean_150.tsv",
41
+ "CCOCR_MultiLanOcr_Portuguese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Portuguese/Portuguese_150.tsv",
42
+ "CCOCR_MultiLanOcr_Russian": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Russian/Russian_150.tsv",
43
+ "CCOCR_MultiLanOcr_Spanish": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Spanish/Spanish_150.tsv",
44
+ "CCOCR_MultiLanOcr_Vietnamese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Vietnamese/Vietnamese_150.tsv",
45
+ "CCOCR_MultiSceneOcr_Cord": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/CORD_100.tsv",
46
+ "CCOCR_MultiSceneOcr_Funsd": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/FUNSD_50.tsv",
47
+ "CCOCR_MultiSceneOcr_Iam": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/IAM_50.tsv",
48
+ "CCOCR_MultiSceneOcr_ZhDoc": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/zh_doc_100.tsv",
49
+ "CCOCR_MultiSceneOcr_ZhHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/zh_handwriting_50.tsv",
50
+ "CCOCR_MultiSceneOcr_Hieragent": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/Hieragent_100.tsv",
51
+ "CCOCR_MultiSceneOcr_Ic15": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/IC15_500.tsv",
52
+ "CCOCR_MultiSceneOcr_Inversetext": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/InverseText_500.tsv",
53
+ "CCOCR_MultiSceneOcr_Totaltext": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/TotalText_300.tsv",
54
+ "CCOCR_MultiSceneOcr_ZhScene": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/zh_scene_450.tsv",
55
+ "CCOCR_MultiSceneOcr_UgcLaion": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/ugc_laion_400.tsv",
56
+ "CCOCR_MultiSceneOcr_ZhDense": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/zh_dense_50.tsv",
57
+ "CCOCR_MultiSceneOcr_ZhVertical": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/zh_vertical_100.tsv"
58
+ }
59
+
60
+ DATASET_URL_HUGGINGFACE = {
61
+ "CCOCR_DocParsing_DocPhotoChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_photo_chn_75.tsv",
62
+ "CCOCR_DocParsing_DocPhotoEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_photo_eng_75.tsv",
63
+ "CCOCR_DocParsing_DocScanChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_scan_chn_75.tsv",
64
+ "CCOCR_DocParsing_DocScanEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_scan_eng_75.tsv",
65
+ "CCOCR_DocParsing_TablePhotoChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_photo_chn_75.tsv",
66
+ "CCOCR_DocParsing_TablePhotoEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_photo_eng_75.tsv",
67
+ "CCOCR_DocParsing_TableScanChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_scan_chn_75.tsv",
68
+ "CCOCR_DocParsing_TableScanEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_scan_eng_75.tsv",
69
+ "CCOCR_DocParsing_MolecularHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/molecular/molecular_handwriting_100.tsv",
70
+ "CCOCR_DocParsing_FormulaHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/formula/formula_handwriting_100.tsv",
71
+ "CCOCR_Kie_Sroie2019Word": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/sroie2019_word_347.tsv",
72
+ "CCOCR_Kie_Cord": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/CORD_100.tsv",
73
+ "CCOCR_Kie_EphoieScut": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/EPHOIE_SCUT_311.tsv",
74
+ "CCOCR_Kie_Poie": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/POIE_250.tsv",
75
+ "CCOCR_Kie_ColdSibr": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/open_category/COLD_SIBR_400.tsv",
76
+ "CCOCR_Kie_ColdCell": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/open_category/COLD_CELL_600.tsv",
77
+ "CCOCR_MultiLanOcr_Arabic": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Arabic/Arabic_150.tsv",
78
+ "CCOCR_MultiLanOcr_French": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/French/French_150.tsv",
79
+ "CCOCR_MultiLanOcr_German": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/German/German_150.tsv",
80
+ "CCOCR_MultiLanOcr_Italian": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Italian/Italian_150.tsv",
81
+ "CCOCR_MultiLanOcr_Japanese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Japanese/Japanese_150.tsv",
82
+ "CCOCR_MultiLanOcr_Korean": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Korean/Korean_150.tsv",
83
+ "CCOCR_MultiLanOcr_Portuguese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Portuguese/Portuguese_150.tsv",
84
+ "CCOCR_MultiLanOcr_Russian": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Russian/Russian_150.tsv",
85
+ "CCOCR_MultiLanOcr_Spanish": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Spanish/Spanish_150.tsv",
86
+ "CCOCR_MultiLanOcr_Vietnamese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Vietnamese/Vietnamese_150.tsv",
87
+ "CCOCR_MultiSceneOcr_Cord": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/CORD_100.tsv",
88
+ "CCOCR_MultiSceneOcr_Funsd": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/FUNSD_50.tsv",
89
+ "CCOCR_MultiSceneOcr_Iam": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/IAM_50.tsv",
90
+ "CCOCR_MultiSceneOcr_ZhDoc": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/zh_doc_100.tsv",
91
+ "CCOCR_MultiSceneOcr_ZhHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/zh_handwriting_50.tsv",
92
+ "CCOCR_MultiSceneOcr_Hieragent": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/Hieragent_100.tsv",
93
+ "CCOCR_MultiSceneOcr_Ic15": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/IC15_500.tsv",
94
+ "CCOCR_MultiSceneOcr_Inversetext": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/InverseText_500.tsv",
95
+ "CCOCR_MultiSceneOcr_Totaltext": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/TotalText_300.tsv",
96
+ "CCOCR_MultiSceneOcr_ZhScene": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/zh_scene_450.tsv",
97
+ "CCOCR_MultiSceneOcr_UgcLaion": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/ugc_laion_400.tsv",
98
+ "CCOCR_MultiSceneOcr_ZhDense": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/zh_dense_50.tsv",
99
+ "CCOCR_MultiSceneOcr_ZhVertical": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/zh_vertical_100.tsv"
100
+ }
101
+
102
    # define data path
    # NOTE(review): despite the _MODELSCOPE suffix, the table assigned here is
    # whichever mirror dict is selected above — confirm against the full file.
    DATASET_URL = DATASET_URL_MODELSCOPE
    # MD5 checksums used to validate each downloaded TSV split, keyed by the
    # same names as DATASET_URL.
    DATASET_MD5 = {
        "CCOCR_DocParsing_DocPhotoChn": "9039dcbb31830d413261a95cfa29d97f",
        "CCOCR_DocParsing_DocPhotoEng": "2ca0824881e1d7317626f2a19d902989",
        "CCOCR_DocParsing_DocScanChn": "9e265c8aa760ebdf5c3bf9e892d55492",
        "CCOCR_DocParsing_DocScanEng": "77d04637be3def86dbc2ce37ba64a704",
        "CCOCR_DocParsing_TablePhotoChn": "c4dc85252ddad2b43a03a67b1d1ae983",
        "CCOCR_DocParsing_TablePhotoEng": "02ab75d6169da0cd2ece9ce0ae14a479",
        "CCOCR_DocParsing_TableScanChn": "f1f79959fdd01127df7377c9d46722f2",
        "CCOCR_DocParsing_TableScanEng": "794903c7acf52bfe956eefba2166d14b",
        "CCOCR_DocParsing_MolecularHandwriting": "30b7f7679b713ce000a939eca7b4078f",
        "CCOCR_DocParsing_FormulaHandwriting": "e03047776ce5e79a61ae1c057e2a348e",
        "CCOCR_Kie_Sroie2019Word": "3287d99a8e86a99b74171fa5a70f9acb",
        "CCOCR_Kie_Cord": "ab297cadcbc7158884a301c366f3330a",
        "CCOCR_Kie_EphoieScut": "bb8fa3ba7ea91cbf17be0904956ad3f3",
        "CCOCR_Kie_Poie": "882b64317989ecbfed6518051cdffb14",
        "CCOCR_Kie_ColdSibr": "109d5dad8b7081fb6a2f088e963196d4",
        "CCOCR_Kie_ColdCell": "7b44c45b4d7d768d1dbdc08872fe7d3a",
        "CCOCR_MultiLanOcr_Arabic": "e9a3f2bb9298d0b882ebc7a98980c3f3",
        "CCOCR_MultiLanOcr_French": "729407ed2036c22e602eff645eddd40c",
        "CCOCR_MultiLanOcr_German": "96fc2edae747f0ec95b0a6f9bf723022",
        "CCOCR_MultiLanOcr_Italian": "29a508fa5d5a5e767497dd69e2430ebb",
        "CCOCR_MultiLanOcr_Japanese": "bbcca96ccf25fff63597c2ab4f3ebb1f",
        "CCOCR_MultiLanOcr_Korean": "0f55dbd24eba5edc189c91e124411641",
        "CCOCR_MultiLanOcr_Portuguese": "a6fcf8831775a61aa631c0cf1c422ae7",
        "CCOCR_MultiLanOcr_Russian": "19d2f84062a1699d3e9333912bd6b303",
        "CCOCR_MultiLanOcr_Spanish": "f5a0cfa9f2ae4115c91c7b362034e591",
        "CCOCR_MultiLanOcr_Vietnamese": "bf1cd4e83d91767f4906f81550cec8b9",
        "CCOCR_MultiSceneOcr_Cord": "92943f0ccb4c5a196c574222e76759a0",
        "CCOCR_MultiSceneOcr_Funsd": "229cc38d193edd00f4383610e98ee873",
        "CCOCR_MultiSceneOcr_Iam": "d897a6d6c3880c65e752ec11b211204c",
        "CCOCR_MultiSceneOcr_ZhDoc": "303682cc16c8bb51b2b896f8ceb8bd38",
        "CCOCR_MultiSceneOcr_ZhHandwriting": "faa298d366bc05e5cfb39e334afb8eff",
        "CCOCR_MultiSceneOcr_Hieragent": "6f132cdd0473d7cc145c3e3a08957dd6",
        "CCOCR_MultiSceneOcr_Ic15": "3d94869f312a41d53d0578a06a2fb1f2",
        "CCOCR_MultiSceneOcr_Inversetext": "e141d424a0c4cf9579064428a270f13d",
        "CCOCR_MultiSceneOcr_Totaltext": "ca1daf81d49eeb57ef844b72a23c2e62",
        "CCOCR_MultiSceneOcr_ZhScene": "9295152a66e6f117db8bfbb20a9013e6",
        "CCOCR_MultiSceneOcr_UgcLaion": "8e9ea1fbf9d56532157e807eabf39b21",
        "CCOCR_MultiSceneOcr_ZhDense": "de8f48ee0c8a2cf8ed7f2b3a81e6322d",
        "CCOCR_MultiSceneOcr_ZhVertical": "4892b4aec6e7fd11e39aaea23712709b"
    }
145
+
146
+ # It returns a DataFrame
147
+ def evaluate(self, eval_file, **judge_kwargs):
148
+ """
149
+ """
150
+ df = load(eval_file)
151
+ dict_list = df.to_dict(orient='records')
152
+
153
+ required_colume_list = ['answer', 'prediction', "category", "image_name", "l2-category", "split"]
154
+ for required_colume in required_colume_list:
155
+ assert required_colume in df, "required_colume: {} NOT found".format(required_colume)
156
+
157
+ gt_info, ptd_info = {}, {}
158
+ for data_info in dict_list:
159
+ image_name = data_info['image_name']
160
+ gt_info[image_name] = data_info['answer']
161
+
162
+ # warning the FAIL samples
163
+ if data_info['prediction'] != FAIL_MSG:
164
+ ptd_info[image_name] = data_info['prediction']
165
+
166
+ # assert eval_file is a single dataset
167
+ group_name = set([str(x) for x in df['category']]).pop()
168
+ op_name = set([str(x) for x in df['l2-category']]).pop()
169
+ data_name = set([str(x) for x in df['split']]).pop()
170
+
171
+ data_info = {"op": op_name, "group": group_name, "dataset": data_name, "num": len(gt_info)}
172
+ try:
173
+ from .utils.ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map
174
+ except ImportError as err:
175
+ import warnings
176
+ warnings.warn('The dependency of CCOCR evaluator is not properly installed')
177
+ warnings.warn(f'{type(err)}: {err}')
178
+ eval_func = ccocr_evaluator_map.get(group_name, None)
179
+ if eval_func is None:
180
+ raise ValueError("error: evaluator not defined for: {}".format(group_name))
181
+ meta_info, eval_info = eval_func(ptd_info, gt_info, **data_info)
182
+
183
+ output_info = {"meta": meta_info, "evaluation": eval_info, "config": data_info}
184
+ result_file = os.path.splitext(os.path.abspath(eval_file))[0] + "_eval.json"
185
+ dump(output_info, result_file)
186
+
187
+ # update global status for summary
188
+ # warning: the evaluate function should NOT run in parallel
189
+ all_status_info = {}
190
+ global_status_path = os.path.join(os.path.dirname(eval_file), "status.json")
191
+ if os.path.exists(global_status_path):
192
+ with open(global_status_path, "r") as f:
193
+ all_status_info = json.load(f)
194
+ all_status_info[data_name] = output_info
195
+ with open(global_status_path, "w") as f:
196
+ json.dump(all_status_info, f, ensure_ascii=False, indent=4)
197
+ return eval_info.get("summary")
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py ADDED
@@ -0,0 +1,904 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ from .image_base import ImageBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+ from ..smp import *
6
+ import pandas as pd
7
+
8
# TSV download URLs for the multilingual MMMB benchmark (Parrot dataset),
# one file per language (ar / cn / en / pt / ru / tr).
MMMB_URLS = {
    'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv',
    'MMMB_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv',
    'MMMB_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv',
    'MMMB_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv',
    'MMMB_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv',
    'MMMB_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv',
}

# TSV download URLs for the multilingual (MTL) MMBench dev splits.
MTL_MMBench_URLS = {
    'MMBench_dev_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv',
    'MMBench_dev_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv',
    'MMBench_dev_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv',
    'MMBench_dev_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv',
    'MMBench_dev_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv',
    'MMBench_dev_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv',
}

# MD5 checksums for the tables above, used to validate downloads.
MMMB_MD5 = {
    'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead', 'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
    'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01', 'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
    'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984', 'MMMB_tr': '0733739d43090327975294292bc5cd67'
}

MTL_MMBench_MD5 = {
    'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4', 'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
    'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2', 'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
    'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11', 'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5'
}
37
+
38
+
39
class ImageMCQDataset(ImageBaseDataset):
    """Generic multiple-choice (MCQ) image QA dataset.

    Bundles download URLs / MD5 checksums for a family of MCQ benchmarks,
    builds letter-option prompts, and scores prediction files with either
    exact matching or an LLM judge.
    """

    TYPE = 'MCQ'

    # Download URL per supported benchmark split (TSV files).
    DATASET_URL = {
        # MMBench v1.0
        'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN.tsv',
        'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN.tsv',
        'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN.tsv',
        'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN.tsv',
        'MMBench': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench.tsv',  # Internal
        'MMBench_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN.tsv',  # Internal
        # MMBench v1.1
        'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN_V11.tsv',
        'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN_V11.tsv',
        'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN_V11.tsv',
        'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN_V11.tsv',
        'MMBench_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_V11.tsv',  # Internal
        'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN_V11.tsv',  # Internal
        # SEEDBench Series
        'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench_IMG.tsv',
        'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
        'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench2_Plus.tsv',
        # ScienceQA Series
        'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv',
        'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv',
        # MMT-Bench
        'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL_MI.tsv',
        'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL.tsv',
        'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL_MI.tsv',
        'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL.tsv',
        # AesBench
        'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
        'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
        # Q-Bench1
        'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
        'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
        # A-Bench
        'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
        'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
        # R-Bench
        'R-Bench-Dis': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-dis.tsv',
        'R-Bench-Ref': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-ref.tsv',
        # Other Benchmarks
        'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
        'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
        'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
        'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
        'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
        'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
        'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
        'TaskMeAnything_v1_imageqa_random': (
            'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
            'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
        ),
        'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv',
        'WorldMedQA-V': 'https://opencompass.openxlab.space/utils/VLMEval/WorldMedQA-V.tsv',
        'VisOnlyQA-VLMEvalKit': (
            'https://huggingface.co/datasets/ryokamoi/VisOnlyQA_Eval_Real/'
            'resolve/main/visonlyqa_vlmevalkit.tsv'
        ),
        '3DSRBench': (
            'https://huggingface.co/datasets/ccvl/3DSRBench/'
            'resolve/main/3dsrbench_v1_vlmevalkit_circular.tsv'
        ),
    }

    # MD5 checksums used to validate the downloaded TSVs above.
    DATASET_MD5 = {
        # MMBench v1.0
        'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
        'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
        'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
        'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
        'MMBench': '4115aea3383f3dd0083be6a633e0f820',  # Internal Only
        'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee',  # Internal Only
        # MMBench v1.1
        'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
        'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
        'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
        'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
        'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c',  # Internal Only
        'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25',  # Internal Only
        # SEEDBench
        'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
        'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
        'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
        # ScienceQA
        'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
        'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
        # MMT-Bench
        'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
        'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
        'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
        'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
        # AesBench
        'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
        'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
        # Q-Bench1
        'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
        'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
        # A-Bench
        'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
        'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
        # R-Bench
        'R-Bench-Dis': 'd6e961dbfc43350688af2560226830b4',
        'R-Bench-Ref': '270c1cb555acb523f3fdb178ed57021d',
        # Other Benchmarks
        'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
        'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
        'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
        'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
        'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7',
        'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
        'BLINK': '3b6649b6a662184ea046908e5506260e',
        'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
        'WorldMedQA-V': '441e63875e30c87f5750528b57b41285',
        "VisOnlyQA-VLMEvalKit": 'cf460a31d2acb8d3a7cecd0e69298bfa',
        '3DSRBench': '13a99f33164dc1b9faf0e8b8b01fd6f2',
    }

    # Merge in the multilingual MMMB / MTL-MMBench tables defined at module level.
    DATASET_URL.update(MMMB_URLS)
    DATASET_URL.update(MTL_MMBench_URLS)
    DATASET_MD5.update(MMMB_MD5)
    DATASET_MD5.update(MTL_MMBench_MD5)

    def build_prompt(self, line):
        """Build the MCQ prompt for one sample.

        Args:
            line: either a row of ``self.data`` or an integer index into it.

        Returns:
            A message list of ``dict(type=..., value=...)`` entries: one or
            more image entries followed by a single text entry.
        """
        # Allow callers to pass a row index instead of a row.
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']
        # Collect the present option columns A, B, C, ... (NaN means absent).
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above. \n'

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score a prediction file and return an accuracy DataFrame.

        Uses circular evaluation for MMBench / CCBench, vanilla evaluation
        otherwise; answers are matched exactly or extracted via an LLM judge
        (``judge_kwargs['model']``) when one is configured and reachable.
        """
        from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
        # assert dataset is not None
        # TEST splits share the answer set of their internal counterparts.
        dataset_map = {
            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
            'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
        }
        dataset = self.dataset_name
        if dataset in dataset_map:
            dataset = dataset_map[dataset]
        nproc = judge_kwargs.pop('nproc', 4)

        # MMBench / CCBench use circular evaluation (option order permuted).
        circular = False
        if listinstr(['mmbench', 'ccbench'], dataset.lower()):
            data = load(eval_file)
            data['index'] = [int(x) for x in data['index']]
            dump(data, eval_file)
            circular = True

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Fall back to exact matching when no working judge is available.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated index must exist in the dataset meta.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        if circular:
            data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
        else:
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

        # load split
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        # May have different report acc functions for different datasets
        if 'MMT' in dataset:
            acc = report_acc_MMT(data)
        else:
            acc = report_acc(data)

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(acc, score_file)

        if dataset == 'AesBench_VAL':
            warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
                          please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
                          larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
        if dataset == 'VisOnlyQA-VLMEvalKit':
            warnings.warn('Note that the results on VisOnlyQA-VLMEvalKit are different from the results on \
                          the original VisOnlyQA. VisOnlyQA-VLMEvalKit does not include the \
                          chemistry__shape_multi split and uses a different evaluation prompt. Please \
                          explicitly specify the version of the dataset when you report results.')

        return acc
284
+
285
+
286
class MMMUDataset(ImageMCQDataset):
    """MMMU benchmark: MCQ prompts with inline ``<image n>`` placeholders.

    Questions reference their images from inside the text via ``<image 1>``,
    ``<image 2>`` ... markers; ``split_MMMU`` re-interleaves the image
    messages at the marked positions.
    """

    DATASET_URL = {
        'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
        'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
    }

    DATASET_MD5 = {
        'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64',
        'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
    }

    @staticmethod
    def split_MMMU(msgs):
        """Split an [images..., text] message list on ``<image n>`` markers.

        Returns ``msgs`` unchanged when the text contains no marker;
        otherwise returns text and image segments interleaved in marker
        order (markers carry 1-based indices into the image list).
        """
        text, images = None, []
        for s in msgs:
            if s['type'] == 'image':
                images.append(s['value'])
            elif s['type'] == 'text':
                assert text is None  # expect exactly one text message
                text = s['value']
        text_segs = text.split('<image ')
        if len(text_segs) == 1:
            return msgs

        segs = [dict(type='text', value=text_segs[0])]
        for i, seg in enumerate(text_segs):
            if i == 0:
                continue
            # BUGFIX: the original parsed only a single digit (seg[0]) and
            # asserted seg[1] == '>', which breaks on markers like
            # '<image 10>'. Parse every digit up to the closing '>' instead.
            close = seg.find('>')
            assert close > 0 and seg[:close].isdigit(), f'malformed image tag: <image {seg}'
            image_idx = int(seg[:close]) - 1
            segs.append(dict(type='image', value=images[image_idx]))
            segs.append(dict(type='text', value=seg[close + 1:]))
        return segs

    def build_prompt(self, line):
        """Build the base MCQ prompt, then interleave images at their markers."""
        msgs = super().build_prompt(line)
        msgs = self.split_MMMU(msgs)
        return msgs
325
+
326
+
327
class MUIRDataset(ImageMCQDataset):
    """MUIRBench: multi-image MCQ benchmark with inline ``<image>`` placeholders."""

    DATASET_URL = {
        'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
    }

    DATASET_MD5 = {
        'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
    }

    @staticmethod
    def split_MUIR(msgs):
        """Interleave the image messages into the text at each ``<image>`` tag.

        Expects at most one text message; the k-th ``<image>`` placeholder is
        replaced by the k-th image message. Empty text pieces are dropped and
        surplus placeholders (beyond the number of images) are ignored.
        """
        prompt_text = None
        image_paths = []
        for item in msgs:
            if item['type'] == 'image':
                image_paths.append(item['value'])
            elif item['type'] == 'text':
                assert prompt_text is None  # at most one text entry expected
                prompt_text = item['value']

        pieces = prompt_text.split('<image>')

        interleaved = []
        for idx, piece in enumerate(pieces):
            # Every piece after the first was preceded by a placeholder;
            # emit the matching image first (if one remains).
            if idx > 0 and idx - 1 < len(image_paths):
                interleaved.append(dict(type='image', value=image_paths[idx - 1]))
            if piece:
                interleaved.append(dict(type='text', value=piece))

        return interleaved

    def build_prompt(self, line):
        """Build an interleaved image/text MCQ prompt for one sample."""
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = toliststr(line['image_path']) if self.meta_only else self.dump_image(line)

        question = line['question']
        choices = {
            letter: line[letter]
            for letter in string.ascii_uppercase
            if letter in line and not pd.isna(line[letter])
        }
        options_prompt = '\n'.join(f'{letter}. {content}' for letter, content in choices.items())

        prompt = f'{question}\n'
        if choices:
            prompt += options_prompt
            prompt += "\nAnswer with the option's letter from the given choices directly."

        if isinstance(tgt_path, list):
            msgs = [dict(type='image', value=p) for p in tgt_path]
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return self.split_MUIR(msgs)
403
+
404
+
405
class GMAIMMBenchDataset(ImageMCQDataset):
    """GMAI-MMBench medical MCQ benchmark.

    The TEST split is sharded into 11 TSV parts that are downloaded,
    MD5-checked, localized and concatenated on load; results are additionally
    reported per clinical grouping column.
    """

    DATASET_URL = {
        'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv',
        'GMAI_mm_bench_TEST_part_1': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_1.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_2': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_2.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_3': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_3.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_4': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_4.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_5': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_5.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_6': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_6.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_7': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_7.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_8': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_8.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_9': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_9.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_10': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_10.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_11': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_11.tsv',  # noqa: E501
    }

    DATASET_MD5 = {
        'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324',
        'GMAI_mm_bench_TEST_part_1': '900d735231230a63f4ed45665c078ef4',
        'GMAI_mm_bench_TEST_part_2': '1b27ab621386945d7e4a765ad2d22b0e',
        'GMAI_mm_bench_TEST_part_3': '44bdc2b6267dd505d529b8cad06f0fb2',
        'GMAI_mm_bench_TEST_part_4': '5a04a04fcac9f1466709f242fdb80acb',
        'GMAI_mm_bench_TEST_part_5': 'c70baf8909eda9af0ddeab275c721336',
        'GMAI_mm_bench_TEST_part_6': '825abc39596b644dead9350d0cfa3b96',
        'GMAI_mm_bench_TEST_part_7': 'defb8aed2fb77365a76b6b9abd6a2701',
        'GMAI_mm_bench_TEST_part_8': 'ff490d60b85f2bb0abb67a435b298c65',
        'GMAI_mm_bench_TEST_part_9': 'ff67c86f40da93b09139ac1d1ba5dc6b',
        'GMAI_mm_bench_TEST_part_10': '3dae94627b9ac0fe00180d4780fbf6dc',
        'GMAI_mm_bench_TEST_part_11': 'd08dc813f0eb6bbab63cae2a9d113c4b',
    }

    @classmethod
    def supported_datasets(cls):
        # The sharded TEST parts are an implementation detail; only the two
        # logical dataset names are exposed.
        return ['GMAI-MMBench_VAL', 'GMAI-MMBench_TEST']

    def load_data(self, dataset):
        """Load the VAL table, or download/merge the 11 TEST shards."""
        if dataset == 'GMAI-MMBench_VAL':
            data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
            # Localize (extract embedded images) when the TSV is very large.
            if file_size(data_path, 'GB') > 1:
                local_path = data_path.replace('.tsv', '_local.tsv')
                if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
                    from ..tools import LOCALIZE
                    LOCALIZE(data_path, local_path)
                data_path = local_path
            return load(data_path)
        elif dataset == 'GMAI-MMBench_TEST':
            dfs = []
            for part_num in range(1, 12):
                part_name = f'GMAI_mm_bench_TEST_part_{part_num}'
                url = self.DATASET_URL[part_name]
                file_md5 = self.DATASET_MD5.get(part_name)
                tsv_path = osp.join(LMUDataRoot(), f'{part_name}.tsv')
                # (Re-)download when missing or when the checksum mismatches.
                if not osp.exists(tsv_path) or (file_md5 and md5(tsv_path) != file_md5):
                    download_file(url, filename=tsv_path)
                local_path = tsv_path.replace('.tsv', '_local.tsv')
                if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
                    from ..tools import LOCALIZE
                    LOCALIZE(tsv_path, local_path)
                tsv_path = local_path
                # load this shard
                df = load(tsv_path)
                dfs.append(df)
            # merge all shards into one table
            data = pd.concat(dfs, ignore_index=True)
            return data
        else:
            raise ValueError(f"未知的数据集:{dataset}")

    def report_acc_by_groups(self, df, group_column):
        """Report per-split accuracy broken down by ``group_column``.

        NOTE(review): mutates ``df`` in place (adds a 'split' column) when the
        input has none.
        """
        res = defaultdict(list)

        # Check for the 'split' column
        if 'split' in df:
            splits = list(set(df['split']))
            res['split'] = splits
        else:
            df['split'] = ['none'] * len(df)
            res['split'] = ['none']

        res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]

        if group_column not in df:
            raise ValueError(f"Column '{group_column}' not found in dataframe.")  # noqa: E713

        abilities = list(set(df[group_column]))
        # NaN group labels are reported under the literal name 'None'.
        abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
        abilities.sort()

        for ab in abilities:
            ab_name = ab
            sub_df = df[df[group_column] == ab]
            res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]

        return pd.DataFrame(res)

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions (vanilla MCQ eval) and dump per-group accuracy files."""
        from .utils.multiple_choice import report_acc, mcq_vanilla_eval
        nproc = judge_kwargs.pop('nproc', 4)

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Fall back to exact matching when no working judge is available.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated index must exist in the dataset meta.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

        # load split
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        acc = report_acc(data)

        # Also dump one accuracy CSV per clinical grouping dimension.
        for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
            acc_grouped = self.report_acc_by_groups(data, group_col)
            score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
            dump(acc_grouped, score_file_grouped)

        return acc
554
+
555
+
556
class MMERealWorld(ImageMCQDataset):
    """MME-RealWorld multiple-choice benchmark (EN / CN / Lite variants).

    Data is distributed as base64-embedded TSVs; the full variants are
    reconstructed locally from per-split JSON files downloaded via
    huggingface_hub, while the Lite variant ships as a single remote TSV.
    """

    TYPE = 'MMERealWorld'

    # Expected MD5 of each locally generated/downloaded TSV, used to decide
    # whether the cached copy can be trusted.
    DATASET_MD5 = {
        'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
        'MME-RealWorld-Lite': '4c17057d7d3b6c4a0d4397c3dae0881c',
        'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
    }
    # Per-variant answer instruction appended after the choice list.
    SYS = {
        'MME-RealWorld': (
            'Select the best answer to the above multiple-choice question based on the image. '
            'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
            'The best answer is:'
        ),
        'MME-RealWorld-Lite': (
            'Select the best answer to the above multiple-choice question based on the image. '
            'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
            'The best answer is:'
        ),
        'MME-RealWorld-CN': (
            '根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n'
            '最佳答案为:'
        ),
    }

    @classmethod
    def supported_datasets(cls):
        """Names accepted by load_data / the CLI."""
        return ['MME-RealWorld', 'MME-RealWorld-CN', 'MME-RealWorld-Lite',]

    def load_data(
        self, dataset="MME-RealWorld", repo_id="yifanzhang114/MME-RealWorld-Base64"
    ):
        """Download (if needed), build, and load the TSV for `dataset`."""

        def check_integrity(pth):
            # True iff the cached TSV exists and matches the expected MD5.
            data_file = osp.join(pth, f"{dataset}.tsv")

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.DATASET_MD5[dataset]:
                return False
            return True

        def generate_tsv(pth):
            # Build `<pth>/<dataset>.tsv` from the per-split JSON files under
            # `<pth>/<dataset>/`; no-op if the TSV already exists.
            tsv_file = os.path.join(pth, f"{dataset}.tsv")

            if os.path.exists(tsv_file):
                print(f"{tsv_file} already exists.")
                return

            json_dir = os.path.join(pth, dataset)
            json_files = [f for f in os.listdir(json_dir) if f.endswith(".json")]

            data_list = []
            for json_file in json_files:
                with open(os.path.join(json_dir, json_file), "r") as f:
                    data = json.load(f)
                for item in tqdm(data):
                    choice_prompt = (
                        "The choices are listed below:\n"
                        if dataset in ["MME-RealWorld", "MME-RealWorld-Lite"]
                        else "选项如下所示:\n"
                    )
                    # [4:] drops a 4-char option-label prefix from each choice
                    # string (presumably "(A) " etc. — TODO confirm format).
                    data_list.append(
                        {
                            "index": item["index"],
                            "image": item["image"],
                            "question": item["question"],
                            "multi-choice options": choice_prompt
                            + "\n".join(item["multi-choice options"]),
                            "A": item["multi-choice options"][0][4:],
                            "B": item["multi-choice options"][1][4:],
                            "C": item["multi-choice options"][2][4:],
                            "D": item["multi-choice options"][3][4:],
                            "E": item["multi-choice options"][4][4:],
                            "answer": item["answer"],
                            "category": item["category"],
                            "l2-category": item["l2-category"],
                        }
                    )
            df = pd.DataFrame(data_list)
            df.to_csv(tsv_file, sep="\t", index=False)
            print(f"TSV file saved to {tsv_file}")

        # Check if dataset is cached and has integrity
        if dataset == "MME-RealWorld-Lite":
            # Lite ships as one remote TSV; fetch it and expand the packed
            # option list into per-letter columns.
            url = 'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv' # noqa: E501
            file_md5 = (
                self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
            )
            datas = self.prepare_tsv(url, file_md5)
            choice_prompt = "The choices are listed below:\n"
            for index, item in datas.iterrows():
                # NOTE(review): eval() of TSV content — assumes the dataset
                # TSV comes from the trusted upstream source.
                options = eval(item["multi-choice options"])
                datas.loc[index, "multi-choice options"] = choice_prompt + "\n".join(
                    options
                )
                datas.loc[index, "A"] = options[0][4:]
                datas.loc[index, "B"] = options[1][4:]
                datas.loc[index, "C"] = options[2][4:]
                datas.loc[index, "D"] = options[3][4:]
                datas.loc[index, "E"] = options[4][4:]
            return datas

        update_flag = False
        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
            print(f"Using cached dataset from {cache_path}")
        else:
            from huggingface_hub import snapshot_download

            # Download or find the dataset path
            dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
            generate_tsv(dataset_path)
            update_flag = True

        data_path = os.path.join(dataset_path, f"{dataset}.tsv")
        # TSVs over 1 GB are converted once into a "_local" copy (images
        # localized) to speed up subsequent loads; forced on fresh downloads.
        if file_size(data_path, "GB") > 1:
            local_path = data_path.replace(".tsv", "_local.tsv")
            if (
                not osp.exists(local_path)
                or os.environ.get("FORCE_LOCAL", None)
                or update_flag
            ):
                from vlmeval.tools import LOCALIZE

                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def post_build(self, dataset):
        # Force the bench type after the generic build steps.
        self.TYPE = 'MMERealWorld'

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Compose image message(s) + question + choices + variant-specific
        answer instruction (self.SYS) for one record."""
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']

        choice_prompt = line['multi-choice options'] + '\n'
        question += ' ' + choice_prompt + self.SYS[self.dataset_name]

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        return msgs

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions by regex letter extraction; unparseable answers
        count as 0. Dumps a per-sample score xlsx and a rating json.

        NOTE(review): decorated @classmethod but the first parameter is
        named `self`, so it actually receives the class object.
        """
        from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
        FAIL_MSG = 'Failed to obtain answer via API.'
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):

            # NOTE(review): `res` is loaded/filtered but never used below —
            # looks like dead resume code carried over from another evaluator.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            cnt_rejected = 0
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]

                extract_pred = extract_characters_regex(pred)
                if extract_pred == '':
                    # No recognizable option letter in the prediction.
                    cnt_rejected += 1
                    data.loc[data['index'] == idx, 'score'] = 0
                else:
                    data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {cnt_rejected} questions. '
                f'Those questions will be counted as 0 score in ALL rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
755
+
756
+
757
class HRBenchDataset(ImageMCQDataset):
    """HR-Bench: multiple-choice benchmark on high-resolution (4K / 8K) images.

    Evaluation runs the vanilla MCQ pipeline, optionally backed by a GPT
    judge for answer extraction, and reports accuracy via
    `report_acc_hrbench`.
    """

    DATASET_URL = {
        'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
        'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
    }

    DATASET_MD5 = {
        'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
        'HRBench8K': '274c9c7f89329b804a4723178a00219c',
    }

    def evaluate(self, eval_file, **judge_kwargs):
        """Score the predictions in `eval_file` and return an accuracy DataFrame.

        judge_kwargs may carry `model` (one of 'exact_matching',
        'chatgpt-0125', 'gpt-4-0125') and `nproc`. Without a usable OpenAI
        key the judge degrades to exact matching with a warning. A cached
        `*_acc.csv` is returned as-is when present.
        """
        assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
        from .utils.multiple_choice import mcq_vanilla_eval
        from .utils.hrbench import report_acc_hrbench
        nproc = judge_kwargs.pop('nproc', 4)

        suffix = eval_file.split('.')[-1]
        # BUG FIX: the default was misspelled 'extract_matching', which made
        # the assert below fail whenever the caller did not pass `model`
        # explicitly. The intended default is 'exact_matching'.
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        # After this branch, `model` is either a judge object or None
        # (None => exact matching inside mcq_vanilla_eval).
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # Lower-case all column names except single-letter option columns (A-Z).
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated index must exist in the source dataset.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        # Reuse a previously computed accuracy file if present.
        if osp.exists(score_file):
            acc = load(score_file)
            return acc
        data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        acc = report_acc_hrbench(data)
        dump(acc, score_file)

        return acc
825
+
826
+
827
class CustomMCQDataset(ImageMCQDataset):
    """User-supplied MCQ dataset read from a TSV under the LMU data root."""

    def load_data(self, dataset):
        """Load `<LMUDataRoot>/<dataset>.tsv`; for files over 1 GB, build and
        use a localized `_local.tsv` copy (rebuilt when FORCE_LOCAL is set)."""
        tsv_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        if file_size(tsv_path, 'GB') > 1:
            localized = tsv_path.replace('.tsv', '_local.tsv')
            needs_localize = os.environ.get('FORCE_LOCAL', None) or not osp.exists(localized)
            if needs_localize:
                from ..tools import LOCALIZE
                LOCALIZE(tsv_path, localized)
            tsv_path = localized
        return load(tsv_path)
839
+
840
+
841
class NaturalBenchDataset(ImageMCQDataset):
    """NaturalBench: adversarial natural-image VQA, evaluated in groups of
    four predictions (2 questions x 2 images) per sample."""

    DATASET_URL = {
        'NaturalBenchDataset': (
            'https://huggingface.co/datasets/BaiqiL/'
            'NaturalBench/resolve/main/NaturalBenchDataset.tsv'
        ),
    }
    DATASET_MD5 = {
        'NaturalBenchDataset': 'dbe25b044bc35696426381e9ba4fe930',
    }

    def build_prompt(self, line):
        """Build image message(s) plus the question with a type-specific
        answer-format suffix (yes/no or option letter)."""
        SUFFIX_FOR_VQA = {
            "yes_no": "Please answer Yes or No.",
            "multiple_choice": "Please output the letter corresponding to the correct option."
        }
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        prompt = f'{line["question"]} {SUFFIX_FOR_VQA[line["type"]]}'
        if isinstance(tgt_path, list):
            msgs = [dict(type='image', value=p) for p in tgt_path]
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Group every four consecutive predictions into one NaturalBench
        sample, extract answers, and return the aggregate score dict."""
        from .utils.naturalbench import extract_answer, get_scores

        data = load(eval_file).sort_values(by='index')
        preds = [str(x) for x in data['prediction']]
        golds = [str(x) for x in data['answer']]
        idx_strs = [str(x) for x in data['index']]
        qtypes = [str(x) for x in self.data['type']]
        # 1900 samples x 4 (question, image) combinations each.
        assert len(preds) == len(golds) == len(idx_strs) == len(qtypes) == (1900 * 4)

        results = {}
        for g in range(len(preds) // 4):
            base = g * 4
            results[g] = {
                "q0_i0": extract_answer(preds[base], qtypes[base]),
                "q0_i1": extract_answer(preds[base + 1], qtypes[base + 1]),
                "q1_i0": extract_answer(preds[base + 2], qtypes[base + 2]),
                "q1_i1": extract_answer(preds[base + 3], qtypes[base + 3])
            }

        scores = get_scores(results)
        print(scores)
        score_df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])
        dump(score_df, 'NaturalBench_acc.csv')

        return scores
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mt.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .image_base import ImageBaseDataset
2
+ from .utils.judge_util import build_judge
3
+ from ..smp import *
4
+ from ..utils import track_progress_rich
5
+
6
+
7
class ImageMTDataset(ImageBaseDataset):
    """Base class for multi-turn image-dialogue datasets.

    Records carry parallel lists of questions and answers; questions may
    embed '<ImageHere>' placeholders that are replaced by the record's
    images in order.
    """

    TYPE = 'MT'

    def build_prompt(self, line):
        """Build the multi-turn dialogue for one record.

        Returns a list of dicts alternating role='user' / role='assistant';
        each 'content' is a list of {'type': 'text'|'image', 'value': ...}
        segments. Images are consumed left-to-right across all turns.
        Raises AssertionError if question/answer counts differ or an answer
        contains an image placeholder.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        questions = toliststr(line['question'])
        if 'answer' in line:
            answers = toliststr(line['answer'])
        else:
            answers = [''] * len(questions)
        assert len(questions) == len(answers)

        dlgs, pics_number = [], 0
        for q, a in zip(questions, answers):
            if '<ImageHere>' in q:
                content = []
                tag_number = q.count('<ImageHere>')
                images = tgt_path[pics_number: pics_number + tag_number]
                pics_number += tag_number
                q_split = q.split('<ImageHere>')
                # BUG FIX: this inner loop previously reused the outer loop
                # variable `i`, shadowing it; use a dedicated index instead.
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
                    if qsp != '':
                        content.append(dict(type='text', value=qsp))
                    content.append(dict(type='image', value=im))
                if q_split[-1] != '':
                    content.append(dict(type='text', value=q_split[-1]))
            else:
                content = [dict(type='text', value=q)]
            dlgs.append(dict(role='user', content=content))
            assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
            content = [dict(type='text', value=a)]
            dlgs.append(dict(role='assistant', content=content))
        return dlgs
50
+
51
+
52
class MMDUDataset(ImageMTDataset):
    """MMDU multi-turn dialogue benchmark, scored per dimension by a GPT judge."""

    DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
    DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
    # Judge-scored dimensions; each rating is an integer on a 0-10 scale.
    DIMS = [
        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
    ]

    def calculat_metric(self, ans):
        """Aggregate per-sample judge ratings into 'all' / 'valid' averages.

        `ans` maps sample index -> {'res': DataFrame}, one rating row per
        turn. Row 'all' divides the score sum by every rating slot; row
        'valid' only by slots whose rating parsed as an int. Scores are
        clipped to [0, 10] then rescaled by x10 (i.e. to a 0-100 range).
        """
        # BUG FIX: the accumulator was named `all` (shadowing the builtin)
        # and the inner loop reused the outer loop variable `k`.
        score_sum = defaultdict(lambda: 0)
        tot = defaultdict(lambda: 0)
        valid = defaultdict(lambda: 0)
        for key in ans:
            res = ans[key]['res']
            assert isinstance(res, pd.DataFrame)
            lt = len(res)
            for i in range(lt):
                line = res.iloc[i]
                for dim in self.DIMS:
                    tot[dim] += 1
                    if dim in line and line[dim] is not None:
                        try:
                            score = int(line[dim])
                            score = np.clip(score, 0, 10)
                            score_sum[dim] += score
                            valid[dim] += 1
                        except Exception as e:
                            # Non-numeric judge output: counted in `tot` only.
                            print(f'Failed to parse the score: {str(e)}')
        sp1 = {'set': 'all'}
        sp1.update({k: score_sum[k] / tot[k] * 10 for k in self.DIMS})
        sp2 = {'set': 'valid'}
        sp2.update({k: score_sum[k] / valid[k] * 10 for k in self.DIMS})

        return pd.DataFrame([sp1, sp2])

    def evaluate(self, eval_file, **judge_kwargs):
        """Judge every dialogue with `mmdu_score` (resumable via a pkl
        cache), aggregate with calculat_metric, and dump the score CSV."""
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']

        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)

        data = load(eval_file)
        model = judge_kwargs.pop('model', 'gpt-4o')
        judge_model = build_judge(model=model, **judge_kwargs)

        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        tups = [(judge_model, line) for line in lines]
        indices = [line['index'] for line in lines]

        ans = {}
        if osp.exists(tmp_file):
            ans = load(tmp_file)

        # Skip samples already judged in a previous (resumed) run.
        tups = [x for x, i in zip(tups, indices) if i not in ans]
        indices = [i for i in indices if i not in ans]

        from .utils.mmdu import mmdu_score

        if len(indices):
            new_results = track_progress_rich(
                mmdu_score,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=tmp_file,)
            ans = load(tmp_file)
            for k, v in zip(indices, new_results):
                assert k in ans

        metric = self.calculat_metric(ans)
        dump(metric, score_file)
        return metric
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py ADDED
@@ -0,0 +1,1475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+ from functools import partial
5
+
6
+ import pandas as pd
7
+
8
+ from .image_base import ImageBaseDataset
9
+ from .utils import build_judge, DEBUG_MESSAGE
10
+ from ..smp import *
11
+ from ..utils import track_progress_rich
12
+
13
+
14
class ImageVQADataset(ImageBaseDataset):
    """Generic open-ended VQA datasets (OCRVQA / TextVQA / DocVQA / InfoVQA /
    ChartQA / GQA), scored with heuristic per-dataset matching metrics."""

    TYPE = 'VQA'

    DATASET_URL = {
        'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
        'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
        'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
        'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
        'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
        'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
        'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
        'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
        'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
    }

    DATASET_MD5 = {
        'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
        'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
        'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
        'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
        'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
        'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
        'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
        'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
        'GQA_TestDev_Balanced': '99b62f22e224d9b2f32dcbe41359d1c9',
    }

    def build_prompt(self, line):
        """Append the short-answer instruction to the base image+question prompt."""
        msgs = super().build_prompt(line)
        assert msgs[-1]['type'] == 'text'
        msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
        return msgs

    # It returns a DataFrame
    def evaluate(self, eval_file, **judge_kwargs):
        """Score `eval_file`; returns a one-row DataFrame with 'Overall' plus
        optional per-split / per-category accuracies, all in percent."""
        from .utils.vqa_eval import hit_calculate, process_line

        data = load(eval_file)
        dataset = self.dataset_name
        assert 'answer' in data and 'prediction' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        # BUG FIX: run the pool as a context manager so worker processes are
        # always cleaned up (the pool was previously never closed).
        with mp.Pool(16) as pool:
            # Pick the matching metric appropriate for each benchmark family.
            if listinstr(['TextVQA'], dataset):
                res = pool.map(partial(process_line, method='vqa_score'), lines)
            elif listinstr(['ChartQA'], dataset):
                res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
            elif listinstr(['OCRVQA', 'GQA'], dataset):
                res = pool.map(partial(process_line, method='accuracy'), lines)
            elif listinstr(['DocVQA', 'InfoVQA'], dataset):
                res = pool.map(partial(process_line, method='anls'), lines)
            else:  # default using vqa_score to calculate score
                res = pool.map(process_line, lines)
        hit = hit_calculate(res, dataset)
        ret = dict()
        if 'split' in data:
            splits = set(data['split'])
            for sp in splits:
                sub = [r for l, r in zip(lines, res) if l['split'] == sp]
                hit = hit_calculate(sub, dataset)
                ret[sp] = np.mean(hit) * 100
            sub = [r for l, r in zip(lines, res)]
            hit = hit_calculate(sub, dataset)
            ret['Overall'] = np.mean(hit) * 100
        else:
            ret['Overall'] = np.mean(hit) * 100
            if 'category' in data:
                cates = list(set(data['category']))
                cates.sort()
                for c in cates:
                    sub = [r for l, r in zip(lines, res) if l['category'] == c]
                    hit = hit_calculate(sub, dataset)
                    ret[c] = np.mean(hit) * 100
        ret = d2df(ret)
        # BUG FIX: DataFrame.round is not in-place; the rounded result was
        # previously discarded.
        ret = ret.round(2)

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(ret, result_file)
        return ret
98
+
99
+
100
class VizWiz(ImageBaseDataset):
    """VizWiz VQA: questions asked by blind users about their own photos."""
    TYPE = 'VQA'
    DATASET_URL = {
        'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'
    }
    DATASET_MD5 = {
        'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'
    }

    # NOTE(review): decorated @classmethod, so the first parameter (named
    # `self`) actually receives the class object.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions with the VQA metric and return the accuracy
        DataFrame (read back from the cached `*_acc.csv`)."""
        from .utils.vqa_eval import hit_calculate, process_line

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            assert 'answers' in data and 'prediction' in data
            data['prediction'] = [str(x) for x in data['prediction']]
            data['answer'] = [str(x) for x in data['answers']]

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            # BUG FIX: run the pool as a context manager so worker processes
            # are always cleaned up (the pool was previously never closed).
            with mp.Pool(16) as pool:
                res = pool.map(process_line, lines)

            hit = hit_calculate(res, 'VizWiz')
            ret = dict()

            ret['Overall'] = np.mean(hit) * 100
            ret = d2df(ret)
            # BUG FIX: DataFrame.round is not in-place; the rounded result
            # was previously discarded.
            ret = ret.round(2)

            dump(ret, result_file)

        retz = pd.read_csv(result_file)
        return retz
138
+
139
+
140
class OCRBench(ImageBaseDataset):
    """OCRBench: 10-category OCR benchmark scored by substring matching."""
    TYPE = 'VQA'
    DATASET_URL = {
        'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
    }
    DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Count one point per question whose ground-truth answer occurs as a
        substring of the prediction, then aggregate into the reported buckets.

        NOTE(review): decorated @classmethod, so the first parameter (named
        `self`) actually receives the class object.
        """
        # Per-category hit counters.
        OCRBench_score = {
            'Regular Text Recognition': 0,
            'Irregular Text Recognition': 0,
            'Artistic Text Recognition': 0,
            'Handwriting Recognition': 0,
            'Digit String Recognition': 0,
            'Non-Semantic Text Recognition': 0,
            'Scene Text-centric VQA': 0,
            'Doc-oriented VQA': 0,
            'Key Information Extraction': 0,
            'Handwritten Mathematical Expression Recognition': 0,
        }

        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = str(line['prediction'])
            # NOTE(review): eval() on the stored answer list — assumes the
            # TSV comes from the trusted dataset source, not user input.
            answers = eval(line['answer'])
            category = line['category']
            if category == 'Handwritten Mathematical Expression Recognition':
                # For HMER, compare with all whitespace removed.
                for j in range(len(answers)):
                    answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
                    predict = predict.strip().replace('\n', ' ').replace(' ', '')
                    if answer in predict:
                        OCRBench_score[category] += 1
                        break
            else:
                # Otherwise compare case-insensitively with newlines flattened.
                for j in range(len(answers)):
                    answer = answers[j].lower().strip().replace('\n', ' ')
                    predict = predict.lower().strip().replace('\n', ' ')
                    if answer in predict:
                        OCRBench_score[category] += 1
                        break

        # Aggregate the ten raw categories into the reported buckets.
        final_score_dict = {}
        final_score_dict['Text Recognition'] = \
            (OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
             + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
             + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
        final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
        final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
        final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
        final_score_dict['Handwritten Mathematical Expression Recognition'] = \
            (OCRBench_score['Handwritten Mathematical Expression Recognition'])
        final_score_dict['Final Score'] = \
            (final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
             + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
             + final_score_dict['Handwritten Mathematical Expression Recognition'])
        # /10 rescales the raw hit count — presumably 1000 total questions,
        # giving a 0-100 normalized score; TODO confirm against the dataset.
        final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
204
+
205
+
206
class MathVista(ImageBaseDataset):
    """MathVista-MINI: mathematical reasoning in visual contexts, judged by
    an LLM with a resumable per-sample cache."""
    TYPE = 'VQA'
    DATASET_URL = {
        'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
    }
    DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge each prediction with `MathVista_auxeval`, caching per-sample
        results in a pkl for resumption, then compute accuracy.

        Requires a working OPENAI-compatible judge (judge_kwargs['model']).
        NOTE(review): decorated @classmethod, so `self` receives the class.
        """
        from .utils.mathvista import MathVista_auxeval, MathVista_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')  # judged predictions
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')  # resumable per-sample cache
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already judged in a previous run.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVista_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Consistency check: the saved cache must agree with what the
                # workers just returned.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MathVista_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
261
+
262
+
263
class MathVerse(ImageBaseDataset):
    """MathVerse-MINI: math problems rendered with varying text/vision mixes.

    Evaluation is LLM-judged in two cached stages:
      1. *extract* — pull the candidate answer out of the raw prediction;
      2. *score*  — grade the extracted answer against the ground truth.
    Each stage checkpoints per-sample results to a ``.pkl`` file so an
    interrupted run resumes without re-querying the judge.
    """

    TYPE = 'VQA'
    # Download URL per MathVerse-MINI variant (TSV format).
    DATASET_URL = {
        'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
        'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa
        'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa
        'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa
        'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa
        'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa
    }
    # MD5 checksums for validating the downloaded TSV files.
    DATASET_MD5 = {
        'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
        'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
        'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
        'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
        'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
        'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
    }

    # It returns a DataFrame
    # NOTE(review): decorated @classmethod but the first parameter is named
    # `self` (receives the class object) — confirm before renaming.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Two-stage judged evaluation of `eval_file`; returns the score DataFrame."""
        from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        # Per-stage result (.xlsx) and resume-checkpoint (.pkl) paths.
        storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
        tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
        storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        # stage1: extract the answer
        if not osp.exists(storage_extract):
            data = load(eval_file)
            # Rebind `model` from the name string to an actual judge client.
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already present in a previous checkpoint.
            ans = {}
            if osp.exists(tmp_file_extract):
                ans = load(tmp_file_extract)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_extract,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_extract,
                )
                ans = load(tmp_file_extract)
                # Sanity check: checkpoint must agree with freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']

            data['extract'] = [ans[idx]['extract'] for idx in data['index']]
            data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
            dump(data, storage_extract)

        # stage2: score the answer
        if not osp.exists(storage_score):
            data = load(storage_extract)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume logic mirrors stage 1.
            ans = {}
            if osp.exists(tmp_file_score):
                ans = load(tmp_file_score)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_score,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_score,
                )
                ans = load(tmp_file_score)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']

            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
            dump(data, storage_score)

        # Aggregate the per-sample scores into the final accuracy table.
        score = MathVerse_acc(storage_score)
        score_pth = storage_score.replace('.xlsx', '.csv')
        dump(score, score_pth)
        return score
366
+
367
+
368
class MathVision(ImageBaseDataset):
    """MATH-Vision: competition-style math problems with figures.

    Predictions are graded by an LLM judge (``MATH_V_auxeval``); per-sample
    results are checkpointed to a ``.pkl`` so interrupted runs resume.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
        'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
    }
    DATASET_MD5 = {
        'MathVision': '93f6de14f7916e598aa1b7165589831e',
        'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
    }

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge-grade `eval_file` and return the accuracy DataFrame.

        The judge model name comes from ``judge_kwargs['model']`` or, as a
        fallback, from the ``LOCAL_LLM`` environment variable.
        """
        from .utils.mathv import MATH_V_auxeval, MATH_V_acc

        if 'model' in judge_kwargs:
            model = judge_kwargs['model']
        else:
            # BUGFIX: os.path.basename(None) raised an opaque TypeError when
            # LOCAL_LLM was unset; fail with an actionable message instead.
            local_llm = os.environ.get('LOCAL_LLM')
            if local_llm is None:
                raise ValueError(
                    'MATH-Vision evaluation needs judge_kwargs["model"] or the '
                    'LOCAL_LLM environment variable to identify the judge model')
            model = os.path.basename(local_llm)
        suffix = eval_file.split('.')[-1]
        # Judge results (.xlsx) and resume checkpoint (.pkl) live next to eval_file.
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already graded in a previous partial run.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MATH_V_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Checkpoint must agree with the freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MATH_V_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
430
+
431
+
432
class OlympiadBench(ImageBaseDataset):
    """OlympiadBench: Olympiad-level math/physics problems (EN + CN sources).

    Grading is rule-based via ``MathJudger`` (no LLM judge). The score CSV
    reports per-source accuracy, language x subject grains, per-subject
    grains, and the overall average.
    """

    TYPE = 'VQA_ex_prompt'
    DATASET_URL = {
        'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
        'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
        'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
    }
    DATASET_MD5 = {
        'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
        'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
        'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
    }

    def dump_image(self, line):
        """Decode the base64 image(s) of one record to disk; return path list."""
        os.makedirs(self.img_root, exist_ok=True)

        tgt_path_z = []
        if isinstance(line['image'], list):
            # Multi-image sample: one file per image with a 1-based suffix.
            for i in range(len(line['image'])):
                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'][i], tgt_path)
                tgt_path_z.append(tgt_path)
        else:
            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
            if not read_ok(tgt_path):
                decode_base64_to_image_file(line['image'], tgt_path)
            tgt_path_z.append(tgt_path)
        return tgt_path_z

    def build_prompt(self, line):
        """Build the language/subject-specific instruction, then append images."""
        from .utils.olympiadbench import get_answer_type_text, make_input

        # Problem facets are encoded in the `source` field (e.g. 'OE_MM_maths_zh_CEE').
        self.is_chinese = 'zh' in line['source']
        self.is_math = 'maths' in line['source']
        self.is_theorem_proving = 'TP' in line['source']

        if self.is_chinese:
            subject_content = '数学' if self.is_math else '物理'
            if self.is_theorem_proving:
                prompt = (
                    f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
                    "证明过程中使用的变量和公式请使用LaTeX格式表示。"
                )
            else:
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
                                                        multiple_answer=line['is_multiple_answer'])
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
                else:
                    multiple_answer_text = '\\boxed{答案}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(单位)'
                    unit_text = ',注意答案的单位不要放在\\boxed{}中'
                prompt = (
                    f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
                    f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
                    f'显式给出结果{unit_text}。'
                )
        else:
            subject_content = 'Math' if self.is_math else 'Physics'
            if self.is_theorem_proving:
                prompt = (
                    f'The following is a theorem proving problem from an International {subject_content} competition. '
                    'Please use logical reasoning and common theorems to prove the proposition in the problem '
                    'according to the given requirements. '
                    'Please use LaTeX format to represent the variables and formulas used in the proof.'
                )
            else:
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{multiple answers connected with commas}'
                else:
                    multiple_answer_text = '\\boxed{answer}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(unit)'
                    unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
                                                        multiple_answer=line['is_multiple_answer'])
                prompt = (
                    f'The following is an open-ended problem from an International {subject_content} competition. '
                    f'{answer_type_text}Please calculate the answer according to the given requirements and '
                    'the information provided. Please use LaTeX format to represent the variables and formulas '
                    'used in the solution process and results. Please end your solution with "So the final answer '
                    f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
                )

        # Physics problems may carry extra context; math problems never do.
        if self.is_math:
            input = make_input(prompt, line['question'])
        else:
            if 'context' in line.keys() and str(line['context']) != 'nan':  # cannot be null
                input = make_input(prompt, line['context'] + '\n' + line['question'])
            else:
                input = make_input(prompt, line['question'])

        ret = [dict(type='text', value=input)]
        tgt_path = self.dump_image(line)

        ret.extend([dict(type='image', value=s) for s in tgt_path])

        return ret

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rule-judge predictions and return the accuracy table (DataFrame).

        Accuracies are computed at three granularities (per source, per
        language x subject, per subject) plus an overall average.
        """
        from .utils.olympiadbench import MathJudger, extract_answer
        judger = MathJudger()

        suffix = eval_file.split('.')[-1]
        name_str1 = 'judge'
        name_str2 = 'score'
        result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            scorez = []

            for i in tqdm(data.iterrows()):
                line = i[1]
                model_answer = line['prediction']
                is_chinese = 'zh' in line['source']
                model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
                answer_type = line['answer_type']

                # Strip the surrounding '[' ... ']' / quote characters of the GT.
                final_answer = line['final_answer'][2:-2]

                if str(answer_type) != 'nan' and 'Tuple' in answer_type:
                    judge_result = judger.judge(model_answer, final_answer)
                else:
                    # `error` carries the allowed numeric tolerance(s), if any.
                    if str(line['error']) != 'nan':
                        if ',' in line['error']:
                            precisions = line['error'].split(',')
                            precisions = [float(p) if p else 1e-8 for p in precisions]
                            judge_result = judger.judge(model_answer, final_answer, precisions)
                        else:
                            precision = float(line['error'])
                            judge_result = judger.judge(model_answer, final_answer, precision)
                    else:
                        judge_result = judger.judge(model_answer, final_answer)
                scorez.append(judge_result)

            data['score'] = scorez
            dump(data, result_file)

        judge_file = load(result_file)

        if not osp.exists(score_file):
            name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
                         'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
                         'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']

            sample_list = [[] for _ in range(len(name_list))]
            for i in judge_file.iterrows():
                line = i[1]
                for j in range(len(name_list)):
                    if line['source'] == name_list[j]:
                        sample_list[j].append(line['score'])

            acc_dict = {}
            correct_list = []

            # fine-grained (per source)
            for i in range(len(name_list)):
                correct_num = 0
                for j in sample_list[i]:
                    if j:
                        correct_num += 1
                correct_list.append(correct_num)
                # BUGFIX: guard against ZeroDivisionError when a source
                # category has no samples in the evaluated subset.
                acc = 100 * correct_num / len(sample_list[i]) if sample_list[i] else 0.0
                acc_dict[name_list[i]] = [acc]

            # 4 grained (language x subject)
            labela = ['zh', 'en']
            labelb = ['maths', 'physics']

            grain_list = [[x,y] for x in labela for y in labelb]
            for j in grain_list:
                dict_name = j[0] + "_" + j[1]
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if all(k in name_list[i] for k in j):
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                # BUGFIX: empty grain bucket must not divide by zero.
                acc = 100 * correct_num / full_num if full_num else 0.0
                acc_dict[dict_name] = [acc]

            # 2 grained (subject only)
            grain_list = ['maths', 'physics']
            for j in grain_list:
                dict_name = j
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if j in name_list[i]:
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                acc = 100 * correct_num / full_num if full_num else 0.0
                acc_dict[dict_name] = [acc]

            # AVG over all judged samples
            correct_num = sum(correct_list)
            acc = 100 * correct_num / len(judge_file) if len(judge_file) else 0.0
            acc_dict['AVG'] = [acc]

            acc_pd = pd.DataFrame(acc_dict)
            # gbk encoding keeps the CSV readable on zh-CN Windows tooling.
            acc_pd.to_csv(score_file, index=False, encoding='gbk')

        accdz = pd.read_csv(score_file)
        return accdz
644
+
645
+
646
class WeMath(ImageBaseDataset):
    """We-Math benchmark: multi-step visual math reasoning (MCQ-style).

    Choice extraction uses exact matching or an optional GPT judge; scores
    combine overall accuracy with We-Math's four-dimensional metrics.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'WeMath': 'https://opencompass.openxlab.space/utils/VLMEval/WeMath.tsv'
    }
    DATASET_MD5 = {'WeMath': '056142c89b09d864702450b5b5ea0913'}

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate `eval_file`; returns a DataFrame of combined scores."""
        from .utils.wemath import wemath_evaluate_models, wemath_accuracy
        from .utils.multiple_choice import mcq_vanilla_eval

        # Fall back to exact matching when no judge model is requested.
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
        name_str = name_str_map[model] if model in name_str_map else model

        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage) and model is not None:
            result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

            # BUGFIX: the original loaded eval_file twice back to back;
            # a single load is sufficient.
            data = load(eval_file)
            data = data.sort_values(by='index')
            data['prediction'] = [str(x) for x in data['prediction']]
            # If not choice label, then use lower case
            for k in data.keys():
                data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

            # The eval file must reference questions from this dataset.
            meta = self.data
            meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
            data_map = {x: y for x, y in zip(data['index'], data['question'])}
            for k in data_map:
                assert k in meta_q_map, (
                    f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
                )
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

            if 'id' in data.columns:
                # Rename 'id' -> 'ID' as expected by the WeMath scorers.
                data.rename(columns={'id': 'ID'}, inplace=True)
            dump(data, storage)

        # Score either the judged file (if produced) or the raw predictions.
        if osp.exists(storage):
            accuracy_scores = wemath_evaluate_models(storage)
            four_dim_scores = wemath_accuracy(storage)
        else:
            accuracy_scores = wemath_evaluate_models(eval_file)
            four_dim_scores = wemath_accuracy(eval_file)
        combine_score = {**accuracy_scores, **four_dim_scores}
        combine_score = pd.DataFrame(combine_score)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(combine_score, score_pth)
        return combine_score
714
+
715
+
716
class LogicVista(ImageBaseDataset):
    """LogicVista: visual logical-reasoning benchmark, judged by an LLM
    (falls back to exact matching when no judge/API is available)."""

    TYPE = 'VQA'
    DATASET_URL = {
        'LogicVista': 'https://opencompass.openxlab.space/utils/VLMEval/LogicVista.tsv'
    }
    DATASET_MD5 = {'LogicVista': '41c5d33adf33765c399e0e6ae588c061'}

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate `eval_file` with an LLM judge; returns the score DataFrame."""
        from .utils.logicvista import LogicVista_auxeval, evaluate_logicvista

        # Fall back to exact matching when no judge model is requested.
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Degrade gracefully to exact matching when the API is unusable.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        suffix = eval_file.split('.')[-1]
        # Judge results (.xlsx) and resume checkpoint (.pkl).
        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{name_str}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage) and model is not None:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('LogicVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already judged in a previous checkpoint.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    LogicVista_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Checkpoint must agree with the freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] and ans[k]['hit'] == v['hit']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            data['hit'] = [ans[idx]['hit'] for idx in data['index']]

            dump(data, storage)
        # NOTE(review): when model is None (exact matching), `storage` is never
        # written, so this branch is skipped and nothing is returned — confirm
        # whether exact-matching scoring is handled upstream.
        if osp.exists(storage):
            accuracy_scores = evaluate_logicvista(storage)
            score_pth = storage.replace('.xlsx', '_score.csv')
            dump(accuracy_scores, score_pth)

            return accuracy_scores
789
+
790
class LLaVABench(ImageBaseDataset):
    """LLaVA-Bench (in-the-wild): open-ended VQA scored pairwise by a GPT judge."""

    TYPE = 'VQA'
    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
    DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.llavabench import (
            build_prompt,
            LLaVABench_atomeval,
            LLaVABench_score,
        )

        ext = '.' + eval_file.split('.')[-1]
        record_file = eval_file.replace(ext, '_openai_result' + ext)
        score_file = eval_file.replace(ext, '_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)
        system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'

        # Query the judge only once; subsequent calls reuse the cached record file.
        if not osp.exists(record_file):
            data = load(eval_file)
            judge = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
            assert judge.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            rows = [data.iloc[idx] for idx in range(len(data))]
            jobs = [(judge, build_prompt(row)) for row in rows]
            pair_scores = track_progress_rich(LLaVABench_atomeval, jobs, nproc=nproc, chunksize=nproc)
            # Each result is (reference GPT-4 score, candidate score).
            data['gpt4_score'] = [pair[0] for pair in pair_scores]
            data['score'] = [pair[1] for pair in pair_scores]
            dump(data, record_file)

        ret = LLaVABench_score(load(record_file)).round(1)
        dump(ret, score_file)
        return ret
827
+
828
+
829
class MMVet(ImageBaseDataset):
    """MM-Vet: integrated-capability open-ended VQA, graded by a GPT judge."""

    TYPE = 'VQA'
    DATASET_URL = {
        'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv',
        'MMVet_Hard': 'http://opencompass.openxlab.space/utils/VLMEval/MMVet_Hard.tsv'
    }
    DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3', 'MMVet_Hard': '63a598819a936a2e77c410a78a21ff16'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge-grade `eval_file`; returns the overall score DataFrame."""
        from .utils.mmvet import MMVet_auxeval, MMVet_acc

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']
        # Judge results (.xlsx) and resume checkpoint (.pkl).
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        if not osp.exists(storage):
            data = load(eval_file)
            # max_tokens=3: the judge only needs to emit a short numeric score.
            model = build_judge(max_tokens=3, **judge_kwargs)
            assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already graded in a previous checkpoint.
            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MMVet_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Checkpoint must agree with the freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        # Overall score plus the fine-grained per-capability breakdown.
        score, score_fine = MMVet_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
        dump(score, score_pth)
        dump(score_fine, score_fine_pth)
        return score
884
+
885
+
886
class MTVQADataset(ImageBaseDataset):
    """MTVQA: multilingual text-centric VQA scored by GT-substring matching."""

    TYPE = 'VQA'
    DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
    DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data and 'category' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        if 'split' in data:
            assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '

        # One hit list per category; a hit means the normalized GT answer
        # appears as a substring of the normalized prediction.
        category_scores = defaultdict(list)
        for row_id in range(len(data)):
            row = data.iloc[row_id]
            gt = row['answer'].strip().lower().replace('.', '')
            pred = row['prediction'].strip().lower().replace('.', '')
            hit = 1.0 if gt in pred else 0.0
            category_scores[row['category']].append(hit)
            category_scores['Average'].append(hit)
        # Per-category mean, normalized to [0, 100].
        category_averages = {cate: np.mean(hits) * 100 for cate, hits in category_scores.items()}

        ext = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{ext}', '_acc.json')
        dump(category_averages, result_file)

        return category_averages

    # MT-VQA adopts a custom prompt
    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        for item in msgs:
            if item['type'] == 'text':
                item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
        return msgs
926
+
927
+
928
class TableVQABench(ImageBaseDataset):
    """TableVQA-Bench: table-understanding VQA over four sub-splits, each
    scored by its own accuracy evaluator."""

    TYPE = 'VQA'
    DATASET_URL = {
        'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
    }
    DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}

    from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        import pandas as pd
        from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq

        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data

        # Strip a leading 'Answer: ' prefix that some models emit.
        data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
        split_groups = dict(tuple(data.groupby('split')))
        eval_result = {'split': [], 'average_scores': []}
        for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
            records = split_groups[split].to_dict(orient='records')
            if split == 'fintabnetqa':
                split_meta = evaluate_fintabnet(records, ['accuracy'])
            elif split == 'vtabfact':
                split_meta = evaluate_tabfact(records, ['accuracy'])
            else:  # 'vwtq' and 'vwtq_syn' share the WTQ evaluator
                split_meta = evaluate_wtq(records, ['accuracy'])
            eval_result['split'].append(split)
            eval_result['average_scores'].append(split_meta['average_scores'])

        ext = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{ext}', '_acc.csv')
        eval_result = pd.DataFrame(eval_result)
        dump(eval_result, result_file)

        return eval_result

    # TableVQABench adopts a custom prompt
    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        # Each split wraps the question in its own instruction template.
        template_map = {
            'fintabnetqa': self.FINTABNETQA_PROMPT,
            'vtabfact': self.VTABFACT_PROMPT,
            'vwtq': self.VWTQ_PROMPT,
            'vwtq_syn': self.VWTQ_PROMPT,
        }
        template = template_map.get(line['split'])
        for item in msgs:
            if item['type'] == 'text' and template is not None:
                item['value'] = template.format_map({'question': item['value']})
        return msgs
980
+
981
+
982
class CustomVQADataset(ImageBaseDataset):
    """User-supplied VQA dataset loaded from ``<LMUDataRoot>/<dataset>.tsv``.

    Subclasses (or callers) must supply their own ``evaluate``.
    """

    TYPE = 'VQA'

    def load_data(self, dataset):
        data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        if file_size(data_path, 'GB') <= 1:
            return load(data_path)

        # Oversized TSVs (>1 GB) are localized once to *_local.tsv for faster
        # reads; set FORCE_LOCAL to refresh the localized copy.
        local_path = data_path.replace('.tsv', '_local.tsv')
        if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
            from ..tools import LOCALIZE

            LOCALIZE(data_path, local_path)
        return load(local_path)

    def evaluate(self, eval_file, **judge_kwargs):
        raise NotImplementedError
999
+
1000
+
1001
class CRPE(ImageBaseDataset):
    """CRPE: relation-probing evaluation over four categories
    (exist / subject / predicate / object), scored by exact-match accuracy."""

    TYPE = 'VQA'
    DATASET_URL = {
        'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
        'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
    }
    DATASET_MD5 = {
        'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
        'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score `eval_file`; returns per-category accuracy (None if a
        category has no samples)."""
        from .utils.crpe import is_correct
        score = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        num = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        final_score_dict = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = str(line['prediction'])
            answers = str(line['answer'])
            category = line['category']
            # Tally both the per-category and the overall counters.
            if is_correct(answers, predict):
                score[category] += 1
                score['total'] += 1
            num[category] += 1
            num['total'] += 1

        for category in ['exist', 'subject', 'predicate', 'object', 'total']:
            if num[category] != 0:
                final_score_dict[category] = score[category] / num[category]
            else:
                final_score_dict[category] = None

        # BUGFIX: the old code replaced the literal '.xlsx' suffix, so for any
        # other eval_file extension score_pth == eval_file and dump() silently
        # overwrote the evaluation file itself. Derive from the real suffix.
        suffix = eval_file.split('.')[-1]
        score_pth = eval_file.replace(f'.{suffix}', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        """Resolve relative image paths against the dataset's image root."""
        ROOT = LMUDataRoot()
        msgs = super().build_prompt(line)
        for msg in msgs:
            if msg['type'] == 'image':
                msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value'])
        return msgs
1070
+
1071
+
1072
class QSpatial(ImageBaseDataset):
    """Q-Spatial-Bench: VQA benchmark probing quantitative spatial reasoning
    (metric distance estimation) of VLMs.

    Predictions and ground truth are normalized to centimeters; a prediction
    is scored correct when max(pred/gt, gt/pred) is below a delta threshold.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'QSpatial_plus': '',
        'QSpatial_scannet': ''
    }

    # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
    # Once you get the permission, you can use the helper code here to download and extract necessary images:
    # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
    qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
    url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"

    def post_build(self, dataset):
        """Download the official prompt templates from GitHub and cache them on
        the instance (``self.system_prompt`` / ``self._prompt_templates``)."""
        import urllib.request

        template_names = [
            "system_prompt.txt",
            "spatial_prompt_single.txt",
            "spatial_prompt_steps.txt",
            "standard_prompt.txt",
            "zero_shot_prompt.txt",
        ]
        with tempfile.TemporaryDirectory() as temp_dir:
            # Fix: fetch with urllib instead of `os.system(f"wget ...")`, which
            # failed silently on systems without wget and interpolated the URL
            # into a shell command.
            for name in template_names:
                urllib.request.urlretrieve(self.url + name, os.path.join(temp_dir, name))

            def _read(name):
                # Fix: the original `open(...).read()` never closed the handles.
                with open(os.path.join(temp_dir, name)) as f:
                    return f.read()

            self.system_prompt = _read("system_prompt.txt")
            self._prompt_templates = dict(
                spatial_prompt_single=_read("spatial_prompt_single.txt"),
                spatial_prompt_steps=_read("spatial_prompt_steps.txt"),
                standard_prompt=_read("standard_prompt.txt"),
                zero_shot_prompt=_read("zero_shot_prompt.txt"),
            )

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Render the spatial prompt template with the record's question and
        prepend the system prompt; images come first in the message list."""
        from jinja2.sandbox import SandboxedEnvironment
        text_prompt_template = self._prompt_templates["spatial_prompt_single"]
        env = SandboxedEnvironment()
        text_prompt = env.from_string(text_prompt_template).render(question=line["question"])
        tgt_path = self.dump_image(line)

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]

        msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
        return msgs

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        """Load the HF split and normalize it: add an `index` column, pack
        (answer_value, answer_unit) into `answer`, and base64-encode images."""
        import io
        import pandas as pd
        from datasets import load_dataset

        hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
        df = hf_dataset.to_pandas()

        df.reset_index(drop=True, inplace=True)
        df['index'] = df.index
        df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
        df = df[['index'] + [col for col in df.columns if col != 'index']]

        if dataset == "QSpatial_scannet":
            # ScanNet images are not distributed with the HF dataset; load them
            # from the locally prepared `qspatial_root` instead.
            df = df.drop(columns=["image"])
            df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]]
        else:
            df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]]

        df["image"] = [encode_image_to_base64(image) for image in df["image"]]
        return df

    @classmethod
    def get_multiplier(cls, unit):
        """Return the factor that converts `unit` to centimeters (0. and a
        warning print for unknown units, which zeroes the prediction)."""
        unit = unit.lower()
        if unit in ["meters", "meter", "m", "metre", "metres"]:
            multiplier = 100
        elif unit in ["centimeters", "centimeter", "cm"]:
            multiplier = 1
        elif unit in ["feet", "foot", "ft"]:
            multiplier = 30.48
        elif unit in ["inch", "inches", "in"]:
            multiplier = 2.54
        elif unit in ["mm"]:
            multiplier = 0.1
        else:
            print(f"Unknown unit: {unit}")
            multiplier = 0.

        return multiplier

    @classmethod
    def parse_string(cls, input_str):
        """Parse a judge answer of the form '(<number-or-range>, <unit>)' into
        centimeters; a range 'a-b' is averaged. Returns 0 when unparseable."""
        match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
        if match:
            number_part = match.group(1)
            text = match.group(2)

            if '-' in number_part:
                # A range such as "3-5": use its midpoint.
                start, end = map(float, number_part.split('-'))
                number = (start + end) / 2
            else:
                number = float(number_part)

            return number * cls.get_multiplier(text)
        else:
            print(f"Unable to parse the input string {input_str}")
            return 0

    @classmethod
    def parse_prediction(cls, vlm_response):
        """Extract the last `scalar{...}` / `distance_unit{...}` boxes from a
        raw VLM response and return the value in centimeters.

        Raises IndexError when no box is found (handled by the caller).
        """
        # Value: average all numbers inside the last scalar{...} box.
        pattern = r'scalar{([^}]*)}'
        str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
        scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
        parsed_scalar = np.array(scalar_list).astype(float).mean()

        # Unit: last distance_unit{...} box.
        pattern = r'distance_unit{([^}]*)}'
        str_inside_unit_boxes = re.findall(pattern, vlm_response)
        parsed_unit = str_inside_unit_boxes[-1]

        pred_value_in_cms = parsed_scalar * cls.get_multiplier(parsed_unit)
        return pred_value_in_cms

    # It returns a dictionary
    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score predictions in `eval_file` and dump a `_score.json`.

        With `model` in judge_kwargs, answers are first extracted by an LLM
        judge (cached in an intermediate xlsx/pkl); otherwise they are parsed
        by regex. Returns the dict of delta-2 / delta-1.5 accuracies overall
        and per question type.
        """
        from ast import literal_eval

        data = load(eval_file)
        if "model" in judge_kwargs:
            from .utils.qspatial import QSpatial_auxeval

            # extract using model
            model = judge_kwargs['model']
            suffix = eval_file.split('.')[-1]
            storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
            tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
            nproc = judge_kwargs.pop('nproc', 4)

            if not osp.exists(storage):
                model = build_judge(max_tokens=128, **judge_kwargs)

                assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
                lt = len(data)
                lines = [data.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = [line['index'] for line in lines]

                # Resume from the cached pkl if a previous run was interrupted.
                ans = {}
                if osp.exists(tmp_file):
                    ans = load(tmp_file)
                    tups = [x for x, i in zip(tups, indices) if i not in ans]
                    indices = [i for i in indices if i not in ans]

                if len(indices):
                    new_results = track_progress_rich(
                        QSpatial_auxeval,
                        tups,
                        nproc=nproc,
                        chunksize=nproc,
                        keys=indices,
                        save=tmp_file,
                    )
                    ans = load(tmp_file)
                    for k, v in zip(indices, new_results):
                        assert k in ans
                        assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

                data['res'] = [ans[idx]['res'] for idx in data['index']]
                data['log'] = [ans[idx]['log'] for idx in data['index']]
                dump(data, storage)

            data = load(storage)

            pred_value_in_cms = []
            for res in data["res"]:
                try:
                    pred_value_in_cms.append(cls.parse_string(res))
                except ValueError:
                    pred_value_in_cms.append(0.)

            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
        else:
            # regex parsing
            pred_value_in_cms = []
            n_errors_in_parsing = 0
            for pred in data["prediction"]:
                try:
                    parsed_value = cls.parse_prediction(pred)
                except IndexError:
                    n_errors_in_parsing += 1
                    parsed_value = 1e-8

                pred_value_in_cms.append(parsed_value)

            print(f"Encounter {n_errors_in_parsing} errors in parsing")
            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8

        # Ground truth: `answer` is the string repr of a (value, unit) tuple.
        # Fix: parse with ast.literal_eval instead of eval (no code execution).
        ground_truth_value_in_cms = []
        for answer in data["answer"]:
            value, unit = literal_eval(answer)
            ground_truth_value_in_cms.append(value * cls.get_multiplier(unit))
        ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8

        # Score: correct iff max(pred/gt, gt/pred) is under the delta threshold.
        pred_gt = pred_value_in_cms / ground_truth_value_in_cms
        gt_pred = ground_truth_value_in_cms / pred_value_in_cms
        delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
        delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5

        data["eval_score_delta_2"] = delta_2
        data["eval_score_delta_1_point_5"] = delta_1_point_5

        final_score_dict = {
            "delta_2": delta_2.mean(),
            "delta_1_point_5": delta_1_point_5.mean()
        }
        for question_type in set(data["question_type"]):
            filtered_data = data[data["question_type"] == question_type]
            delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean()
            delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean()
            final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type})
            final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type})

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
1307
+
1308
+
1309
class MMNIAH(ImageBaseDataset):
    """MM-NIAH (multimodal needle-in-a-haystack) long-context VQA benchmark.

    The TEST split TSV is shipped as five binary parts (part-aa .. part-ae)
    that must be concatenated before loading.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'MM_NIAH_VAL':
        'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
        'MM_NIAH_TEST':
        ['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
    DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
                   'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}

    def prepare_tsv(self, url, file_md5=None):
        """Download (and for TEST, reassemble from parts) the dataset TSV,
        localizing it when it exceeds 1 GB. Returns the loaded dataframe."""
        import os
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        elif file_name == 'MM_NIAH_TEST.tsv':
            warnings.warn('The dataset tsv is not downloaded')
            for i in range(len(url)):
                part_path = osp.join(data_root, 'part-a' + chr(ord('a') + i))
                if osp.exists(part_path):
                    print('part_a' + chr(ord('a') + i) + ' is existed')
                    continue
                # Fix: download each split part to its own `part-aX` file. The
                # original passed `data_path` (the final TSV path), so parts
                # never landed where the existence check and the concatenation
                # below expect them.
                download_file(url[i], part_path)
            file_prefix = 'part-'
            output_file = data_path
            split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
            with open(output_file, 'wb') as outfile:
                # Read each split part in order and append it to the output file.
                for filename in split_files:
                    with open(osp.join(data_root, filename), 'rb') as infile:
                        outfile.write(infile.read())
            update_flag = True
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            # Large TSVs embed base64 images; LOCALIZE rewrites them to disk.
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score predictions per category and overall; dumps `_score.json`
        and returns the accuracy dict (None for categories with no samples)."""
        from .utils.mmniah import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        categories = ['count-text', 'find-image', 'find-text',
                      'infer-choose', 'count-image', 'visual-reasoning', 'total']
        MMNIAH_score = {c: 0 for c in categories}
        MMNIAH_num = {c: 0 for c in categories}
        final_score_dict = {c: 0 for c in categories}
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = line['prediction']
            answers = line['answer']
            category = line['category']
            if category in ['visual-reasoning', 'find-image']:
                # These categories store the answer as an integer option index.
                answers = int(answers)
            if is_correct(answers, predict):
                MMNIAH_score[category] += 1
                MMNIAH_score['total'] += 1
            MMNIAH_num[category] += 1
            MMNIAH_num['total'] += 1

        for category in ['find-image', 'count-text', 'find-text',
                         'infer-choose', 'count-image', 'visual-reasoning', 'total']:
            if MMNIAH_num[category] != 0:
                final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
            else:
                final_score_dict[category] = None

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        """Interleave the record's question text and images around `<image>`
        placeholders, appending MCQ options or a short-answer instruction."""
        msgs = super().build_prompt(line)
        if isinstance(line, int):
            line = self.data.iloc[line]
        totalchoice = line['multi-choice options']
        # NOTE(review): dataset-authored field evaluated with eval(); fine for
        # trusted TSVs, but ast.literal_eval would be safer — confirm format.
        totalchoice = eval(totalchoice)
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        context = msgs[-1]['value']
        context = eval(context)
        question = context[0] + '\n' + context[1]
        # tgt_path: the list of all image paths in this record.
        tgt_path = []
        for i in range(len(msgs) - 1):
            tgt_path.append(msgs[i]['value'])
        choices = totalchoice[0]
        choices_image = totalchoice[1]
        if choices:
            for c_idx, c in enumerate(choices):
                question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
            question += "\nAnswer with the option's letter from the given choices directly."
        elif choices_image:
            # Image options: 4 trailing <image> placeholders stand for A-D.
            for c_idx in range(len(choices_image)):
                question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
            question += "\nAnswer with the option's letter from the given choices directly."
        else:
            question += '\nAnswer the question using a single word or phrase.'
        # Sentinels mark the string ends so the markers can be stripped after
        # splitting; '<start>' is 7 chars, '<end>' is 5.
        question = '<start>' + question + '<end>'
        question = question.split('<image>')
        if choices_image:
            for i in range(len(question) - 5):
                question[i] = question[i] + '\n<image>'
            for i in range(len(question) - 5, len(question) - 1):
                question[i] = question[i] + '<image>'
        else:
            for i in range(len(question) - 1):
                question[i] = question[i] + '\n<image>'
        assert len(tgt_path) + 1 == len(question)
        context = []
        for i in range(len(tgt_path)):
            context.append(question[i])
            context.append(tgt_path[i])
        context.append(question[-1])
        context[0] = context[0][7:]
        context[-1] = context[-1][:-5]
        msgs = []
        for i in range(len(context)):
            if i % 2 == 0:
                msgs.append(dict(type='text', value=context[i]))
            else:
                ROOT = LMUDataRoot()
                msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i])))
        # Fix: the original removed items from `msgs` while iterating over it,
        # which skips the element immediately after each removal; filter instead.
        msgs = [m for m in msgs if m['value'] != '']
        return msgs
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..smp import *
2
+ from ..utils import *
3
+ from .image_base import ImageBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+
6
+
7
class ImageYORNDataset(ImageBaseDataset):
    """Yes-or-No (Y/N) image QA datasets: MME, HallusionBench, POPE, AMBER.

    Predictions are normalized to Yes/No by rule-based extraction
    (YOrN_Extraction), with an optional LLM judge fallback for responses the
    rules leave as 'Unknown'.
    """

    TYPE = 'Y/N'

    # TSV download URL per dataset name.
    DATASET_URL = {
        'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
        'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
        'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
        'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
    }

    # Expected md5 checksum per dataset TSV.
    DATASET_MD5 = {
        'MME': 'b36b43c3f09801f5d368627fb92187c3',
        'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
        'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
        'AMBER': '970d94c0410916166e0a76ba75da7934',
    }

    # Returns the rating produced by the dataset-specific *_rating helper
    # (a dataframe-like score table), and dumps it to `_score.csv`.
    def evaluate(self, eval_file, **judge_kwargs):
        """Score Y/N predictions in `eval_file`.

        Pipeline: rule-extract Yes/No per row -> optionally resolve remaining
        'Unknown' rows with an LLM judge (cached in `_auxmatch.xlsx` /
        `_tmp.pkl`) -> compare against answers -> dataset-specific rating.
        """
        from .utils.yorn import YOrN_Extraction, YOrN_auxeval
        from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating

        dataset = self.dataset_name
        data = load(eval_file)
        data['prediction'] = [str(x) for x in data['prediction']]
        # Cache files: storage holds extracted answers, tmp_file holds partial
        # LLM-judge results so an interrupted run can resume.
        storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            # Rule-based extraction first; merge any earlier judge results.
            ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
            if osp.exists(tmp_file):
                tmp = load(tmp_file)
                for k in tmp:
                    if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
                        ans_map[k] = tmp[k]

            data['extracted'] = [ans_map[x] for x in data['index']]
            unknown = data[data['extracted'] == 'Unknown']

            # Fall back to exact matching when no judge model is usable.
            model = judge_kwargs.get('model', 'exact_matching')
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                model = None
                warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')

            if model is not None:
                # Only the rows still 'Unknown' are sent to the judge.
                lt = len(unknown)
                lines = [unknown.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = list(unknown['index'])
                if len(tups):
                    res = track_progress_rich(
                        YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
                    for k, v in zip(indices, res):
                        ans_map[k] = v

                data['extracted'] = [ans_map[x] for x in data['index']]
                dump(data, storage)

        data = load(storage)
        # AMBER answers are compared case-insensitively.
        if listinstr(['AMBER'], dataset):
            data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
        else:
            data['score'] = (data['answer'] == data['extracted'])
        dump(data, storage)

        # Dispatch to the dataset-specific rating scheme.
        if dataset is not None and listinstr(['MME'], dataset):
            score = MME_rating(storage)
        elif dataset is not None and listinstr(['Hallusion'], dataset):
            score = Hallusion_rating(storage)
        elif dataset is not None and listinstr(['POPE'], dataset):
            score = POPE_rating(storage)
        elif dataset is not None and listinstr(['AMBER'], dataset):
            score = AMBER_rating(storage)
        else:
            score = default_rating(storage)

        score_tgt = eval_file.replace('.xlsx', '_score.csv')
        dump(score, score_tgt)
        return score
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import snapshot_download
2
+ from ..smp import *
3
+ from .video_base import VideoBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+ from glob import glob
6
+
7
+ FAIL_MSG = 'Failed to obtain answer via API.'
8
+
9
+
10
def timestamp_to_seconds(timestamp):
    """Convert an ``"HH:MM:SS[.fff]"`` string into total seconds as a float."""
    hours, minutes, seconds = timestamp.split(":")
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
16
+
17
+
18
def uniformly_subsample(lst, K):
    """Uniformly pick at most K elements from `lst`, preserving order.

    Returns `lst` itself (not a copy) when K >= len(lst). Fix: K <= 0 now
    returns an empty list instead of raising ZeroDivisionError on `n / K`.
    """
    if K <= 0:
        return []
    n = len(lst)
    if K >= n:
        return lst
    step = n / K
    return [lst[int(i * step)] for i in range(K)]
24
+
25
+
26
def insert_subtitles_into_frames(
    frames,
    frame_timestamps,
    subtitles,
    starting_timestamp_for_subtitles,
    duration,
):
    """Interleave subtitle texts between video frames by timestamp.

    Walks `subtitles` in order; before each subtitle, emits all frames whose
    timestamp is <= the subtitle's midpoint, then emits the subtitle text if
    at least one frame falls inside its (possibly widened) time window.
    Remaining frames are appended at the end. Returns a list of
    {"type": "image"|"text", "value": ...} message dicts.

    Assumes `subtitles` is sorted by time and `frame_timestamps` is ascending
    and aligned 1:1 with `frames` — TODO confirm with callers.
    """
    interleaved_list = []
    cur_i = 0  # index of the first frame not yet emitted

    for subtitle in subtitles:
        # Two subtitle schemas: {"timestamp": (start, end), "text": ...} or
        # {"start": "HH:MM:SS", "end": "HH:MM:SS", "line": ...}.
        if "timestamp" in subtitle:
            start, end = subtitle["timestamp"]

            # A non-float end (e.g. None) means the subtitle runs to the end.
            if not isinstance(end, float):
                end = duration

            # Shift into the clip's local timeline.
            start -= starting_timestamp_for_subtitles
            end -= starting_timestamp_for_subtitles

            subtitle_timestamp = (start + end) / 2
            subtitle_text = subtitle["text"]
        else:
            start, end = subtitle["start"], subtitle["end"]
            start = timestamp_to_seconds(start)
            end = timestamp_to_seconds(end)
            start -= starting_timestamp_for_subtitles
            end -= starting_timestamp_for_subtitles

            subtitle_timestamp = (start + end) / 2
            subtitle_text = subtitle["line"]

        # Emit every not-yet-emitted frame up to the subtitle midpoint.
        for i, (frame, frame_timestamp) in enumerate(
            zip(frames[cur_i:], frame_timestamps[cur_i:])
        ):
            if frame_timestamp <= subtitle_timestamp:
                # print("frame:", frame_timestamp)
                interleaved_list.append({"type": "image", "value": frame})
                cur_i += 1
            else:
                break

        # Widen very short subtitle windows to 1s around the midpoint so the
        # coverage test below has a chance to match a frame.
        if end - start < 1:
            end = subtitle_timestamp + 0.5
            start = subtitle_timestamp - 0.5

        # Only keep subtitles whose window overlaps at least one sampled frame
        # (scans all frames, not just the remaining ones).
        covering_frames = False
        for frame, frame_timestamp in zip(frames, frame_timestamps):
            if frame_timestamp < end and frame_timestamp > start:
                covering_frames = True
                break

        if covering_frames:
            interleaved_list.append({"type": "text", "value": subtitle_text + "\n"})
        else:
            # Subtitle has no frame inside its window: drop it silently.
            pass

    # Append all frames after the last subtitle.
    for i, (frame, frame_timestamp) in enumerate(
        zip(frames[cur_i:], frame_timestamps[cur_i:])
    ):
        interleaved_list.append({"type": "image", "value": frame})
    return interleaved_list
88
+
89
+
90
class LongVideoBench(VideoBaseDataset):
    """LongVideoBench: long-video multiple-choice QA (Video-MCQ).

    Handles dataset download/extraction from HF (or ModelScope), frame
    sampling, subtitle interleaving, and MCQ scoring.
    """

    # Expected md5 of the generated <dataset>.tsv index file.
    MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9'
    # System prompt prepended to every message (currently empty).
    SYS = ''

    TYPE = 'Video-MCQ'

    def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1):
        """`nframe` > 0 samples a fixed frame count; `fps` > 0 samples by rate.

        NOTE(review): despite the name, subtitles are inserted when
        `use_subtitle` is False (see build_prompt) — looks inverted; confirm.
        """
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
        self.use_subtitle = use_subtitle
        self.dataset_name = dataset

    @classmethod
    def supported_datasets(cls):
        return ['LongVideoBench']

    def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'):
        """Ensure the dataset (tsv index + videos) exists locally; download,
        reassemble split tars, and extract if needed. Returns
        dict(data_file=..., root=...)."""

        def check_integrity(pth):
            # Valid cache = tsv present, md5 matches, every referenced video exists.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                print("md5 mismatch", md5(data_file), self.MD5)
                return False
            data = load(data_file)
            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    print(video_pth, "is not found")
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/LongVideoBench"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Build the tsv index from lvb_val.json unless a valid one exists.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return

                data_file = pd.read_json(osp.join(pth, 'lvb_val.json'))
                data_file = data_file.assign(index=range(len(data_file)))
                data_file['video'] = data_file['video_id']
                data_file['video_path'] = data_file['video_path'].apply(lambda x: f'./videos/{x}')

                data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_snapshot_download(dataset_id=repo_id)
            else:
                snapshot_download(repo_id=repo_id, repo_type='dataset')
            print("All videos are downloaded for LongVideoBench")

            # Videos ship as multi-part tar archives; reassemble and extract
            # them unless a videos/ directory already exists.
            if not glob(osp.join(cache_path, "videos")):
                tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True)

                def untar_video_data(tar_file, cache_dir):
                    import tarfile
                    with tarfile.open(tar_file, "r") as tar_ref:
                        tar_ref.extractall(cache_dir)
                        print(f"Extracted all files from {tar_file} to {cache_dir}")

                def concat_tar_parts(tar_parts, output_tar):
                    # Byte-concatenate sorted .tar.partXX files into one tar.
                    with open(output_tar, "wb") as out_tar:
                        from tqdm import tqdm
                        for part in tqdm(sorted(tar_parts)):
                            with open(part, "rb") as part_file:
                                out_tar.write(part_file.read())
                    print(f"Concatenated parts {tar_parts} into {output_tar}")

                tar_parts_dict = {}

                # Group tar parts together
                for tar_file in tar_files:
                    base_name = tar_file.split(".tar")[0]
                    if base_name not in tar_parts_dict:
                        tar_parts_dict[base_name] = []
                    tar_parts_dict[base_name].append(tar_file)

                # Concatenate and untar split parts
                for base_name, parts in tar_parts_dict.items():
                    print(f"Extracting following tar files: {parts}")
                    output_tar = base_name + ".tar"
                    if not osp.exists(output_tar):
                        print('Start concatenating tar files')

                        concat_tar_parts(parts, output_tar)
                        print('Finish concatenating tar files')

                    if not osp.exists(osp.join(cache_path, osp.basename(base_name))):
                        untar_video_data(output_tar, cache_path)

                print('All videos are extracted for LongVideoBench')

            dataset_path = cache_path
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        return dict(data_file=data_file, root=dataset_path)

    def save_video_frames(self, video_path, video_llm=False):
        """Sample frame indices from the video and (unless `video_llm`) save
        them as images. Returns (frame_paths, indices, video_info).

        NOTE(review): if both nframe <= 0 and fps <= 0, neither branch runs
        and `indices`/`frame_paths` are unbound (NameError) — confirm callers
        always set one of them.
        """
        vid_path = osp.join(self.data_root, video_path)
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Fixed frame count: evenly spaced interior samples.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video_path[:-4])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video_path[:-4], len(indices))

        # Only decode and write images when some frame file is missing.
        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth) and not video_llm:
                    im.save(pth)

        return frame_paths, indices, video_info

    # def save_video_into_images(self, line, num_frames=8):
    #     frame_paths, indices, video_info = self.save_video_frames(line['video_path'], num_frames)
    #     return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the MCQ message: raw video for video-LLMs, else sampled
        frames (optionally interleaved with subtitles) plus lettered options."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm)
        fps = video_info["fps"]

        message = [dict(type='text', value=self.SYS)]
        if video_llm:
            message.append(dict(type='video', value=osp.join(self.data_root, line['video_path'])))
        else:
            # NOTE(review): subtitles are inserted on `not use_subtitle` —
            # likely an inverted condition; confirm intended semantics.
            if not self.use_subtitle:
                with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f:
                    subtitles = json.load(f)

                frame_message = insert_subtitles_into_frames(
                    frames,
                    [ind_ / fps for ind_ in indices],
                    subtitles,
                    line["starting_timestamp_for_subtitles"],
                    line["duration"]
                )

                message += frame_message
            else:
                for im in frames:
                    message.append(dict(type='image', value=im))

        # Append lettered candidates (A., B., ...) to the question text.
        line['question'] += '\n' + '\n'.join(
            ["{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(eval(line['candidates']))]
        )
        prompt = line["question"] + "\nAnswer with the option's letter from the given choices directly."
        message.append(dict(type='text', value=prompt))
        return message

    # Returns the rating dictionary produced by get_dimension_rating.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score MCQ predictions: regex letter extraction first, LLM-judge
        fallback for unparseable outputs; dumps `_score.xlsx` and
        `_rating.json`, returns the rating dict."""
        from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.get('model', 'exact_matching')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # Drop cached judge answers that were API failures.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                # correct_choice is stored as a 0-based option number.
                ans = data.loc[data['index'] == idx, 'correct_choice'].values[0]
                ans = chr(ord("A") + ans)
                pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])

                # NOTE(review): writes use `data.loc[idx, ...]` (row label)
                # while reads filter on the 'index' column — this assumes the
                # dataframe label equals the 'index' column; confirm.
                if extract_characters_regex(pred) == '':
                    extract_pred = extract_option(
                        model,
                        data.loc[data['index'] == idx].to_dict(orient='records')[0],
                        'LongVideoBench'
                    )
                    data.loc[idx, 'score'] = int(extract_pred == ans)
                else:
                    data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from .image_base import ImageBaseDataset
7
+ from ..smp import *
8
+ from .utils import build_judge, DEBUG_MESSAGE
9
+ from ..utils import track_progress_rich
10
+
11
+
12
def generate_prompt(d):
    """Build the grading prompt sent to the judge LLM for one MIA-Bench sample.

    Args:
        d: Mapping (e.g. a pandas row) with keys 'question',
            'component_weight' and 'components' (stringified Python lists),
            'num_of_component' and 'prediction'.

    Returns:
        str: The full instruction string asking the judge to score each
        component and report a total score out of 10.
    """
    question = d['question']
    # NOTE: dataset-provided fields are trusted; eval() parses the
    # stringified lists stored in the TSV.
    weights = eval(d['component_weight'])
    components = eval(d['components'])
    num_of_component = int(d['num_of_component'])
    response = d['prediction']

    if num_of_component == 1:
        components = f"The first component is: '{components[0]}'. "
        score = f"The first component is worth: {weights[0]} scores. "
    elif num_of_component == 2:
        components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
        score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
    elif num_of_component == 3:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}'. "
        )
        score = (
            "The first, second, and third component is each worth "
            f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
        )
    elif num_of_component == 4:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
        )
        score = (
            "The first, second, third, and fourth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
        )
    elif num_of_component == 5:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
            f"and the fifth component is '{components[4]}'. "
        )
        score = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
        )
    else:
        # Generalized fallback: previously any component count outside 1-5
        # crashed with an UnboundLocalError on `score`. Build the two
        # descriptions generically so the function works for any N >= 1.
        components = ''.join(
            f"Component {i + 1} is: '{c}'. " for i, c in enumerate(components)
        )
        score = (
            "Each component is worth "
            + ', '.join(str(w) for w in weights)
            + " scores respectively. "
        )

    return (
        "Here is an instruction for a multimodal LLM: '"
        f"{question}"
        "'. You need to grade if the response from the model follows each component of the instruction. "
        f"{components}"
        "The response is: '"
        f"{response}"
        "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{score}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )
67
+
68
+
69
def process_rawscore(component_type, raw_score):
    """Parse the judge's first sentence into per-component score fractions.

    The judge is asked to answer in the form
    ``score of component 1: x/2, score of component 2: y/8, total score: z/10.``
    Only the text before the first period is parsed.

    Args:
        component_type: Sequence of component names, aligned with the
            per-component clauses of the sentence.
        raw_score: Raw judge response string.

    Returns:
        dict: component name -> fraction in [0, 1], plus key 'total_score'.

    Raises:
        ValueError / IndexError / ZeroDivisionError: when the response does
        not follow the expected format (callers treat this as best-effort).
    """
    clauses = raw_score.split('.')[0].split(',')
    score_dict = {}
    # All clauses except the last hold per-component scores ("...: x/2").
    for i in range(len(clauses) - 1):
        parts = clauses[i].split(':')[1][1:].split('/')
        score_dict[component_type[i]] = int(parts[0]) / int(parts[1])
    # The last clause is the total ("total score: z/10"). Indexing with [-1]
    # fixes the old NameError (`i` unbound) when the sentence had a single
    # clause and the loop above never ran.
    total_parts = clauses[-1].split(':')[1][1:].split('/')
    score_dict['total_score'] = int(total_parts[0]) / int(total_parts[1])
    return score_dict
80
+
81
+
82
def get_score_dict(data, score_raw):
    """Aggregate raw judge responses into per-category average scores.

    Args:
        data: DataFrame with a stringified 'component_type' list per row.
        score_raw: Sequence of raw judge response strings aligned with rows.

    Returns:
        dict: component type (plus 'total_score') -> mean score over all
        rows whose response could be parsed.
    """
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            # Strip the "['" prefix and "']" suffix of the stringified list,
            # then split on the "', '" separators.
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                cat_score_dict.setdefault(key, []).append(val)
        except Exception:
            # Best effort: a malformed judge response just skips this row.
            # (Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt / SystemExit.)
            pass
    return {key: sum(val) / len(val) for key, val in cat_score_dict.items()}
100
+
101
+
102
class MIABench(ImageBaseDataset):
    """MIA-Bench instruction-following benchmark, scored by an LLM judge."""

    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score the predictions in ``eval_file`` with a multimodal LLM judge.

        Args:
            eval_file: Result file holding a 'prediction' column plus the
                original sample fields used by ``generate_prompt``.
            **judge_kwargs: Judge configuration; 'model' picks the judge
                (default 'gpt-4o'), 'nproc' the request parallelism.

        Returns:
            dict: component type (plus 'total_score') -> mean score, as
            produced by ``get_score_dict``; also dumped as a CSV next to
            the judge-response xlsx.
        """
        judge_name = judge_kwargs.pop('model', 'gpt-4o')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        # `storage` keeps the judged rows, `tmp_file` caches raw responses
        # so an interrupted run can resume.
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 4)  # noqa: F841

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]
            # Prediction files do not carry the images; fetch them from the
            # original dataset TSV and match rows by 'index'.
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)

            # Only query the judge for samples not already cached.
            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp
            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)

        return results
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ from huggingface_hub import snapshot_download
3
+ from ..smp import *
4
+ from .video_concat_dataset import ConcatVideoDataset
5
+ from .video_base import VideoBaseDataset
6
+ from .utils import build_judge, DEBUG_MESSAGE
7
+ from ..utils import track_progress_rich
8
+ import torchvision.transforms as T
9
+ from torchvision import transforms
10
+ from torchvision.transforms.functional import InterpolationMode
11
+ from decord import VideoReader, cpu
12
+ import pandas as pd
13
+ import imageio
14
+ import cv2
15
+ import zipfile
16
+ import os
17
+ import glob
18
+ from .utils.mlvu import *
19
+
20
+ FAIL_MSG = 'Failed to obtain answer via API.'
21
+
22
+
23
class MLVU(ConcatVideoDataset):
    """Composite MLVU benchmark: concatenation of the MCQ and open-ended subsets."""

    def __init__(self, dataset='MLVU', nframe=0, fps=-1):
        self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded']
        # Task names grouped into the two headline metrics:
        # M-Avg (multiple-choice, reported as a percentage) and
        # G-Avg (generation/open-ended, reported on the raw score scale).
        self.type_data_dict = {
            'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'],
            'G-Avg':['sub_scene', 'summary']
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU']

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate both subsets, then append the M-Avg / G-Avg summary rows.

        Returns the per-task accuracy table (also dumped to ``*_acc.csv``).
        """
        result = super().evaluate(eval_file=eval_file, **judge_kwargs)
        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        for key in self.type_data_dict:
            # Add a zeroed summary row, then accumulate the tasks belonging
            # to this metric group.
            result.loc[key] = 0.0
            for name, item in result.iterrows():
                if name in self.type_data_dict[key]:
                    result.loc[key, 'success'] += item['success']
                    result.loc[key, 'overall'] += item['overall']
            if key == 'G-Avg':
                # Open-ended: keep the raw average, rounded to 2 decimals.
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'], 2
                )
            else:
                # MCQ: report percentage accuracy rounded to 1 decimal.
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'] * 100, 1
                )
        result = result.reset_index().rename(columns={'index': 'task'})
        dump(result, score_file)
        return result
57
+
58
+
59
class MLVU_MCQ(VideoBaseDataset):
    """Multiple-choice subset of the MLVU long-video understanding benchmark."""

    MD5 = 'bb5c37e7cf8d43fc9a25c23d2b4633f5'
    BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
    SYS = BASE_SYS + 'Based on your observations, select the best option that accurately addresses the question.'
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MLVU_MCQ', nframe=0, fps=-1):
        # task name -> (annotation json, video directory, question type)
        self.type_data_list = {
            'plotQA': ('1_plotQA.json', './MLVU/video/1_plotQA', 'MCQ'),
            'needle': ('2_needle.json', './MLVU/video/2_needle', 'MCQ'),
            'ego': ('3_ego.json', './MLVU/video/3_ego', 'MCQ'),
            'count': ('4_count.json', './MLVU/video/4_count', 'MCQ'),
            'order': ('5_order.json', './MLVU/video/5_order', 'MCQ'),
            'anomaly_reco': ('6_anomaly_reco.json', './MLVU/video/6_anomaly_reco', 'MCQ'),
            'topic_reasoning': ('7_topic_reasoning.json', './MLVU/video/7_topic_reasoning', 'MCQ'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU_MCQ']

    def prepare_dataset(self, dataset_name='MLVU_MCQ', repo_id='MLVU/MVLU'):
        """Download the dataset (HF or ModelScope), build the TSV index, and
        return dict(root=..., data_file=...)."""
        def check_integrity(pth):
            # A cache is valid when the TSV exists with the expected MD5 and
            # every referenced video file is present.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/MLVU"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Flatten all per-task JSON annotations into one TSV.
                # NOTE(review): this closure reads `dataset_path` from the
                # enclosing scope (assigned below, before the call) rather
                # than its own `pth` argument for the json dir — verify
                # intentional.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': v[1],
                            'duration': data['duration'],
                            'video': data['video'],
                            'question': data['question'],
                            'answer': data['answer'],
                            'candidates': data['candidates'],
                        })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')

            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Format one row into (question-with-options, '(X) answer') strings."""
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video (by nframe or fps) and cache
        them as images; returns the frame paths."""
        suffix = line['video'].split('.')[-1]
        video = line['video'].replace(f'.{suffix}','')
        vid_path = osp.join(self.data_root, line['prefix'], line['video'])
        # NOTE(review): `decord` is referenced as a module here although the
        # visible import is `from decord import VideoReader, cpu`; presumably
        # the star import provides it — verify.
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        # NOTE(review): if nframe <= 0 and fps <= 0, `frame_paths`/`indices`
        # stay unbound and the code below raises — confirm callers always set
        # one of them.
        if self.nframe > 0 and self.fps < 0:
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video)
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video, len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # Decode and cache only the missing frames.
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the chat message list; pass the raw video to video LLMs,
        otherwise attach sampled frames as images."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score MCQ predictions (exact matching, optionally with an LLM
        judge fallback) and return the per-dimension rating."""
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            # `model` becomes None whenever the judge cannot be used; the
            # scoring helper then falls back to exact matching.
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # NOTE(review): `res` (the tmp cache) is loaded/filtered but never
            # consulted below — confirm whether resume support was intended.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                # Expand candidates into A/B/C/... columns and rewrite the
                # answer as its option letter for the judging helper.
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    # NOTE(review): `data.loc[idx, ...]` is label-based while
                    # `idx` comes from the 'index' column — this assumes the
                    # frame's row labels equal that column; verify.
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MLVU_MCQ'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
273
+
274
+
275
class MLVU_OpenEnded(VideoBaseDataset):
    """Open-ended (generation) subset of MLVU, scored by a GPT judge."""

    MD5 = 'cee573a3627c6ac434ded704c60511ba'
    BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
    SYS = BASE_SYS + 'Based on your observations, answer the given questions.'
    TYPE = 'Video-VQA'

    def __init__(self, dataset='MLVU_OpenEnded', nframe=0, fps=-1):
        # task name -> (annotation json, video directory, question type)
        self.type_data_list = {
            'sub_scene': ('8_sub_scene.json', './MLVU/video/8_sub_scene', 'VQA'),
            'summary': ('9_summary.json', './MLVU/video/9_summary', 'VQA')
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU_OpenEnded']

    def prepare_dataset(self, dataset_name='MLVU_OpenEnded', repo_id='MLVU/MVLU'):
        """Download the dataset (HF or ModelScope), build the TSV index, and
        return dict(root=..., data_file=...)."""
        def check_integrity(pth):
            # A cache is valid when the TSV exists with the expected MD5 and
            # every referenced video file is present.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/MLVU"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Flatten the per-task JSON annotations into one TSV.
                # NOTE(review): reads `dataset_path` from the enclosing scope
                # for the json dir rather than its own `pth` — verify.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': v[1],
                            'duration': data['duration'],
                            'video': data['video'],
                            'question': data['question'],
                            'answer': data['answer'],
                            # 'scoring_points' only exists for some tasks.
                            'scoring_points': data['scoring_points'] if 'scoring_points' in data else ''
                        })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')

            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return the row's (question, reference answer) as plain strings."""
        question = f"{data['question']}"
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video (by nframe or fps) and cache
        them as images; returns the frame paths."""
        suffix = line['video'].split('.')[-1]
        video = line['video'].replace(f'.{suffix}','')
        vid_path = osp.join(self.data_root, line['prefix'], line['video'])
        # NOTE(review): `decord` module reference relies on a star import;
        # the visible import is `from decord import VideoReader, cpu`.
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        # NOTE(review): `frame_paths`/`indices` stay unbound when nframe <= 0
        # and fps <= 0 — confirm callers always set one of them.
        if self.nframe > 0 and self.fps < 0:
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video)
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video, len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # Decode and cache only the missing frames.
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the chat message list; pass the raw video to video LLMs,
        otherwise attach sampled frames as images."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge open-ended predictions with gpt-4-0125 (forced) and return
        the per-dimension rating."""

        model = judge_kwargs['model'] if 'model' in judge_kwargs else judge_kwargs.setdefault('model', 'gpt-4-0125')
        # The official MLVU open-ended protocol is pinned to gpt-4-0125.
        if model != 'gpt-4-0125':
            print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125')
            judge_kwargs['model'] = 'gpt-4-0125'

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            # Each task type gets its own judge with a dedicated system
            # prompt (both prompts come from utils.mlvu via star import).
            model_dict = {
                'sub_scene': build_judge(system_prompt=system_prompt_sub_scene, **judge_kwargs),
                'summary': build_judge(system_prompt=system_prompt_summary, **judge_kwargs)
            }
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model_dict[line['task_type']], line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume from the tmp cache: skip rows already judged.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    MLVU_OpenEnded_generate,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
            ans = load(tmp_file)
            data = MLVU_OpenEnded_extract(ans, data)
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import snapshot_download
2
+ from ..smp import *
3
+ from .video_base import VideoBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+ from ..utils import track_progress_rich
6
+
7
+
8
+ FAIL_MSG = 'Failed to obtain answer via API.'
9
+
10
+
11
def unwrap_hf_pkl(pth, suffix='.mp4'):
    """Restore the benchmark videos that ship packed inside pickle files.

    Reads every pickle under ``<pth>/video_pkl/`` (in sorted order) and
    writes each contained video blob to ``<pth>/video/<name><suffix>``.
    Does nothing when the target directory already exists.
    """
    pkl_dir = os.path.join(pth, 'video_pkl/')
    out_dir = os.path.join(pth, 'video/')
    archives = sorted(os.path.join(pkl_dir, name) for name in os.listdir(pkl_dir))

    if os.path.exists(out_dir):
        print('The video file already exists.')
        return

    os.makedirs(out_dir, exist_ok=True)
    for archive in archives:
        with open(archive, 'rb') as fin:
            blobs = pickle.load(fin)
        # Each pickle maps video names to raw bytes; dump every entry as a
        # standalone video file.
        for name, payload in blobs.items():
            with open(os.path.join(out_dir, f'{name}{suffix}'), 'wb') as fout:
                fout.write(payload)
    print('The video file has been restored and stored from the pickle file.')
30
+
31
+
32
class MMBenchVideo(VideoBaseDataset):
    """MMBench-Video benchmark: frame-sampled video VQA scored by a GPT judge."""

    MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
    SYS = 'You are an AI assistant responsible for answering questions about videos.'
    # Template used in 'pack' mode: all questions of one video answered in a
    # single JSON reply.
    FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
'I1': 'Answer to Question I1',
'I2': 'Answer to Question I2',
...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""

    # Template used when each question is asked individually.
    FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""

    TYPE = 'Video-VQA'

    def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1):
        super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MMBench-Video']

    def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='opencompass/MMBench-Video'):
        """Download (HF or ModelScope), unpack the pickled videos, and return
        dict(data_file=..., root=...)."""
        def check_integrity(pth):
            # Valid cache: TSV with the expected MD5 and all videos present.
            data_file = osp.join(pth, f'{dataset_name}.tsv')
            if md5(data_file) != self.MD5:
                return False
            data = load(data_file)
            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            # Videos are shipped as pickles; restore them into video/.
            unwrap_hf_pkl(dataset_path)
        self.video_path = osp.join(dataset_path, 'video/')
        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))

    def build_prompt_pack(self, line):
        """Build one message covering ALL questions of a single video (JSON
        answers requested)."""
        if isinstance(line, int):
            assert line < len(self)
            video = self.videos[line]
        elif isinstance(line, pd.Series):
            video = line['video']
        elif isinstance(line, str):
            video = line

        frames = self.save_video_frames(video)
        sub = self.data[self.data['video'] == video]
        sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(len(frames))
        message = [dict(type='text', value=sys_prompt)]
        for im in frames:
            message.append(dict(type='image', value=im))
        nq = len(sub)
        prompt = 'Questions: \n{}\nAnswers: \n'
        # Keyed by the row 'index' so answers can be matched back later.
        qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
        prompt = prompt.format(json.dumps(qs))
        message.append(dict(type='text', value=prompt))
        return message

    def build_prompt_nopack(self, line, video_llm):
        """Build a message for a single question: raw video for video LLMs,
        sampled frames otherwise."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]
        if video_llm:
            question = line['question']
            prefix, video_idx_path = os.path.split(line['video_path'])
            message = [dict(type='text', value=question)]
            message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
            return message
        else:
            frames = self.save_video_frames(line['video'])
            sys_prompt = self.FRAMES_TMPL_NOPACK.format(len(frames))
            message = [dict(type='text', value=sys_prompt)]
            for im in frames:
                message.append(dict(type='image', value=im))
            prompt = 'Question: {}\nAnswer: '.format(line['question'])
            message.append(dict(type='text', value=prompt))
            return message

    def build_prompt(self, line, video_llm):
        # Pack mode is only usable when frames (not raw video) are fed.
        if self.pack and not video_llm:
            return self.build_prompt_pack(line)
        else:
            return self.build_prompt_nopack(line, video_llm)

    @staticmethod
    def remove_side_quote(s, syms=[',', '"', "'"]):
        """Strip surrounding quote/comma characters from a token; returns ''
        if the token is made only of such characters."""
        if np.all([x in syms for x in s]):
            return ''
        while s[0] in syms:
            s = s[1:]
        while s[-1] in syms:
            s = s[:-1]
        return s

    @staticmethod
    def robust_json_load(s):
        """Best-effort parse of the model's JSON answer block; falls back to
        line-by-line 'key: value' recovery. Returns a dict or None."""
        try:
            jsons = list(extract_json_objects(s))
            assert len(jsons) == 1
            return jsons[0]
        except:
            # Recovery path: exactly one '{' in the string — parse the lines
            # after it manually.
            if '{' in s and s.find('{') == s.rfind('{'):
                sub_str = s[s.find('{') + 1:].strip()
                lines = sub_str.split('\n')
                res = {}
                for l in lines:
                    l = l.strip()
                    if ': ' in l:
                        key = l.split(': ')[0].strip()
                        val = l.split(': ')[1].strip()
                        key = MMBenchVideo.remove_side_quote(key)
                        val = MMBenchVideo.remove_side_quote(val)
                        if len(key) and len(val):
                            res[key] = val
                return res
            return None

    def load_pack_answers(self, data_raw):
        """Unpack per-video JSON answers back into per-question rows.

        Returns (meta, vstats): the metadata frame with a 'prediction'
        column, and parse/generation statistics.
        """
        vstats = defaultdict(lambda: 0)
        data = defaultdict(lambda: {})

        for k in data_raw:
            ans = data_raw[k].strip()
            if FAIL_MSG in ans:
                vstats['GEN_FAIL'] += 1
                continue
            res = self.robust_json_load(ans)
            if res is not None:
                data[k] = res
                vstats['PARSE_OK'] += 1
            else:
                vstats['PARSE_FAIL'] += 1

        # return data
        meta = cp.deepcopy(self.data)
        lt = len(meta)
        prediction = []
        for i in range(lt):
            line = meta.iloc[i]
            vid = line['video']
            idx = str(line['index'])
            prediction.append(data[vid][idx] if idx in data[vid] else None)
        meta['prediction'] = prediction
        vstats['VALIDQ'] = len([x for x in prediction if x is not None])
        vstats['INVALIDQ'] = len([x for x in prediction if x is None])
        return meta, vstats

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions with the configured GPT judge and return the
        per-dimension rating dict (also dumped to *_rating.json)."""
        from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
        judge = judge_kwargs['model']
        nproc = judge_kwargs.pop('nproc', 4)

        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')

        model = build_judge(system_prompt=system_prompt, **judge_kwargs)
        assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE

        if not osp.exists(score_file):
            # Resume from the tmp cache, dropping failed judge calls.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if model.fail_msg not in v}

            data = load(eval_file)
            data_un = data[~data['index'].isin(res)]
            data_un = data_un[~pd.isna(data_un['prediction'])]
            lt = len(data_un)
            prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
            indices = [data_un.iloc[i]['index'] for i in range(lt)]

            if len(prompts):
                _ = track_progress_rich(
                    model.generate,
                    prompts,
                    keys=indices,
                    save=tmp_file,
                    nproc=nproc,
                    chunksize=nproc
                )
            score_map = load(tmp_file)
            data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
            rejected = [x for x in score_map.values() if FAIL_MSG in x]
            # Non-integer judge outputs are counted as -1 (invalid).
            data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ import pandas as pd
3
+ from abc import abstractmethod
4
+ from ..smp import *
5
+ from .image_base import ImageBaseDataset
6
+
7
+
8
class MMGenBench(ImageBaseDataset):
    """Inference-only wrapper for MMGenBench.

    The model is asked to produce a Text-to-Image "caption-prompt" for each
    input image; scoring happens outside VLMEvalKit (see the MMGenBench repo).
    """

    # Single shared instruction prompt; the string content is deliberately
    # unindented so the model receives it verbatim.
    prompt_list = [
        """
# Role
You are an expert in the field of image understanding, focusing on the \
understanding of images and generating the image caption-prompt.

# Definition Explanation
image caption-prompt: Refers to the caption or description of an image, \
used to provide to a Text-to-Image model to generate a new image.
Text-to-Image model: Can generate a new image based on the provided image \
caption-prompt, such as stable diffusion 3, flux, and other image generation models.

# Task Description
Generate an image caption-prompt based on the input image.

# Key Points and Requirements
1. Accurately understand the input image and precisely generate an image caption-prompt.
2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \
Text-to-Image model to generate a new image that is as consistent as possible with the input image.
3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model.
4. The generated image caption-prompt should describe the input image in as much \
detail as possible, and it should be between 20 to 60 words.

# Output Format
A string, that is the image caption-prompt. No extra output needed.
"""
    ]
    TYPE = 'GenerateImgPrompt'
    DATASET_URL = {
        'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv',
        'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv',
    }
    # Both splits use the same (and only) prompt.
    PROMPT_MAP = {
        'MMGenBench-Test': prompt_list[0],
        'MMGenBench-Domain': prompt_list[0],
    }
    DATASET_MD5 = {
        'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da",
        'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb",
    }

    def __init__(self, dataset='MMGenBench', **kwargs):
        """Load the dataset and warn that no in-kit evaluation is available."""
        super().__init__(dataset, **kwargs)
        warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')

    def load_data(self, dataset):
        """Load the TSV and inject the fixed generation prompt as `question` when absent."""
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                self.PROMPT_MAP[dataset]
            )] * len(data)
        return data

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    # NOTE(review): @abstractmethod on a concrete class is inert unless the
    # base uses ABCMeta; here it only documents "not implemented" intent.
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Unsupported: MMGenBench must be scored with the external toolkit."""
        warnings.warn('This evaluation method is not supported.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
        return None
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import math
3
+ from urllib.request import urlopen
4
+ from PIL import Image, ImageDraw, ImageFont
5
+ import torchvision.transforms as transforms
6
+
7
+ from vlmeval.dataset.utils import build_judge, levenshtein_distance
8
+ from vlmeval.smp import *
9
+ from .image_base import ImageBaseDataset
10
+
11
+ FAIL_MSG = 'Failed to obtain answer via API.'
12
+
13
+
14
def get_gpt4_ICE():
    """Return the four in-context examples appended to the GPT-4 extraction prompt.

    Each example shows a Question, a free-form Analysis, and the expected
    `Extracted answer:` / `Answer format:` footer the judge must emit.
    The literal text (including the original typos such as 'servife') is part
    of the prompt contract and must not be edited.
    """
    example_1 = """
---
Question: List the primary questions asked about the services in this report.
Analysis: The primary questions asked about the services in the report for The Limes Residential Home are:\n\n
1. Is the service safe?\n
2. Is the service effective?\n
3. Is the service caring?\n
4. Is the service responsive?\n
5. Is the service well-led?
Extracted answer: [
'Is the servife safe?',
'Is the service effective',
'Is the serve caring?',
'Is the service responsive?',
'Is the service well-led?'
]
Answer format: List\n
"""

    example_2 = """
---
Question: How many regulations of the HSCA 2008 are breached in all according to this report?
Analysis: According to the report, the provider breached 10 Health and Social Care Act 2008 (Regulated Activities)
Regulations in total. Here are the specifics:\n\n1. Regulation 13: Safeguarding service users from abuse and
improper treatment\n2. Regulation 12: Safe care and treatment\n3. Regulation 18: Staffing\n4. Regulation 11:
Need for consent\n5. Regulation 10: Dignity and respect\n6. Regulation 9: Person-centred care\n7. Regulation 17:
Good governance\n8. Regulation 18 (CQC Registration Regulations 2009): Notification of other incidents\n9.
Regulation 18: Failure to maintain an accurate and up-to-date care plan\n10. Regulation 11: Failure to implement
the Mental Capacity Act 2005 code of practice effectively\n\nThese breaches involve issues concerning staffing,
safeguarding, medicines management, dignity and respect, consent, care planning, governance, and failure to
notify the CQC of incidents.
Extracted answer: 10
Answer format: Integer\n
"""

    example_3 = """
---
Question: According to the survey that is the percentage of Chinese who are paying more or
about the same attention to politics after Trump's election?
Analysis: The survey provided does not specify the percentage of Chinese individuals specifically who are paying
more or about the same attention to politics after Trump's election. The report focuses primarily on American
demographics and does not include specific details about the Chinese population in relation to this question. If
you need information about a different demographic or a summary of the findings from the American demographic,
I can certainly help with that!
Extracted answer: Not answerable
Answer format: String\n
"""

    example_4 = """
---
Question: How many quotations from male respondent over 50 years old are included in this report?
Analysis: The image you've provided appears to be a screenshot of a document with multiple charts. However, the
text is too small and blurry to read accurately. If you can provide a clearer image or more context, I might be
able to help you with your question.
Extracted answer: Fail to answer
Answer format: String\n
"""

    return [example_1, example_2, example_3, example_4]
74
+
75
+
76
def build_mmlongbench_gpt4_prompt(line):
    """Assemble the GPT-4 answer-extraction prompt for one sample.

    Layout: task description + the four in-context examples + this sample's
    question and free-form model analysis. `line` must expose 'question' and
    'prediction'.
    """
    task_description = """
Given the question and analysis, you are tasked to extract answers with required formats from the free-form analysis.
- Your extracted answers should be one of the following formats: (1) Integer, (2) Float, (3) String and (4) List.
If you find the analysis the question can not be answered from the given documents, type "Not answerable".
Exception: If the analysis only tells you that it can not read/understand the images or documents,
type "Fail to answer".
- Please make your response as concise as possible. Also note that your response should be formatted as below:
```
Extracted answer: [answer]
Answer format: [answer format]
```
Please read the following example, then extract the answer from the model response
and type it at the end of the prompt.\n
"""
    # Build the prompt as a list of parts and join once, instead of repeated
    # string concatenation.
    parts = [task_description]
    parts.extend(get_gpt4_ICE())
    parts.append('---\nQuestion:' + line['question'] + '\n')
    parts.append('Analysis: ' + str(line['prediction']))
    return ''.join(parts)
100
+
101
+
102
def anls_compute(groundtruth, prediction, threshold=0.5):
    """Average Normalized Levenshtein Similarity between two strings.

    Similarities at or below `threshold` are clamped to 0.0, per the
    standard ANLS definition.
    """
    edit_dist = levenshtein_distance(groundtruth, prediction)
    longest = max(len(groundtruth.upper()), len(prediction.upper()))
    normalized = float(edit_dist) / float(longest) if longest != 0 else 0.0
    similarity = 1.0 - normalized
    return 0.0 if similarity <= threshold else similarity
110
+
111
+
112
def is_float_equal(reference, prediction, include_percentage: bool = False, is_close: bool = False) -> bool:
    """Compare two numeric answers, optionally allowing percentage scaling.

    Args:
        reference: ground-truth value; must be float-convertible (a failure
            here propagates, matching the original contract).
        prediction: model answer; non-numeric predictions return False.
        include_percentage: also accept reference/100 and reference*100.
        is_close: additionally accept values within 1% relative tolerance
            (fix: annotated as bool — the original annotation said float).

    Returns:
        True if prediction matches any accepted form of reference.
    """
    def get_precision(gt_ans: float) -> int:
        # Number of decimal digits in the value's string form; defaults to 3.
        precision = 3
        if '.' in str(gt_ans):
            precision = len(str(gt_ans).split('.')[-1])
        return precision

    reference = float(str(reference).strip().rstrip('%').strip())
    try:
        prediction = float(str(prediction).strip().rstrip('%').strip())
    except (ValueError, TypeError):
        # Fix: the original bare `except:` also caught SystemExit /
        # KeyboardInterrupt; only conversion failures should mean "no match".
        return False

    if include_percentage:
        gt_result = [reference / 100, reference, reference * 100]
    else:
        gt_result = [reference]
    for item in gt_result:
        try:
            if is_close:
                if math.isclose(item, prediction, rel_tol=0.01):
                    return True
            # Round both sides to the coarser of their displayed precisions
            # (never fewer than 2 decimals) before exact comparison.
            precision = max(min(get_precision(prediction), get_precision(item)), 2)
            if round(prediction, precision) == round(item, precision):
                return True
        except Exception:
            continue
    return False
140
+
141
+
142
def get_clean_string(s):
    """Normalize an answer string for comparison.

    Lower-cases, removes unit suffixes (miles/mile/million), parenthesized
    asides, surrounding quotes, a leading '$', and a trailing '%'.
    """
    s = str(s).lower().strip()
    # Fix: the original wrote `s.rstrip('mile').strip()` without assigning the
    # result (a no-op), and `rstrip` removes a trailing *character set*, not a
    # suffix. Check the longer suffix first so 'miles' wins over 'mile'.
    for unit in ('miles', 'mile', 'million'):
        if s.endswith(unit):
            s = s[:-len(unit)].strip()
            break
    # remove parenthesis
    s = re.sub(r'\s*\([^)]*\)', '', s).strip()
    # remove quotes
    s = re.sub(r"^['\"]|['\"]$", '', s).strip()
    s = s.strip().lstrip('$').strip()
    s = s.strip().rstrip('%').strip()
    return s
157
+
158
+
159
def is_exact_match(s):
    """Return True when `s` belongs to a category that must match verbatim.

    Verbatim categories (no fuzzy ANLS scoring): URLs, code filenames, page
    references, telephone-like numbers, times of day, ISO-ish dates
    (YYYY-MM-DD / YYYY-MM), and e-mail addresses.
    """
    # Website
    if 'https://' in s:
        return True
    # code file
    if s.endswith(('.py', 'ipynb')):
        return True
    if s.startswith('page'):
        return True
    # time
    if 'a.m.' in s or 'p.m.' in s:
        return True
    verbatim_patterns = (
        r'\b\d+(-\d+|\s\d+)?\b',                              # telephone number
        r'\b\d{4}[-\s]\d{2}[-\s]\d{2}\b',                     # YYYY-MM-DD
        r'\b\d{4}[-\s]\d{2}\b',                               # YYYY-MM
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',    # Email address
    )
    return any(re.fullmatch(pattern, s) for pattern in verbatim_patterns)
185
+
186
+
187
def isfloat(num):
    """Return True if `num` can be converted to float, False otherwise.

    Fix: also catch TypeError — `float(None)` or `float([])` raise TypeError,
    which the original ValueError-only handler let escape and crash the caller.
    """
    try:
        float(num)
        return True
    except (ValueError, TypeError):
        return False
193
+
194
+
195
def get_font():
    """Fetch the SimHei TTF used for page-index labels; fall back on failure.

    Performs network I/O on every call (no caching); any failure — download
    or font parsing — falls back to Pillow's bundled default font at the
    same size, so callers always get a usable font object.
    """
    try:
        truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
        ff = urlopen(truetype_url)
        font = ImageFont.truetype(ff, size=40)
    except Exception as e:
        logging.warning(f'{type(e)}: {e}')
        logging.warning("Fail to download the font. Use the default one.")
        font = ImageFont.load_default(size=40)
    return font
205
+
206
+
207
def frame2img(img_path_list, font, save_path=None, idx_start=0):
    """Stitch page images into one labelled composite image.

    Each page is resized so its longer side is 1120 px, then pages are pasted
    in sequence with an "<IMAGE k>" text header (k starting at `idx_start`)
    and a separating line between consecutive pages.

    NOTE(review): the `if w > h` orientation test below reuses `w`/`h` leaked
    from the LAST iteration of the resize loop, so the stack direction
    (vertical vs horizontal) is decided by the final page's aspect ratio —
    confirm this is intended before refactoring.
    """
    imgs = [Image.open(img_path) for img_path in img_path_list]

    # Resize every page so the longer edge is 1120 px, preserving aspect ratio.
    new_imgs = []
    for img in imgs:
        w, h = img.size
        scale = w / h
        if w > h:
            new_w = 560 * 2
            new_h = int(560 * 2 / scale)
        else:
            new_w = int(560 * 2 * scale)
            new_h = 560 * 2
        img = transforms.functional.resize(img, [new_h, new_w],)
        new_imgs.append(img)
    imgs = new_imgs
    new_w = 0
    new_h = 0
    pad = 40  # vertical space reserved for the "<IMAGE k>" label
    if w > h:
        # Landscape pages: stack vertically, one page per row.
        for im in imgs:
            w, h = im.size
            new_w = max(new_w, w)
            new_h += h + 10 + pad
        new_img = Image.new("RGB", (new_w, new_h), "white")
        draw = ImageDraw.Draw(new_img)
        curr_h = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (0, pad + curr_h))
            draw.text((0, curr_h), f"<IMAGE {idx+idx_start}>", font=font, fill="black")
            if idx + 1 < len(imgs):
                draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
            curr_h += h + 10 + pad
    else:
        # Portrait pages: lay out horizontally, one page per column.
        for im in imgs:
            w, h = im.size
            new_w += w + 10
            new_h = max(new_h, h)
        new_h += pad
        new_img = Image.new('RGB', (new_w, new_h), 'white')
        draw = ImageDraw.Draw(new_img)
        curr_w = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (curr_w, pad))
            draw.text((curr_w, 0), f"<IMAGE {idx+idx_start}>", font=font, fill='black')
            if idx + 1 < len(imgs):
                draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
            curr_w += w + 10

    if save_path is not None:
        new_img.save(save_path)

    return new_img
262
+
263
+
264
def concat_images(image_list, max_concat=1, column_num=1):
    """Group page-image paths into at most `max_concat` composite images.

    Two modes:
    * column_num == -1: let `frame2img` lay out labelled pages, raising
      `max_concat` until each batch holds at most 20 pages.
    * otherwise: paste raw pages into a plain grid with `column_num` columns
      (column_num == 1 gives a single vertical strip).

    Returns a list of PIL images (one per batch).
    NOTE(review): grid mode sizes the canvas from the FIRST page's
    width/height, so pages of differing sizes may overlap or leave gaps.
    """
    concatenated_images = []
    if column_num == -1:
        MAX_COLUMN_NUM = 20
        max_concat = 1
        # Grow max_concat until every batch has <= MAX_COLUMN_NUM pages.
        while len(image_list) / max_concat > MAX_COLUMN_NUM:
            max_concat += 1
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = image_list[i:i + interval]
            concatenated_image = frame2img(batch_images, font=get_font(), idx_start=i)
            concatenated_images.append(concatenated_image)
    else:
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = [Image.open(filename) for filename in image_list[i:i + interval]]
            if column_num == 1:
                total_height = batch_images[0].height * len(batch_images)
            else:
                total_height = batch_images[0].height * ((len(batch_images) - 1) // column_num + 1)
            concatenated_image = Image.new('RGB', (batch_images[0].width * column_num, total_height), 'white')

            # Row-major paste: advance x per page, wrap to a new row every
            # column_num pages.
            x_offset, y_offset = 0, 0
            for count, image in enumerate(batch_images):
                concatenated_image.paste(image, (x_offset, y_offset))
                x_offset += image.width
                if (count + 1) % column_num == 0:
                    y_offset += image.height
                    x_offset = 0
            concatenated_images.append(concatenated_image)
    return concatenated_images
295
+
296
+
297
def eval_score(gt, pred, answer_type):
    """Score one prediction against ground truth; returns a float in [0, 1].

    Scoring by `answer_type`:
    * 'Int'   — exact integer match.
    * 'Float' — numeric match with percentage scaling and 1% tolerance.
    * 'Str'   — exact match for verbatim categories, ANLS otherwise.
    * other   — list answers: element-wise comparison after sorting; joined
      exact match for numeric/verbatim elements, min ANLS otherwise.

    Fixes vs the original: bare `except:` narrowed to `except Exception:`
    (no longer swallows SystemExit/KeyboardInterrupt) and two leftover debug
    `print` calls removed.
    """
    if answer_type == 'Int':
        try:
            gt, pred = int(gt), int(float(pred))
        except Exception:
            pred = ''
        score = (gt == pred)
    elif answer_type == 'Float':
        try:
            gt = float(get_clean_string(str(gt)))
            pred = float(get_clean_string(str(pred)))
        except Exception:
            pred = ''
        score = is_float_equal(gt, pred, include_percentage=True, is_close=True)
    elif answer_type == 'Str':
        gt = get_clean_string(gt)
        pred = get_clean_string(pred)
        if is_exact_match(gt):
            score = (gt == pred)
        else:
            score = anls_compute(gt, pred)
    else:
        # SECURITY NOTE: `eval` executes arbitrary expressions from the
        # dataset / judge output. These strings are semi-trusted here, but
        # ast.literal_eval would be the safe replacement for list literals.
        if isinstance(gt, str) and gt.startswith('['):
            gt = eval(gt)
        if not isinstance(gt, list):
            gt = [gt]
        if isinstance(pred, str) and pred.startswith('['):
            pred = eval(pred)
        if not isinstance(pred, list):
            pred = [pred]
        if len(gt) != len(pred):
            score = 0.0
        else:
            gt = sorted([get_clean_string(a) for a in gt])
            pred = sorted([get_clean_string(a) for a in pred])
            if isfloat(gt[0]) or is_exact_match(gt[0]):
                score = ('-'.join(gt) == '-'.join(pred))
            else:
                score = min([anls_compute(gt_v, pred_v) for gt_v, pred_v in zip(gt, pred)])

    return float(score)
340
+
341
+
342
def MMLongBench_auxeval(model, line):
    """Ask the judge model to extract a structured answer for one sample.

    Retries up to 5 times with increasing temperature (0.0, 0.5, ...). On the
    first non-failing response, the answer is parsed from between the
    'Extracted answer:' and 'Answer format:' markers.

    Returns:
        dict with keys 'log' (retry trace), 'res' (raw judge output, '' on
        total failure) and 'pred' (extracted answer, '' if parsing failed).
    """
    prompt = build_mmlongbench_gpt4_prompt(line)
    log = ''
    retry = 5

    for i in range(retry):
        prediction = line['prediction']
        res = model.generate(prompt, temperature=i * 0.5)

        if FAIL_MSG in res:
            log += f'Try {i}: output is {prediction}, failed to parse.\n'
        else:
            log += 'Succeed'
            # Take the text after 'Extracted answer:' and before 'Answer format:'.
            try:
                pred = res.split('Answer format:')[0].split('Extracted answer:')[1].strip()
            except:
                pred = ''
            return dict(log=log, res=res, pred=pred)
    log += 'All 5 retries failed.\n'
    return dict(log=log, res='', pred='')
362
+
363
+
364
def get_f1(data):
    """F1 over answerability: recall on ground-truth-answerable rows,
    precision on predicted-answerable rows.

    Expects a DataFrame with 'answer', 'pred' and 'score' columns, where
    'Not answerable' marks the negative class.

    Fix: the original divided unconditionally, raising ZeroDivisionError when
    either slice was empty or when recall + precision == 0; those cases now
    return 0.0.
    """
    gt_pos_data = data[data.apply(lambda k: k['answer'] != 'Not answerable', axis=1)]
    pred_pos_data = data[data.apply(lambda k: k['pred'] != 'Not answerable', axis=1)]
    recall = sum(gt_pos_data['score'].tolist()) / len(gt_pos_data) if len(gt_pos_data) else 0.0
    precision = sum(pred_pos_data['score'].tolist()) / len(pred_pos_data) if len(pred_pos_data) else 0.0
    if recall + precision == 0:
        return 0.0
    return 2 * recall * precision / (recall + precision)
370
+
371
+
372
def MMLongBench_acc(result_file):
    """Score a judged result file and aggregate per-category accuracy.

    Writes per-row scores back into `result_file`, then slices by evidence
    source (text/layout/table/chart/image) and by number of evidence pages
    (single/multi/unanswerable).

    Returns:
        DataFrame with columns 'category', 'num', 'avg_score'.
    NOTE(review): `eval` is applied to the 'evidence_sources' /
    'evidence_pages' columns — these are trusted dataset fields, but
    ast.literal_eval would be safer. Also note a row may belong to several
    evidence-source slices at once, so the per-category counts can overlap.
    """
    data = load(result_file)
    overall_score = 0.0
    score_list = list()
    for i in range(len(data)):
        item = data.iloc[i]
        # Any scoring failure counts as 0 for that row.
        try:
            score = eval_score(item['answer'], item['pred'], item['answer_format'])
        except:
            score = 0.0
        score_list.append(score)
        overall_score += score

    data['score'] = score_list
    dump(data, result_file)

    data_chart = data[data.apply(lambda k: 'Chart' in eval(k['evidence_sources']), axis=1)]
    data_table = data[data.apply(lambda k: 'Table' in eval(k['evidence_sources']), axis=1)]
    data_image = data[data.apply(lambda k: 'Figure' in eval(k['evidence_sources']), axis=1)]
    data_text = data[data.apply(lambda k: 'Pure-text (Plain-text)' in eval(k['evidence_sources']), axis=1)]
    data_layout = data[data.apply(lambda k: 'Generalized-text (Layout)' in eval(k['evidence_sources']), axis=1)]

    data_single = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 1, axis=1)]
    data_multi = data[data.apply(lambda k: len(eval(k['evidence_pages'])) > 1, axis=1)]
    data_unans = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 0, axis=1)]

    res = dict()
    res['category'] = [
        'overall_f1', 'overall_acc', 'text', 'layout', 'table', 'chart',
        'image', 'single-page', 'multi-page', 'unanswerable'
    ]
    res['num'] = [
        len(data), len(data), len(data_text), len(data_layout), len(data_table),
        len(data_chart), len(data_image), len(data_single), len(data_multi), len(data_unans)
    ]
    res['avg_score'] = [
        get_f1(data),
        overall_score / len(data),
        sum(data_text['score'].tolist()) / len(data_text) if len(data_text) > 0 else 0.0,
        sum(data_layout['score'].tolist()) / len(data_layout) if len(data_layout) > 0 else 0.0,
        sum(data_table['score'].tolist()) / len(data_table) if len(data_table) > 0 else 0.0,
        sum(data_chart['score'].tolist()) / len(data_chart) if len(data_chart) > 0 else 0.0,
        sum(data_image['score'].tolist()) / len(data_image) if len(data_image) > 0 else 0.0,
        sum(data_single['score'].tolist()) / len(data_single) if len(data_single) > 0 else 0.0,
        sum(data_multi['score'].tolist()) / len(data_multi) if len(data_multi) > 0 else 0.0,
        sum(data_unans['score'].tolist()) / len(data_unans) if len(data_unans) > 0 else 0.0,
    ]
    res = pd.DataFrame(res)
    return res
421
+
422
+
423
class MMLongBench(ImageBaseDataset):
    """MMLongBench-DOC: long-document VQA over multi-page PDFs.

    Pages are rendered from PDF (or reused from cache) and, for non-API
    models, concatenated into a small number of composite images according
    to the per-model (concat_num, column_num) settings in SUPPORTED_MODELS.
    """

    TYPE = 'VQA'

    DATASET_URL = {
        'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
    }
    DATASET_MD5 = {
        'MMLongBench_DOC': '9b393e1f4c52718380d50586197eac9b',
    }

    # model name -> (concat_num, column_num) used when building composites;
    # column_num == -1 means "labelled auto layout" (see concat_images).
    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
        'XComposer2_4KHD': (1, 5),
        'XComposer2d5': (1, -1),
    }

    def __init__(self, dataset, **kwargs):
        """Validate the evaluated model name and cache its layout settings.

        Raises:
            AssertionError: if kwargs['model'] is not in SUPPORTED_MODELS.
        """
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on MMLongBench_DOC.".format(model_name))
        super(MMLongBench, self).__init__(dataset)

        # GPT4* models consume pages via the API and skip image concatenation.
        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def dump_image(self, origin_line):
        """Materialize the sample's page images on disk and return their paths.

        If every page image is already cached, PDF parsing is skipped;
        otherwise the base64 PDF in line['image'] is rendered page by page
        (capped at self.max_pages, 144 dpi) via PyMuPDF. For non-API models
        the pages are then merged into composite images.
        """
        os.makedirs(self.img_root, exist_ok=True)
        try:
            import fitz
        except Exception as e:
            logging.critical(f'{type(e)}: {e}')
            logging.critical('Please use `pip install pymupdf` to parse PDF files.')

        line = origin_line.copy()
        line['image_path'] = line['image_path'][:self.max_pages]
        skip_pdf_parse = True
        for im_name in line['image_path']:
            path = osp.join(self.img_root, im_name)
            if not read_ok(path):
                skip_pdf_parse = False
                break

        # Just for being compatible with the zooped loop: zip(line['image'], line['image_path'])
        if skip_pdf_parse:
            line['image'] = line['image_path']
        else:
            pdf_data = base64.b64decode(line['image'])
            pdf_file = io.BytesIO(pdf_data)
            encoded_images = []
            with fitz.open(stream=pdf_file, filetype='pdf') as doc:
                doc = doc[:self.max_pages]
                for page in doc:
                    image = page.get_pixmap(dpi=144)
                    image_file = io.BytesIO(image.tobytes(output='png'))
                    image = Image.open(image_file)
                    encoded_image = encode_image_to_base64(image)
                    encoded_images.append(encoded_image)
            line['image'] = encoded_images
            print('process {}'.format(line['doc_id']))

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            # Derive cache filenames for the composites from the first page's name.
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all_{}.jpg'.format(i)
                    for i in range(len(concatenated_images))
                ]

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    # NOTE(review): declared @classmethod but the first parameter is named
    # `self`; it receives the class object. Also `model` is rebound from the
    # judge *name* (str) to the judge *object* below — confusing but working.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions with GPT-4, score them, and dump a CSV summary.

        NOTE(review): when `tmp_file` holds cached judge answers, `new_results`
        only covers the uncached rows but is zipped against ALL indices
        (`all_inds`) — cached runs appear to misalign/KeyError. Confirm before
        relying on resume-from-pkl.
        """
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
        else:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = list()
                for model, line in tqdm(tups):
                    res = MMLongBench_auxeval(model, line)
                    new_results.append(res)

                log_map, res_map, pred_map = {}, {}, {}
                all_inds = [line['index'] for line in lines]
                for k, v in zip(all_inds, new_results):
                    log_map[k] = v['log']
                    res_map[k] = v['res']
                    pred_map[k] = v['pred']
                data['res'] = [res_map[idx] for idx in data['index']]
                data['log'] = [log_map[idx] for idx in data['index']]
                data['pred'] = [pred_map[idx] for idx in data['index']]
                dump(data, storage)

        score = MMLongBench_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import sympy as sp
4
+ import numpy as np
5
+ from sympy import simplify, Eq, sympify, Pow, pi
6
+ from sympy.parsing.latex import parse_latex
7
+ import sys
8
+ import math
9
+ import os
10
+ import argparse
11
+
12
+ from .image_base import ImageBaseDataset
13
+ from ..utils import track_progress_rich
14
+ from ..smp import load, dump
15
+
16
+
17
+ class AutoScoringJudge:
18
    def __init__(self):
        """Initialize the LaTeX-normalization table and default tolerance."""
        # Map of special symbols to their replacements
        # (presumably applied by a `preprocess` method defined elsewhere in
        # this class — not visible here; commented-out entries were
        # deliberately disabled upstream and are kept for reference).
        self.special_signal_map = {
            "\\left": "",
            "\\right": "",
            "厘米":"",  # drops the Chinese unit suffix for "centimeters"
            # "∶": ":",
            ",": ",",  # full-width comma -> ASCII comma
            "$": "",
            "(":"(",  # full-width parens -> ASCII parens
            ")":")",
            "\\infty":"oo",
            "\\colon ":":",
            # "\\approx": "=",
            # "\\simeq": "=",
            # "\\sim": "=",
            # "^\\prime": "'",
            # "^{\\prime}": "'",
            "+":"+",  # full-width plus -> ASCII plus
            "\\, ": "",
            "\\,":"",
            "^\\circ": "",
            "^{\\circ}": "",
            # "%": "",
        }
        # Symbolic pi parsed from LaTeX, substituted numerically by sympy_sub_pi.
        self.pi = parse_latex("\\pi")
        # MM-Math default precision
        self.precision = 1e-2
46
+
47
+ def trans_greater_sign_to_interval(self, expr:str):
48
+ expr_tmp = expr.split("<")
49
+ return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"
50
+
51
+ def split_by_comma(self, expr: str):
52
+ # Splits expressions by commas outside of brackets
53
+ in_bracket_num = 0
54
+ splitted_expr = []
55
+ start_idx = 0
56
+ for i, char in enumerate(expr):
57
+ if char in ["(", "["]:
58
+ in_bracket_num += 1
59
+ elif char in [")", "]"]:
60
+ in_bracket_num -= 1
61
+ elif char == "," and in_bracket_num == 0:
62
+ splitted_expr.append(expr[start_idx:i].strip())
63
+ start_idx = i + 1
64
+
65
+ if start_idx < len(expr):
66
+ splitted_expr.append(expr[start_idx:].strip())
67
+
68
+ return splitted_expr
69
+
70
+ def trans_plus_minus_sign(self, expr_list: list):
71
+ # Translates plus-minus signs into separate expressions
72
+ new_expr_list = []
73
+ for expr in expr_list:
74
+ if "\\pm" in expr:
75
+ new_expr_list.append(expr.replace("\\pm", "+"))
76
+ new_expr_list.append(expr.replace("\\pm", "-"))
77
+ else:
78
+ new_expr_list.append(expr)
79
+
80
+ return new_expr_list
81
+
82
    def judge(self, expression1, expression2, precision=1e-2):
        """Top-level equivalence check between ground truth and prediction.

        Pipeline: preprocess (defined elsewhere in this class) -> strip
        Chinese text -> rewrite double inequalities as intervals -> split on
        top-level commas -> expand '\\pm' -> greedily pair elements with
        is_equal. `precision` may be a scalar or a per-element list.

        NOTE(review): self.precision is mutated per element as a side channel
        read by is_equal/numerical_equal, and temp_list1/temp_list2/precision
        are mutated while being iterated via a modulo index — behavior is
        order-sensitive; kept verbatim.
        """
        # Judge if two expressions are equal (expression1 is considered as the Ground Truth)
        # Default precision is a list for supporting multiple expressions
        precision = precision if isinstance(precision, list) else [precision]

        try:
            expression1, expression2 = self.preprocess(expression1, expression2)
        except:
            return False
        if expression1 == expression2:
            # print("Exactly equal")
            return True

        # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
        expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1)  # noqa: E501
        expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2)  # noqa: E501
        # Check if two < or > in expression
        if self.is_two_greater_sign(expression1):
            expression1 = self.trans_greater_sign_to_interval(expression1)

        if self.is_two_greater_sign(expression2):
            expression2 = self.trans_greater_sign_to_interval(expression2)

        expression1 = self.split_by_comma(expression1)
        expression2 = self.split_by_comma(expression2)

        temp_list1 = self.trans_plus_minus_sign(expression1)
        temp_list2 = self.trans_plus_minus_sign(expression2)

        # Set up a list for allowed errors
        if len(precision) <= 1:
            precision = precision * len(temp_list1)

        if len(temp_list1) != len(temp_list2):
            return False

        # Check if elements in both lists can be paired and are equal
        idx = -1
        while len(temp_list1) != 0:
            idx = (idx + 1) % len(temp_list1)

            item1 = temp_list1[idx]
            self.precision = precision[idx]

            for item2 in temp_list2:
                if self.is_equal(item1, item2):
                    temp_list1.remove(item1)
                    temp_list2.remove(item2)
                    precision.remove(self.precision)
                    break
            else:
                # If no match was found, return False
                return False

        # If all elements are matched, return True
        return True
138
+
139
+ def is_interval(self, expr):
140
+ # Checks if an expression is an interval
141
+ return expr.startswith(("(", "[")) and expr.endswith((")", "]"))
142
+
143
+ def is_two_greater_sign(self, expr):
144
+ match = re.findall(r'<', expr)
145
+ return len(match) == 2
146
+
147
+ def sympy_sub_pi(self, expression_sympy):
148
+ # Replaces the symbol for pi in sympy expressions with its numerical value
149
+ return expression_sympy.subs(self.pi, math.pi)
150
+
151
+ def is_equal(self, expression1, expression2):
152
+ # Default first expression is ground truth. Check if expressions are equal in different aspects
153
+ if expression1 == expression2 and expression1 != "" and expression2 != "":
154
+ # print("Equivalent natively")
155
+ return True
156
+
157
+ # First check if both are intervals
158
+ if self.is_interval(expression1) and self.is_interval(expression2):
159
+ try:
160
+ if self.interval_equal(expression1, expression2):
161
+ # print("Interval equivalent")
162
+ return True
163
+ except:
164
+ return False
165
+
166
+ # Then check for numerical equality
167
+ try:
168
+ if self.numerical_equal(expression1, expression2):
169
+ # print("Numerically equivalent")
170
+ return True
171
+ except:
172
+ pass
173
+ # Then check if expressions are mathematically equal
174
+ try:
175
+ if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
176
+ # print("Expression equivalent")
177
+ return True
178
+ except:
179
+ pass
180
+
181
+ # Lastly, check for equation equality
182
+ try:
183
+ if self.equation_equal(expression1, expression2):
184
+ # print("Equation equivalent")
185
+ return True
186
+ except:
187
+ pass
188
+
189
+ return False
190
+
191
+ def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
192
+ # Check if two numerical values are equal within an allowed error range
193
+ # Includes possible percentage cases
194
+ reference = float(expression1)
195
+ prediction = float(expression2)
196
+
197
+ if include_percentage:
198
+ gt_result = [reference / 100, reference, reference * 100]
199
+ else:
200
+ gt_result = [reference]
201
+
202
+ for item in gt_result:
203
+ if abs(item - prediction) <= self.precision * 1.01:
204
+ return True
205
+ return False
206
+
207
+ def expression_equal(self, exp1, exp2):
208
+ # Check if two expressions are mathematically equivalent
209
+ # Extract expression and use sympy for equivalence checking
210
+ def extract_expression(expression):
211
+ if "=" in expression:
212
+ expression = expression.split("=")[1]
213
+ return expression.strip()
214
+
215
+ exp1 = extract_expression(exp1)
216
+ exp2 = extract_expression(exp2)
217
+
218
+ exp_too_long = len(exp1) > 300 or len(exp2) > 300
219
+
220
+ expr1_sym = sympify(parse_latex(exp1))
221
+ expr2_sym = sympify(parse_latex(exp2))
222
+ if expr1_sym == expr2_sym:
223
+ return True
224
+ else:
225
+ expr1_sym = self.sympy_sub_pi(expr1_sym)
226
+ expr2_sym = self.sympy_sub_pi(expr2_sym)
227
+
228
+ if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
229
+ (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
230
+ return False
231
+ elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
232
+ try:
233
+ if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
234
+ print("These two numbers cannot be calculated by the current computer for: "
235
+ f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
236
+ return False
237
+ if exp_too_long:
238
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
239
+ return False
240
+ if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
241
+ return True
242
+ else:
243
+ return False
244
+ except:
245
+ return False
246
+ elif exp_too_long:
247
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
248
+ return False
249
+ else:
250
+ try:
251
+ simplified_expr = simplify(expr1_sym - expr2_sym)
252
+ num_value = simplified_expr.evalf()
253
+ return abs(num_value) < 1e-3
254
+ except:
255
+ return False
256
+
257
+ def equation_equal(self, expression1, expression2):
258
+ # Check if two equations are mathematically equivalent
259
+ # Simplify equations and use sympy for equivalence checking
260
+ def simplify_equation(latex_eq):
261
+ lhs, rhs = latex_eq.split('=')
262
+
263
+ lhs_expr = parse_latex(lhs)
264
+ rhs_expr = parse_latex(rhs)
265
+
266
+ equation = Eq(lhs_expr, rhs_expr)
267
+
268
+ simplified_eq = simplify(equation.lhs - equation.rhs)
269
+
270
+ return simplified_eq
271
+
272
+ expr1_sym = simplify_equation(expression1)
273
+ expr2_sym = simplify_equation(expression2)
274
+
275
+ division_result_1 = simplify(expr1_sym / expr2_sym)
276
+ division_result_2 = simplify(expr2_sym / expr1_sym)
277
+
278
+ if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504
279
+ (division_result_2.is_Integer and division_result_2 != 0)):
280
+ return True
281
+ else:
282
+ return False
283
+
284
+ def interval_equal(self, expression1, expression2):
285
+ # Check if two intervals are mathematically equivalent
286
+ def compare_two_interval(inter1, inter2):
287
+ if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
288
+ return False
289
+
290
+ inter1 = inter1.strip('[]()')
291
+ inter2 = inter2.strip('[]()')
292
+
293
+ items_1 = inter1.split(',')
294
+ items_2 = inter2.split(',')
295
+
296
+ for item_1, item_2 in zip(items_1, items_2):
297
+ if not self.expression_equal(item_1, item_2):
298
+ return False
299
+ return True
300
+
301
+ interval1 = expression1
302
+ interval2 = expression2
303
+
304
+ if interval1 == interval2:
305
+ return True
306
+ else:
307
+ inter_list1 = interval1.split("\\cup")
308
+ inter_list2 = interval2.split("\\cup")
309
+
310
+ if len(inter_list1) != len(inter_list2):
311
+ return False
312
+ else:
313
+ for inter1, inter2 in zip(inter_list1, inter_list2):
314
+ if not compare_two_interval(inter1, inter2):
315
+ return False
316
+ return True
317
+
318
+ def preprocess(self, expression1, expression2):
319
+ # Preprocess expressions to extract and replace special symbols
320
+ def extract_boxed_content(latex_str):
321
+ boxed_matches = re.finditer(r'\\boxed{', latex_str)
322
+ results = ""
323
+
324
+ for match in boxed_matches:
325
+ start_index = match.end()
326
+ end_index = start_index
327
+ stack = 1
328
+
329
+ while stack > 0 and end_index < len(latex_str):
330
+ if latex_str[end_index] == '{':
331
+ stack += 1
332
+ elif latex_str[end_index] == '}':
333
+ stack -= 1
334
+ end_index += 1
335
+
336
+ if stack == 0:
337
+ content = latex_str[start_index:end_index - 1]
338
+ results += content + ","
339
+ else:
340
+ raise ValueError("Mismatched braces in LaTeX string.")
341
+
342
+ if results == "":
343
+ last_line_ans = latex_str.strip().split("\n")[-1]
344
+ dollar_pattern = r"\$(.*?)\$"
345
+ answers = re.findall(dollar_pattern, last_line_ans)
346
+
347
+ if answers:
348
+ for ans in answers:
349
+ results += ans + ","
350
+ else:
351
+ results = latex_str
352
+
353
+ return results
354
+
355
+ def sepcial_symbol_replace(expression):
356
+
357
+ expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501
358
+
359
+ expression = re.sub(r"(.+)m$", r"\1", expression)
360
+
361
+ if "\\in " in expression:
362
+ expression = expression.split("\\in ")[1]
363
+
364
+ for signal in self.special_signal_map:
365
+ expression = expression.replace(signal, self.special_signal_map[signal])
366
+
367
+ expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)
368
+
369
+ expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")
370
+
371
+ pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
372
+ expression = re.sub(pattern, r'\1', expression)
373
+
374
+ return expression
375
+
376
+ exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
377
+
378
+ exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)
379
+
380
+ return exp1, exp2
381
+
382
+ def can_compute_power(self, expr):
383
+ # Checks if a power expression can be computed
384
+ if isinstance(expr, Pow):
385
+ base, exp = expr.as_base_exp()
386
+ if base.is_number and exp.is_number:
387
+ MAX_EXP = 1000 # Adjust based on computing environment
388
+ if abs(exp.evalf()) > MAX_EXP:
389
+ return False
390
+ else:
391
+ return True
392
+ else:
393
+ return False
394
+ else:
395
+ return True # Not a power expression, can compute
396
+
397
+
398
class MMMath(ImageBaseDataset):
    """MM-Math free-form math VQA dataset, scored with AutoScoringJudge."""

    TYPE = 'VQA'

    DATASET_URL = {
        'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
    }
    DATASET_MD5 = {
        'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
    }

    @classmethod
    def evaluate(cls, eval_file, **kwargs):
        """Score the predictions in *eval_file* and return a dict of
        accuracies: overall and broken down by difficulty, year, and
        knowledge point (L1/L2). Also writes a ``*_score.json`` next to
        the evaluation file."""
        data = load(eval_file)
        judger = AutoScoringJudge()

        pairs = [dict(expression1=gt, expression2=pred)
                 for gt, pred in zip(data['answer'], data['prediction'])]
        # Judge every (ground-truth, prediction) pair in parallel.
        data['hit'] = track_progress_rich(judger.judge, pairs, nproc=16)
        dump(data, eval_file)

        score = {'overall': np.mean(data['hit'])}
        # Per-category accuracies, in the same section order as before.
        for column, label in (
            ('difficulty', 'Difficulty'),
            ('year', 'Year'),
            ('knowledge_l1', 'Knowledge-L1'),
            ('knowledge_l2', 'Knowledge-L2'),
        ):
            for value in set(data[column]):
                score[f'{label}-{value}'] = np.mean(data[data[column] == value]['hit'])

        score_file = eval_file.replace('.xlsx', '_score.json')
        dump(score, score_file)
        return score
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ from huggingface_hub import snapshot_download
3
+ from ..smp import *
4
+ from .video_base import VideoBaseDataset
5
+ from .utils import build_judge, DEBUG_MESSAGE
6
+ from ..utils import track_progress_rich
7
+ import torchvision.transforms as T
8
+ from torchvision import transforms
9
+ from torchvision.transforms.functional import InterpolationMode
10
+ from decord import VideoReader, cpu
11
+ import imageio
12
+ import cv2
13
+ import zipfile
14
+ import os
15
+ import glob
16
+ from .utils.mvbench import *
17
+
18
+ FAIL_MSG = 'Failed to obtain answer via API.'
19
+
20
+
21
class MVBench(VideoBaseDataset):
    """MVBench multiple-choice video QA benchmark (OpenGVLab/MVBench).

    Each of the 20 task types maps to (annotation json, data prefix inside
    the dataset repo, media type 'video'/'frame', whether the clip is
    bounded by start/end timestamps).
    """

    MD5 = 'fd21d36522cdedd46d84dc46715ad832'
    SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""

    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MVBench', nframe=0, fps=-1):
        # task name -> (annotation file, data prefix, media type, has bound)
        self.type_data_list = {
            'Action Sequence': ('action_sequence.json',
                                'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Action Prediction': ('action_prediction.json',
                                  'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Action Antonym': ('action_antonym.json',
                               'your_data_path/ssv2_video/', 'video', False),
            'Fine-grained Action': ('fine_grained_action.json',
                                    'your_data_path/Moments_in_Time_Raw/videos/', 'video', False),
            'Unexpected Action': ('unexpected_action.json',
                                  'your_data_path/FunQA_test/test/', 'video', False),
            'Object Existence': ('object_existence.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'Object Interaction': ('object_interaction.json',
                                   'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Object Shuffle': ('object_shuffle.json',
                               'your_data_path/perception/videos/', 'video', False),
            'Moving Direction': ('moving_direction.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'Action Localization': ('action_localization.json',
                                    'your_data_path/sta/sta_video/', 'video', True),  # has start & end
            'Scene Transition': ('scene_transition.json',
                                 'your_data_path/scene_qa/video/', 'video', False),
            'Action Count': ('action_count.json',
                             'your_data_path/perception/videos/', 'video', False),
            'Moving Count': ('moving_count.json',
                             'your_data_path/clevrer/video_validation/', 'video', False),
            'Moving Attribute': ('moving_attribute.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'State Change': ('state_change.json',
                             'your_data_path/perception/videos/', 'video', False),
            'Fine-grained Pose': ('fine_grained_pose.json',
                                  'your_data_path/nturgbd/', 'video', False),
            'Character Order': ('character_order.json',
                                'your_data_path/perception/videos/', 'video', False),
            'Egocentric Navigation': ('egocentric_navigation.json',
                                      'your_data_path/vlnqa/', 'video', False),
            'Episodic Reasoning': ('episodic_reasoning.json',
                                   'your_data_path/tvqa/frames_fps3_hq/', 'frame', True),  # has start & end, read frame
            'Counterfactual Inference': ('counterfactual_inference.json',
                                         'your_data_path/clevrer/video_validation/', 'video', False),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MVBench']

    def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'):
        """Download the dataset if not cached, build the index tsv, and
        return ``dict(root=..., data_file=...)``."""
        def check_integrity(pth):
            # Cached copy is valid when the tsv exists, matches the known
            # MD5, and every referenced media file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')
            if not os.path.exists(data_file):
                return False
            if md5(data_file) != self.MD5:
                return False
            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = 'modelscope/MVBench'

        cache_path = get_cache_path(repo_id, branch='main')
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def unzip_hf_zip(pth):
                # Extract every zip shipped under video/ in place.
                pth = os.path.join(pth, 'video/')
                for filename in os.listdir(pth):
                    if filename.endswith('.zip'):
                        zip_path = os.path.join(pth, filename)
                        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                            zip_ref.extractall(pth)

            def generate_tsv(pth):
                # Build the flat tsv index from the per-task annotation jsons.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(pth, 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])):
                                self.data_list.append({
                                    'task_type': k,
                                    'prefix': v[1].replace('your_data_path', 'video'),
                                    'data_type': v[2],
                                    'bound': v[3],
                                    'start': data['start'] if 'start' in data.keys() else None,
                                    'end': data['end'] if 'end' in data.keys() else None,
                                    'video': data['video'],
                                    'question': data['question'],
                                    'answer': data['answer'],
                                    'candidates': data['candidates']
                                })
                            else:
                                print(
                                    'NTURGB-D zip file is removed according to MVBench, you can view it at '
                                    'https://huggingface.co/datasets/OpenGVLab/MVBench for detailed reason.'
                                )
                                raise Exception(
                                    f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist"
                                )

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            def move_files(pth):
                # Flatten video/data0613/<a>/<b>/* into video/<a>/<b>/*.
                src_folder = os.path.join(pth, 'video/data0613')
                if not os.path.exists(src_folder):
                    return
                for subdir in os.listdir(src_folder):
                    subdir_path = os.path.join(src_folder, subdir)
                    if not os.path.isdir(subdir_path):
                        continue
                    for subsubdir in os.listdir(subdir_path):
                        subsubdir_path = os.path.join(subdir_path, subsubdir)
                        if not os.path.isdir(subsubdir_path):
                            continue
                        for item in os.listdir(subsubdir_path):
                            item_path = os.path.join(subsubdir_path, item)
                            target_folder = os.path.join(pth, 'video', subdir, subsubdir)
                            if not os.path.exists(target_folder):
                                os.makedirs(target_folder)
                            target_path = os.path.join(target_folder, item)
                            try:
                                shutil.move(item_path, target_path)
                            except Exception as e:
                                print(f"Error moving {item_path} to {target_path}: {e}")

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master')
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            unzip_hf_zip(dataset_path)
            move_files(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        # Per-media-type frame readers.
        self.decord_method = {
            'video': self.read_video,
            'gif': self.read_gif,
            'frame': self.read_frame,
        }

        self.nframe = 8
        self.frame_fps = 3

        # Stacks PIL frames and converts them into a single torch tensor.
        self.transform = T.Compose([
            Stack(),
            ToTorchFormatTensor()
        ])

        return dict(root=dataset_path, data_file=data_file)

    def get_index(self, bound, fps, max_frame, first_idx=0):
        """Uniformly sample ``self.num_segments`` frame indices, optionally
        constrained to a (start, end) second bound."""
        if bound:
            start, end = bound[0], bound[1]
        else:
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        # Take the midpoint of each segment.
        return np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])

    def read_video(self, video_path, bound=None):
        """Decode sampled frames from a video file into a torch tensor."""
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        images_group = [Image.fromarray(vr[i].asnumpy()) for i in frame_indices]
        return self.transform(images_group)

    def read_gif(self, video_path, bound=None, fps=25):
        """Decode sampled frames from a gif into a torch tensor."""
        gif = imageio.get_reader(video_path)
        max_frame = len(gif) - 1
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        images_group = []
        for index, frame in enumerate(gif):
            if index in frame_indices:
                images_group.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)))
        return self.transform(images_group)

    def read_frame(self, video_path, bound=None, fps=3):
        """Read pre-extracted jpg frames (00001.jpg, ...) from a folder."""
        max_frame = len(os.listdir(video_path))
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=1)  # frame_idx starts from 1
        images_group = [
            Image.open(os.path.join(video_path, f'{i:05d}.jpg'))
            for i in frame_indices
        ]
        return self.transform(images_group)

    def save_video_frames(self, imgs, video_name, frames):
        """Dump the sampled frame tensor to per-frame image files; cached
        on disk, returns the list of frame paths."""
        frame_paths = self.frame_paths(video_name)
        if not np.all([osp.exists(p) for p in frame_paths]):
            block_size = imgs.size(0) // frames
            split_tensors = torch.split(imgs, block_size)
            to_pil = transforms.ToPILImage()
            for tensor, pth in zip(split_tensors, frame_paths):
                if not osp.exists(pth):
                    to_pil(tensor).save(pth)
        return frame_paths

    def qa_template(self, data):
        """Format the question with lettered options and return
        (question, lettered answer)."""
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def load_into_video_and_process(self, line):
        """Convert the sample's media (gif/webm/frame folder) into an mp4,
        trimmed to its start/end bound when present; return its path."""
        try:
            from moviepy.editor import VideoFileClip, ImageSequenceClip
        except:
            raise ImportError(
                'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"'
            )
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])

        if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']:
            processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4')
            if not os.path.exists(processed_video_path):
                # Transcode GIF / webm into mp4 via MoviePy.
                gif_clip = VideoFileClip(video_path)
                gif_clip.write_videofile(processed_video_path, codec='libx264')
                gif_clip.close()
        elif line['data_type'] in ['frame']:
            input_images = os.path.join(video_path, '*.jpg')
            processed_video_path = f'{video_path}.mp4'
            if not os.path.exists(processed_video_path):
                # Assemble the frame folder into an mp4.
                image_files = sorted(glob.glob(input_images))
                image_clip = ImageSequenceClip(image_files, fps=self.frame_fps)
                image_clip.write_videofile(processed_video_path, codec='libx264')
                image_clip.close()
        else:
            processed_video_path = video_path

        if line['bound']:
            # Cut the clip down to [start, min(end, duration)].
            base_name, suffix = os.path.splitext(processed_video_path)
            output_video_path = f'{base_name}_processed{suffix}'
            if not os.path.exists(output_video_path):
                video_clip = VideoFileClip(processed_video_path)
                clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration))
                clip.write_videofile(output_video_path)
                clip.close()
        else:
            output_video_path = processed_video_path

        return output_video_path

    def save_video_into_images(self, line):
        """Sample frames for one sample (honouring its optional bound) and
        cache them to disk; return the frame file paths."""
        bound = (line['start'], line['end']) if line['bound'] else None
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        decord_method = self.decord_method[line['data_type']]
        self.num_segments = self.nframe
        torch_imgs = decord_method(video_path, bound)
        return self.save_video_frames(torch_imgs, line['video'], self.num_segments)

    def build_prompt(self, line, video_llm):
        """Build the multimodal prompt for one sample; *video_llm* switches
        between passing the video file and passing sampled frames."""
        if self.fps > 0:
            raise ValueError('MVBench does not support fps setting, please transfer to MVBench_MP4!')
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        if video_llm:
            new_video_path = self.load_into_video_and_process(line)
            message.append(dict(type='video', value=new_video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        message.append(dict(type='text', value='Best option:(', role='assistant'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score an xlsx of predictions (LLM judge or exact matching) and
        dump per-dimension ratings."""
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            # Fall back to exact matching when no working judge is available.
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data_un['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                # Expand candidates into lettered columns for the judge.
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MVBench'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
+ return rating
424
+
425
+
426
+ class MVBench_MP4(VideoBaseDataset):
427
+
428
+ MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5'
429
+ SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
430
+ the detail and movement of objects, and the action and pose of persons. \
431
+ Based on your observations, select the best option that accurately addresses the question.
432
+ """
433
+ TYPE = 'Video-MCQ'
434
+
435
+ def __init__(self, dataset='MVBench_MP4', nframe=0, fps=-1):
436
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
437
+
438
+ @classmethod
439
+ def supported_datasets(cls):
440
+ return ['MVBench_MP4']
441
+
442
+ def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'):
443
+ def check_integrity(pth):
444
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
445
+
446
+ if not os.path.exists(data_file):
447
+ return False
448
+
449
+ if md5(data_file) != self.MP4_MD5:
450
+ return False
451
+
452
+ data = load(data_file)
453
+ for idx, item in data.iterrows():
454
+ if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
455
+ return False
456
+ return True
457
+
458
+ if modelscope_flag_set():
459
+ repo_id = 'modelscope/MVBench'
460
+
461
+ cache_path = get_cache_path(repo_id, branch='video')
462
+ if cache_path is not None and check_integrity(cache_path):
463
+ dataset_path = cache_path
464
+ else:
465
+ def generate_tsv(pth):
466
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
467
+ if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5:
468
+ return
469
+ json_data_path = os.path.join(dataset_path, 'test.json')
470
+ json_data = load(json_data_path)
471
+ root_data_dict = json_data['root']
472
+ self.data_list = []
473
+ for k, v in json_data['meta'].items():
474
+ for item in v:
475
+ self.data_list.append({
476
+ 'task_type': k,
477
+ 'prefix': root_data_dict[k],
478
+ 'video': item['video'],
479
+ 'question': item['question'],
480
+ 'answer': item['answer'],
481
+ 'candidates': item['candidates']
482
+ })
483
+ data_df = pd.DataFrame(self.data_list)
484
+ data_df = data_df.assign(index=range(len(data_df)))
485
+ data_df.to_csv(data_file, sep='\t', index=False)
486
+
487
+ if modelscope_flag_set():
488
+ from modelscope import dataset_snapshot_download
489
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video')
490
+ else:
491
+ hf_token = os.environ.get('HUGGINGFACE_TOKEN')
492
+ huggingface_hub.login(hf_token)
493
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video')
494
+ generate_tsv(dataset_path)
495
+
496
+ data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
497
+
498
+ # transform
499
+ self.transform = T.Compose([
500
+ Stack(),
501
+ ToTorchFormatTensor()
502
+ ])
503
+
504
+ return dict(root=dataset_path, data_file=data_file)
505
+
506
+ def qa_template(self, data):
507
+ question = f"Question: {data['question']}\n"
508
+ question += 'Options:\n'
509
+ answer = data['answer']
510
+ answer_idx = -1
511
+ for idx, c in enumerate(eval(data['candidates'])):
512
+ question += f"({chr(ord('A') + idx)}) {c}\n"
513
+ if c == answer:
514
+ answer_idx = idx
515
+ question = question.rstrip()
516
+ answer = f"({chr(ord('A') + answer_idx)}) {answer}"
517
+ return question, answer
518
+
519
+ def get_index_by_frame(self, max_frame):
520
+ seg_size = float(max_frame) / self.num_segments
521
+ frame_indices = np.array([
522
+ int((seg_size / 2) + np.round(seg_size * idx))
523
+ for idx in range(self.num_segments)
524
+ ])
525
+ return frame_indices
526
+
527
+ def get_index_by_fps(self, vid, fps):
528
+ total_frames = len(vid)
529
+ video_fps = vid.get_avg_fps()
530
+ total_duration = total_frames / video_fps
531
+ required_frames = int(total_duration * fps)
532
+ step_size = video_fps / fps
533
+ frame_indices = np.array([int(i * step_size) for i in range(required_frames)])
534
+ self.num_segments = len(frame_indices)
535
+ return frame_indices
536
+
537
    def read_video(self, video_path):
        """Decode *video_path* and return the sampled frames stacked by self.transform.

        Sampling mode: uniform (self.num_segments segments) when self.fps < 0,
        otherwise fixed-rate via get_index_by_fps.
        """
        # Single-threaded CPU decoding keeps decord behaviour deterministic.
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1

        images_group = list()
        if self.fps < 0:
            frame_indices = self.get_index_by_frame(max_frame)
        else:
            # NOTE: get_index_by_fps also updates self.num_segments as a side effect.
            frame_indices = self.get_index_by_fps(vr, self.fps)

        for frame_index in frame_indices:
            img = Image.fromarray(vr[frame_index].asnumpy())
            images_group.append(img)
        torch_imgs = self.transform(images_group)
        return torch_imgs
552
+
553
    def save_video_frames(self, imgs, video_name, frames):
        """Split the stacked frame tensor back into images and cache them on disk.

        Returns the list of per-frame file paths; files that already exist are
        reused, so repeated calls for the same video are cheap.
        """
        if self.fps > 0:
            frame_paths = self.frame_paths_fps(video_name, frames)
        else:
            frame_paths = self.frame_paths(video_name)
        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # imgs stacks `frames` images along dim 0; split it back into
            # per-frame chunks before converting to PIL.
            block_size = imgs.size(0) // frames
            split_tensors = torch.split(imgs, block_size)
            to_pil = transforms.ToPILImage()
            images = [to_pil(arr) for arr in split_tensors]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths
570
+
571
+ def save_video_into_images(self, line):
572
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
573
+ if self.fps <= 0:
574
+ self.num_segments = self.nframe
575
+ else:
576
+ self.num_segments = 0
577
+ torch_imgs = self.read_video(video_path)
578
+ img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
579
+ return img_frame_paths
580
+
581
+ def build_prompt(self, line, video_llm):
582
+ if isinstance(line, int):
583
+ assert line < len(self)
584
+ line = self.data.iloc[line]
585
+
586
+ question, answer = self.qa_template(line)
587
+ message = [dict(type='text', value=self.SYS, role='system')]
588
+ message.append(dict(type='text', value=question))
589
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
590
+ if video_llm:
591
+ message.append(dict(type='video', value=video_path))
592
+ else:
593
+ img_frame_paths = self.save_video_into_images(line)
594
+ for im in img_frame_paths:
595
+ message.append(dict(type='image', value=im))
596
+ message.append(dict(type='text', value='\nOnly give the best option.'))
597
+ message.append(dict(type='text', value='Best option:(', role='assistant'))
598
+ return message
599
+
600
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score an MVBench_MP4 prediction xlsx and return per-dimension ratings.

        NOTE(review): decorated @classmethod but the first parameter is named
        `self` (it actually receives the class); kept as-is for compatibility.
        """

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        # Derived artefact paths: cached judge outputs, final rating, per-row scores.
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            # Fall back to exact matching whenever an LLM judge is unavailable.
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # NOTE(review): `res` (cached judge results minus failures) is built
            # here but never consumed below — confirm whether resume support was
            # intended.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            # Rows that actually received a prediction.
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data_un['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                # Re-format the gold answer as a lettered option, e.g. "(B) text".
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                # Expose options as A/B/C... keys and convert the answer to its letter.
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1  # API-failure marker
                else:
                    # NOTE(review): `data.loc[idx, ...]` addresses the DataFrame
                    # label, while the lookups above match the 'index' column —
                    # assumes the two coincide; confirm.
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MVBench_MP4'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/slidevqa.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import math
3
+ from typing import List
4
+
5
+ from vlmeval.dataset.utils.judge_util import build_judge
6
+ from vlmeval.smp import *
7
+ from .image_base import ImageBaseDataset
8
+ from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
9
+
10
+
11
+ FAIL_MSG = 'Failed to obtain answer via API.'
12
+
13
+
14
def get_f1(gt, pred):
    """Token-level F1 between ground truth and prediction (whitespace tokens)."""
    gt_tokens = gt.strip().split()
    pred_tokens = pred.strip().split()
    if not gt_tokens or not pred_tokens:
        return 0.0

    overlap = sum(1 for tok in pred_tokens if tok in gt_tokens)
    recall = overlap / len(gt_tokens)
    precision = overlap / len(pred_tokens)
    if (recall + precision) <= 1e-4:
        return 0.0
    return 2 * recall * precision / (recall + precision)
23
+
24
+
25
def SlideVQA_acc(result_file):
    """Score a judged SlideVQA result file with ANLS, exact match and token F1.

    Writes per-row 'anls'/'em'/'f1' columns back into *result_file* and
    returns a small summary DataFrame with the dataset-level averages.
    """
    data = load(result_file)
    anls_list, em_list, f1_list = list(), list(), list()
    for i in range(len(data)):
        item = data.iloc[i]
        # NaN ground truth marks unanswerable questions.
        if isinstance(item['answer'], float) and math.isnan(item['answer']):
            item['answer'] = 'Not answerable'

        # Normalise both sides: strip newlines, lowercase.
        item['answer'] = re.sub('\n', '', item['answer']).lower()
        item['pred'] = str(item['pred']).lower()
        anls_score = anls_compute(item['answer'], item['pred'])
        em_score = (item['answer'].strip() == item['pred'].strip())
        f1_score = get_f1(item['answer'], item['pred'])
        anls_list.append(anls_score)
        em_list.append(em_score)
        f1_list.append(f1_score)
        print('---------------------')
        print(item['answer'], item['pred'], anls_score, em_score, f1_score)

    # Persist the per-row metrics alongside the predictions.
    data['anls'] = anls_list
    data['em'] = em_list
    data['f1'] = f1_list
    dump(data, result_file)

    res = dict()
    res['category'], res['num'] = ['anls', 'EM', 'F1'], [len(data), len(data), len(data)]
    res['avg'] = [sum(anls_list) / len(data), sum(em_list) / len(data), sum(f1_list) / len(data)]
    res = pd.DataFrame(res)
    return res
54
+
55
+
56
class SlideVQA(ImageBaseDataset):
    """SlideVQA: VQA over multi-page slide decks.

    Each sample may span many page images; for non-API models the pages are
    stitched into a small number of composite images, using the per-model
    (concat_num, column_num) settings in SUPPORTED_MODELS.
    """

    TYPE = 'VQA'

    DATASET_URL = {
        'SLIDEVQA_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA_MINI.tsv',
        'SLIDEVQA': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA.tsv',
    }
    DATASET_MD5 = {
        'SLIDEVQA_MINI': '6d9a8d8814fa5b7669deb2af3a3208eb',
        'SLIDEVQA': '5e822c2f800e94c1e23badfd478326b6',
    }

    # model name -> (concat_num, column_num) used when stitching pages together.
    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'XComposer2d5': (1, -1),
        'XComposer2_4KHD': (1, -1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
    }

    def __init__(self, dataset, **kwargs):
        """Requires kwargs['model']; only models in SUPPORTED_MODELS are accepted."""
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on SlideVQA.".format(model_name))
        super(SlideVQA, self).__init__(dataset)

        # API models receive raw pages; local models get concatenated composites.
        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def dump_image(self, origin_line):
        """Materialise the slide pages of one row as image files on disk.

        Pages beyond self.max_pages are dropped. For non-API models the pages
        are stitched into composite images via concat_images; returns the list
        of final image paths.
        """
        os.makedirs(self.img_root, exist_ok=True)

        line = origin_line.copy()
        if not isinstance(line['image_path'], List):
            line['image_path'] = [line['image_path']]
        line['image_path'] = line['image_path'][:self.max_pages]

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            # column_num == -1 means "stack everything into one tall image".
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions with an LLM (MMLongBench_auxeval) then score with SlideVQA_acc.

        Fixes over the previous revision: cached judge results in the tmp pkl
        are now merged correctly (results were zipped against the FULL index
        list, mis-aligning rows on resume), partial progress is persisted after
        every item, and the storage xlsx is written even when every item was
        already cached.
        """
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. ')
        else:
            data = load(eval_file)
            judge = build_judge(max_tokens=128, **judge_kwargs)
            lines = [data.iloc[i] for i in range(len(data))]
            all_inds = [line['index'] for line in lines]

            # Resume from previously judged items if a tmp pkl exists.
            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            todo = [(idx, line) for idx, line in zip(all_inds, lines) if idx not in ans]

            for idx, line in tqdm(todo):
                ans[idx] = MMLongBench_auxeval(judge, line)
                # Persist progress so interrupted runs can resume.
                dump(ans, tmp_file)

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            data['pred'] = [ans[idx]['pred'] for idx in data['index']]
            dump(data, storage)

        score = SlideVQA_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
+ logger.info(score)
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ from huggingface_hub import snapshot_download
3
+ from ..smp import *
4
+ from .video_concat_dataset import ConcatVideoDataset
5
+ from .video_base import VideoBaseDataset
6
+ from .utils import build_judge, DEBUG_MESSAGE
7
+ from ..utils import track_progress_rich
8
+ import torchvision.transforms as T
9
+ from torchvision import transforms
10
+ from torchvision.transforms.functional import InterpolationMode
11
+ from decord import VideoReader, cpu
12
+ from .utils.tempcompass import *
13
+
14
+
15
+ FAIL_MSG = 'Failed to obtain answer via API.'
16
+
17
+
18
class TempCompass(ConcatVideoDataset):
    """Umbrella TempCompass dataset: concatenation of the MCQ, captioning and
    yes/no splits, with accuracy aggregated per temporal dimension / task type."""

    def __init__(self, dataset='TempCompass', nframe=0, fps=-1):
        self.DATASET_SETS[dataset] = ['TempCompass_MCQ', 'TempCompass_Captioning', 'TempCompass_YorN']
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass']

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate the concatenated splits, append per-dimension, per-task-type
        and overall accuracy rows, and dump the result to *_acc.csv."""
        result = super().evaluate(eval_file=eval_file, **judge_kwargs)
        ext = eval_file.split('.')[-1]
        result = result.reset_index().rename(columns={'index': 'dim.task_type'})
        score_file = eval_file.replace(f'.{ext}', '_acc.csv')

        # Accumulate success/overall counts per dimension, per task type and overall.
        totals = {}
        for row_idx, row in result.iterrows():
            dim, task_type = row['dim.task_type'].split('. ')
            for bucket_key in (dim, task_type, 'overall'):
                bucket = totals.setdefault(bucket_key, {'success': 0.0, 'overall': 0.0})
                bucket['success'] += row['success']
                bucket['overall'] += row['overall']
            result.loc[row_idx, 'acc'] = round(row['success'] / row['overall'] * 100, 2)

        # Append one aggregate row per accumulated bucket (insertion order preserved).
        for bucket_key, bucket in totals.items():
            result.loc[len(result)] = {
                'dim.task_type': bucket_key,
                'success': bucket['success'],
                'overall': bucket['overall'],
                'acc': round(bucket['success'] / bucket['overall'] * 100, 2)
            }
        dump(result, score_file)
        return result
+ return result
58
+
59
+
60
class TempCompass_MCQ(VideoBaseDataset):
    """Multiple-choice split of TempCompass (multi-choice + caption_matching tasks)."""

    MD5 = '7efbb9e6d9dabacd22daf274852691dd'
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='TempCompass_MCQ', nframe=0, fps=-1):
        # task -> (annotation json, video dir prefix, video file suffix)
        self.type_data_list = {
            'multi-choice': ('multi-choice.json', './videos', '.mp4'),
            'caption_matching': ('caption_matching.json', './videos', '.mp4'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass_MCQ']

    def prepare_dataset(self, dataset_name='TempCompass_MCQ', repo_id='lmms-lab/TempCompass'):
        """Download (HF or ModelScope), unpack and index the dataset.

        Returns dict(root=dataset_path, data_file=<tsv path>).
        """
        def check_integrity(pth):
            # A cached copy is valid only if the tsv matches MD5 and every video exists.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert the per-task parquet shards into plain json once.
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            # First line of 'question' is the stem; the rest are options.
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'suffix': v[2],
                                'video': data['video_id'],
                                'question': data['question'].split('\n')[0],
                                'answer': data['answer'],
                                'dim': data['dim'],
                                'candidates': data['question'].split('\n')[1:],
                            })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return (stem + options joined by newlines, gold answer)."""
        question = data['question'] + '\n' + '\n'.join(eval(data['candidates']))
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video and cache them as images; returns paths."""
        vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        # Fix: use the VideoReader imported at module top — the bare `decord`
        # module name is not imported by this file.
        vid = VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Uniform sampling of self.nframe frames.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(line['video'])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(line['video'], len(indices))
        else:
            # Fix: previously fell through with `frame_paths` unbound (UnboundLocalError).
            raise ValueError('Either nframe (> 0) or fps (> 0) must be set to sample frames.')

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        """Thin wrapper kept for interface parity with sibling datasets."""
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the message list: question, then video path or frame images, then cue."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = []
        message.append(dict(type='text', value=question))
        video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nPlease directly give the best option:'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rate predictions (LLM judge or exact matching) and aggregate per-dimension.

        NOTE(review): declared @classmethod but the first parameter is named
        `self` (it receives the class) — kept for interface compatibility.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: skip items already judged in the tmp pkl.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_mcq,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            # NOTE(review): assumes the DataFrame label equals the 'index'
            # column used as ans keys — confirm upstream ordering.
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
+ return rating
253
+
254
+
255
class TempCompass_Captioning(VideoBaseDataset):
    """Captioning split of TempCompass (free-form caption generation task)."""

    MD5 = '35be9bf2581ea7767f02e9a8f37ae1ab'
    TYPE = 'Video-VQA'

    def __init__(self, dataset='TempCompass_Captioning', nframe=0, fps=-1):
        # task -> (annotation json, video dir prefix, video file suffix)
        self.type_data_list = {
            'captioning': ('captioning.json', './videos', '.mp4'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass_Captioning']

    def prepare_dataset(self, dataset_name='TempCompass_Captioning', repo_id='lmms-lab/TempCompass'):
        """Download (HF or ModelScope), unpack and index the dataset.

        Returns dict(root=dataset_path, data_file=<tsv path>).
        """
        def check_integrity(pth):
            # A cached copy is valid only if the tsv matches MD5 and every video exists.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert the per-task parquet shards into plain json once.
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            # mc_question/mc_answer are used by the judge to verify captions.
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'suffix': v[2],
                                'video': data['video_id'],
                                'question': data['question'],
                                'answer': data['answer'],
                                'dim': data['dim'],
                                'mc_question': data['mc_question'],
                                'mc_answer': data['mc_answer'],
                            })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return (question, gold answer) for a captioning row."""
        question = data['question']
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video and cache them as images; returns paths."""
        vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        # Fix: use the VideoReader imported at module top — the bare `decord`
        # module name is not imported by this file.
        vid = VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Uniform sampling of self.nframe frames.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(line['video'])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(line['video'], len(indices))
        else:
            # Fix: previously fell through with `frame_paths` unbound (UnboundLocalError).
            raise ValueError('Either nframe (> 0) or fps (> 0) must be set to sample frames.')

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        """Thin wrapper kept for interface parity with sibling datasets."""
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the message list: question, then video path or frame images."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = []
        message.append(dict(type='text', value=question))
        video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rate generated captions (LLM judge or exact matching) and aggregate.

        NOTE(review): declared @classmethod but the first parameter is named
        `self` (it receives the class) — kept for interface compatibility.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: skip items already judged in the tmp pkl.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_captioning,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            # NOTE(review): assumes the DataFrame label equals the 'index'
            # column used as ans keys — confirm upstream ordering.
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
+ return rating
447
+
448
+
449
+ class TempCompass_YorN(VideoBaseDataset):
450
+
451
+ MD5 = 'c72c046d7fa0e82c8cd7462f2e844ea8'
452
+ TYPE = 'Video-Y/N'
453
+
454
+ def __init__(self, dataset='TempCompass_YorN', nframe=0, fps=-1):
455
+ self.type_data_list = {
456
+ 'yes_no': ('yes_no.json', './videos', '.mp4'),
457
+ }
458
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
459
+
460
+ @classmethod
461
+ def supported_datasets(cls):
462
+ return ['TempCompass_YorN']
463
+
464
    def prepare_dataset(self, dataset_name='TempCompass_YorN', repo_id='lmms-lab/TempCompass'):
        """Ensure the TempCompass Yes/No data is available locally.

        If a cached snapshot passes the integrity check it is reused;
        otherwise the dataset is downloaded (HuggingFace or ModelScope),
        the parquet splits are converted to JSON, the videos are unzipped,
        and a flat TSV index is generated.

        Returns:
            dict with keys `root` (dataset directory) and `data_file`
            (path of the generated TSV).
        """
        def check_integrity(pth):
            # A cached copy is valid only if the TSV exists, its md5 matches
            # the expected class-level checksum, and every referenced video
            # file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert each task's parquet split to a JSON file (only once).
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                # Extract the bundled videos archive if not already extracted.
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                # Flatten all task JSON files into a single TSV with a fresh
                # integer index; skip if an up-to-date TSV already exists.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    # v looks like (json_file, video_prefix, video_suffix) --
                    # presumably; confirm against the class-level
                    # type_data_list definition.
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': v[1],
                            'suffix': v[2],
                            'video': data['video_id'],
                            # Keep only the first line of the question text.
                            'question': data['question'].split('\n')[0],
                            'answer': data['answer'],
                            'dim': data['dim']
                        })

                # NOTE(review): relies on a module-level `pd` import (only
                # read_parquet imports pandas locally) -- verify it exists.
                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)
532
+
533
+ def qa_template(self, data):
534
+ question = data['question']
535
+ answer = data['answer']
536
+ return question, answer
537
+
538
+ def save_video_frames(self, line):
539
+ vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
540
+ vid = decord.VideoReader(vid_path)
541
+ video_info = {
542
+ 'fps': vid.get_avg_fps(),
543
+ 'n_frames': len(vid),
544
+ }
545
+ if self.nframe > 0 and self.fps < 0:
546
+ step_size = len(vid) / (self.nframe + 1)
547
+ indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
548
+ frame_paths = self.frame_paths(line['video'])
549
+ elif self.fps > 0:
550
+ # not constrained by num_frames, get frames by fps
551
+ total_duration = video_info['n_frames'] / video_info['fps']
552
+ required_frames = int(total_duration * self.fps)
553
+ step_size = video_info['fps'] / self.fps
554
+ indices = [int(i * step_size) for i in range(required_frames)]
555
+ frame_paths = self.frame_paths_fps(line['video'], len(indices))
556
+
557
+ flag = np.all([osp.exists(p) for p in frame_paths])
558
+
559
+ if not flag:
560
+ images = [vid[i].asnumpy() for i in indices]
561
+ images = [Image.fromarray(arr) for arr in images]
562
+ for im, pth in zip(images, frame_paths):
563
+ if not osp.exists(pth):
564
+ im.save(pth)
565
+
566
+ return frame_paths
567
+
568
+ def save_video_into_images(self, line):
569
+ frame_paths = self.save_video_frames(line)
570
+ return frame_paths
571
+
572
+ def build_prompt(self, line, video_llm):
573
+ if isinstance(line, int):
574
+ assert line < len(self)
575
+ line = self.data.iloc[line]
576
+
577
+ question, answer = self.qa_template(line)
578
+ message = []
579
+ message.append(dict(type='text', value=question))
580
+ video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
581
+ if video_llm:
582
+ message.append(dict(type='video', value=video_path))
583
+ else:
584
+ img_frame_paths = self.save_video_into_images(line)
585
+ for im in img_frame_paths:
586
+ message.append(dict(type='image', value=im))
587
+ message.append(dict(type='text', value='\nPlease answer yes or no:'))
588
+ return message
589
+
590
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rate the predictions in `eval_file` and return per-dimension scores.

        NOTE(review): decorated with @classmethod yet the first parameter is
        named `self` (it actually receives the class). No instance state is
        read, so it works either way, but confirm the decorator is intended.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        # Generation settings forwarded to the LLM judge.
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        # Skip rating entirely if a score file already exists.
        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                # NOTE(review): `sys_prompt` is presumably a module-level
                # constant defined elsewhere in this file -- verify.
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume support: drop items already rated in the pickle cache.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_YorN,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
            # Reload the full rating cache and merge scores into the table.
            ans = load(tmp_file)
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_base.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+ from ..smp import *
3
+
4
+
5
class TextBaseDataset:
    """Base class for text-only evaluation datasets.

    Subclasses register downloadable TSVs via ``DATASET_URL`` (and optional
    ``DATASET_MD5`` checksums) and typically override ``build_prompt`` and
    ``evaluate``.
    """
    MODALITY = 'TEXT'
    # dataset name -> download URL of the TSV file
    DATASET_URL = {}
    # dataset name -> expected md5 of the TSV (optional; check skipped if absent)
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', **kwargs):
        self.dataset_name = dataset

        data = self.load_data(dataset)

        # Normalize the index column: keep integer indices as ints when every
        # value parses as one, otherwise fall back to strings.
        data['index'] = [str(x) for x in data['index']]

        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        """Download the dataset TSV if missing/corrupted, then load it.

        Args:
            url: download URL; its basename is used as the local file name.
            file_md5: expected md5 checksum, or None to skip verification.
        """
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        # Large TSVs are converted to a "localized" copy once so later loads
        # are fast; force regeneration after a fresh download.
        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def dump_image(self, line):
        # Text-only datasets carry no images.
        return []

    def display(self, line):
        """Pretty-print one record, given by position or as a record object."""
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, pd.Series) or isinstance(line, dict)
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        url = self.DATASET_URL[dataset]
        # Use .get so datasets registered without a checksum skip verification
        # instead of raising KeyError.
        file_md5 = self.DATASET_MD5.get(dataset)
        return self.prepare_tsv(url, file_md5)

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']

        msgs = []
        msgs.append(dict(type='text', value=question))
        return msgs

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_mcq.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .text_base import TextBaseDataset
2
+ from .utils import build_judge, DEBUG_MESSAGE
3
+ from ..smp import *
4
+
5
+
6
class TextMCQDataset(TextBaseDataset):
    """Text-only multiple-choice QA dataset.

    Builds "Hint / Question / Options" prompts from lettered option columns
    and scores predictions with exact matching or an LLM judge.
    """
    TYPE = 'MCQ'

    # dataset name -> TSV url / md5, filled in by subclasses
    DATASET_URL = {}

    DATASET_MD5 = {}

    def build_prompt(self, line):
        """Build the text prompt for one record (positional int or record)."""
        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']
        # Options live in single-letter columns 'A'..'Z'; skip missing/NaN.
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above. \n'

        msgs = []

        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score the predictions in `eval_file`.

        Defaults to exact matching; when an LLM judge is requested it falls
        back to exact matching if the OpenAI API is unavailable. Returns the
        accuracy dataframe (also dumped next to `eval_file`).
        """
        from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
        # assert dataset is not None
        # TEST splits share answer keys with their corresponding base datasets.
        dataset_map = {
            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
            'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
        }
        dataset = self.dataset_name
        if dataset in dataset_map:
            dataset = dataset_map[dataset]
        nproc = judge_kwargs.pop('nproc', 4)

        # Circular (option-rotating) evaluation is disabled for text MCQ.
        circular = False

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        # Short names used in result-file suffixes.
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        # NOTE(review): pops columns while iterating data.keys(); appears to
        # work with pandas, but confirm the iteration snapshot semantics.
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated question must exist in the dataset.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        if circular:
            data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
        else:
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

        # load split
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        # May have different report acc functions for different datasets
        if 'MMT' in dataset:
            acc = report_acc_MMT(data)
        else:
            acc = report_acc(data)

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(acc, score_file)

        return acc
110
+
111
+
112
class CustomTextMCQDataset(TextMCQDataset):
    """Text MCQ dataset backed by a user-provided local TSV file."""

    def load_data(self, dataset):
        # User-supplied TSVs live directly under the LMU data root.
        tsv_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        # For files above 1 GB, build (or reuse) a localized copy to speed up
        # subsequent loads; FORCE_LOCAL forces regeneration.
        if file_size(tsv_path, 'GB') > 1:
            localized = tsv_path.replace('.tsv', '_local.tsv')
            if not osp.exists(localized) or os.environ.get('FORCE_LOCAL', None):
                from ..tools import LOCALIZE
                LOCALIZE(tsv_path, localized)
            tsv_path = localized
        return load(tsv_path)