1f committed on
Commit 052bf16 · verified · 1 Parent(s): 885ccec

Add files using upload-large-folder tool

Files changed (20)
  1. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py +9 -0
  2. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/cgbench.py +682 -0
  3. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/crpe.py +13 -0
  4. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/hrbench.py +54 -0
  5. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py +49 -0
  6. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/llavabench.py +65 -0
  7. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/logicvista.py +150 -0
  8. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/longvideobench.py +80 -0
  9. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py +171 -0
  10. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathverse.py +193 -0
  11. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathvista.py +164 -0
  12. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mlvu.py +189 -0
  13. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmbench_video.py +70 -0
  14. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py +335 -0
  15. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py +135 -0
  16. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_concat_dataset.py +85 -0
  17. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_dataset_config.py +103 -0
  18. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py +283 -0
  19. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py +174 -0
  20. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/wildvision.py +222 -0
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .judge_util import build_judge, DEBUG_MESSAGE
+ from .multiple_choice import extract_answer_from_item, prefetch_answer
+ from .vqa_eval import levenshtein_distance
+
+
+ __all__ = [
+     'build_judge', 'extract_answer_from_item', 'prefetch_answer',
+     'levenshtein_distance', 'DEBUG_MESSAGE',
+ ]
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/cgbench.py ADDED
@@ -0,0 +1,682 @@
+ from ...smp import *
+ from .multiple_choice import extract_answer_from_item
+ import pandas as pd
+ import numpy as np
+ import re
+
+ FAIL_MSG = "Failed to obtain answer via API."
+
+ frame_tmpl = "frame-{}-of-{}.jpg"
+
+ sys_prompt_open_eval_step_1 = (
+     "You will be provided with a question, a model's prediction, and the ground "
+     "truth answer for this question.\n"
+     "Your task is to judge whether the model's prediction is correct based on the "
+     "meaning of the two texts.\n"
+     "In most cases, this can be done by determining if the meaning of the model's "
+     "prediction is consistent with, or contains, the ground truth answer. However, "
+     "in some cases where the two texts differ, they may represent different "
+     "descriptions of the same visual scene, in which case visual information is "
+     "needed for further judgment.\n"
+     "Therefore, I hope you:\n"
+     "- Output 0, if the model's prediction and the ground truth answer are neither "
+     "consistent nor related by inclusion, with fundamentally different meanings.\n"
+     "- Output 1, if the meaning of the model's prediction and the ground truth "
+     "answer is consistent, or if the model's prediction meaningfully contains the "
+     "ground truth answer.\n"
+     "- Output 2, if the model's prediction and ground truth are not consistent or "
+     "inclusive, but may be different descriptions of the same visual scene, "
+     "requiring visual information for further judgment.\n"
+     "Only output the answer in the following format:\n\n"
+     '```json\n{"result": choice}\n```\n\n'
+     "The choice is either 0, 1, or 2 as specified above."
+ )
+
+ sys_prompt_open_eval_step_2 = (
+     "You will be provided with a question, a model's prediction, and sampled "
+     "frames from the clue intervals related to this question.\n"
+     "Your task is to determine whether the model has answered the question "
+     "correctly based on the visual information provided.\n"
+     "Therefore, I hope you:\n"
+     "- Output 0, if the model's prediction does not correctly answer the question.\n"
+     "- Output 1, if the model's prediction correctly answers the question.\n"
+     "Only output the answer in the following format, without extra "
+     "explanation:\n\n"
+     '```json\n{"result": choice}\n```\n\n'
+     "The choice is either 0 or 1 as specified above."
+ )
+
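+ # Two-step open-ended evaluation: step 1 judges the prediction against the ground
+ # truth from text alone; predictions marked 2 (visual evidence needed) are
+ # re-judged in step 2 with frames sampled from the clue intervals.
+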
+ DURATIONS = ["0 ~ 10", "10 ~ 20", "20 ~ 30", "30 ~ 40", "40 ~ 50", "50 ~ 60", "60+"]
+
+ DOMAINS = [
+     "Life Record",
+     "Music & TV show",
+     "Instruction & Knowledge",
+     "Driving",
+     "Embodied Expert",
+     "Humor/funny",
+     "Electonic/Social Gaming",
+     "Security & Health",
+     "Sports & Exercise",
+     "Special Scenes",
+     "Art & Culture",
+     "GUI",
+     "News",
+     "Animal & Pet",
+ ]
+
+ SUB_CATEGORIES = [
+     "Time Cognition",
+     "Hallucination",
+     "Entity Perception",
+     "2D Spatial Perception",
+     "Time Perception",
+     "Scene Perception",
+     "Text Perception",
+     "Event Cognition",
+     "Entity Cognition",
+     "Text Cognition",
+     "Event Perception",
+     "Scene Cognition",
+ ]
+
+
+ def get_dimention_rating_open_ended(data_path):
+     # Load the data
+     df = load(data_path)
+
+     df = df[df["score"] != -1]
+
+     # Convert seconds to minutes and bin into the duration ranges
+     df["duration_minutes"] = df["duration"] / 60
+     df["duration_range"] = pd.cut(
+         df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+     )
+
+     # Initialize the result dict
+     result = {
+         "overall": 0,
+         "duration": {k: 0 for k in DURATIONS},
+         "domain": {k: 0 for k in DOMAINS},
+         "sub_category": {k: 0 for k in SUB_CATEGORIES},
+     }
+
+     # Overall
+     result["overall"] = round(df["score"].mean(), 4)
+
+     # Duration
+     for dur in DURATIONS:
+         dur_scores = df[df["duration_range"] == dur]["score"]
+         result["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+     # Domain
+     for domain in DOMAINS:
+         domain_scores = df[df["domain"] == domain]["score"]
+         result["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+     # Sub-category
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_scores = df[df["sub_category"] == sub_cat]["score"]
+         result["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+     return result
+
+
+ def get_dimention_rating_mcq_grouding(data_path):
+
+     # Load the data
+     df = load(data_path)
+
+     # df.loc[(df['task_mode'] == 'miou') & (df['score'] == -1), 'score'] = 0
+
+     df = df[df["score"] != -1]
+
+     # Convert seconds to minutes and bin into the duration ranges
+     df["duration_minutes"] = df["duration"] / 60
+     df["duration_range"] = pd.cut(
+         df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+     )
+
+     # Initialize the result dict
+     result = {
+         metric: {
+             "overall": 0,
+             "duration": {k: 0 for k in DURATIONS},
+             "domain": {k: 0 for k in DOMAINS},
+             "sub_category": {k: 0 for k in SUB_CATEGORIES},
+         }
+         for metric in ["long_acc", "clue_acc", "miou", "CRR", "acc@iou", "rec@iou"]
+     }
+
+     # Compute the base metrics
+     for metric in ["long_acc", "clue_acc", "miou"]:
+         metric_df = df[df["task_mode"] == metric]
+
+         # Overall
+         result[metric]["overall"] = round(metric_df["score"].mean(), 4)
+
+         # Duration
+         for dur in DURATIONS:
+             dur_scores = metric_df[metric_df["duration_range"] == dur]["score"]
+             result[metric]["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+         # Domain
+         for domain in DOMAINS:
+             domain_scores = metric_df[metric_df["domain"] == domain]["score"]
+             result[metric]["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+         # Sub-category
+         for sub_cat in SUB_CATEGORIES:
+             sub_cat_scores = metric_df[metric_df["sub_category"] == sub_cat]["score"]
+             result[metric]["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+     # Compute the composite metric CRR
+     def calculate_crr(scores):
+         long_acc = scores[scores["task_mode"] == "long_acc"]["score"].mean()
+         clue_acc = scores[scores["task_mode"] == "clue_acc"]["score"].mean()
+         return round(min(long_acc, clue_acc) / clue_acc, 4) if clue_acc != 0 else 0
+
+     # Overall CRR
+     result["CRR"]["overall"] = calculate_crr(df)
+
+     # Duration CRR
+     for dur in DURATIONS:
+         dur_df = df[df["duration_range"] == dur]
+         result["CRR"]["duration"][dur] = calculate_crr(dur_df)
+
+     # Domain CRR
+     for domain in DOMAINS:
+         domain_df = df[df["domain"] == domain]
+         result["CRR"]["domain"][domain] = calculate_crr(domain_df)
+
+     # Sub-category CRR
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_df = df[df["sub_category"] == sub_cat]
+         result["CRR"]["sub_category"][sub_cat] = calculate_crr(sub_cat_df)
+
+     # Compute acc@iou
+     def calculate_acc_at_iou_threshold(scores, threshold):
+
+         miou_qids = set(scores[scores["task_mode"] == "miou"]["qid"])
+
+         long_acc_qids = set(scores[scores["task_mode"] == "long_acc"]["qid"])
+
+         valid_qids = miou_qids & long_acc_qids
+
+         miou_positive = set(scores[(scores["task_mode"] == "miou") & (scores["score"] > threshold)]["qid"])
+
+         long_acc_positive = scores[
+             (scores["task_mode"] == "long_acc") & (scores["qid"].isin(miou_positive)) & (scores["score"] == 1)
+         ]
+
+         acc_at_iou_threshold = len(long_acc_positive) / len(valid_qids) if len(valid_qids) > 0 else 0
+         return round(acc_at_iou_threshold, 4)
+
+     def calculate_acc_at_iou(scores):
+         thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+         acc_at_iou_values = [calculate_acc_at_iou_threshold(scores, threshold) for threshold in thresholds]
+
+         return round(sum(acc_at_iou_values) / len(acc_at_iou_values), 4)
+
+     # Overall acc@iou
+     result["acc@iou"]["overall"] = calculate_acc_at_iou(df)
+
+     # Duration acc@iou
+     for dur in DURATIONS:
+         dur_df = df[df["duration_range"] == dur]
+         result["acc@iou"]["duration"][dur] = calculate_acc_at_iou(dur_df)
+
+     # Domain acc@iou
+     for domain in DOMAINS:
+         domain_df = df[df["domain"] == domain]
+         result["acc@iou"]["domain"][domain] = calculate_acc_at_iou(domain_df)
+
+     # Sub-category acc@iou
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_df = df[df["sub_category"] == sub_cat]
+         result["acc@iou"]["sub_category"][sub_cat] = calculate_acc_at_iou(sub_cat_df)
+
+     # Compute rec@iou
+     def calculate_rec_at_iou_threshold(scores, threshold):
+         # Collect all rows of the miou task
+         miou_scores = scores[scores["task_mode"] == "miou"]
+
+         # Count the miou scores above the threshold
+         miou_positive = miou_scores[miou_scores["score"] > threshold]
+
+         # Compute the ratio
+         rec_at_iou = len(miou_positive) / len(miou_scores) if len(miou_scores) > 0 else 0
+
+         return round(rec_at_iou, 4)
+
+     def calculate_rec_at_iou(scores):
+         thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+         rec_at_iou_values = [calculate_rec_at_iou_threshold(scores, threshold) for threshold in thresholds]
+
+         return round(sum(rec_at_iou_values) / len(rec_at_iou_values), 4)
+
+     # Overall rec@iou
+     result["rec@iou"]["overall"] = calculate_rec_at_iou(df)
+
+     # Duration rec@iou
+     for dur in DURATIONS:
+         dur_df = df[df["duration_range"] == dur]
+         result["rec@iou"]["duration"][dur] = calculate_rec_at_iou(dur_df)
+
+     # Domain rec@iou
+     for domain in DOMAINS:
+         domain_df = df[df["domain"] == domain]
+         result["rec@iou"]["domain"][domain] = calculate_rec_at_iou(domain_df)
+
+     # Sub-category rec@iou
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_df = df[df["sub_category"] == sub_cat]
+         result["rec@iou"]["sub_category"][sub_cat] = calculate_rec_at_iou(sub_cat_df)
+
+     return result
+
+
+ def milliseconds_to_seconds(milliseconds):
+     return milliseconds / 1000
+
+
+ def sample_frames_clue_average(clues_time_intervals, frame_num, fps):
+     # Convert clue time intervals to frame intervals and compute each clue's duration
+     clues_frame_intervals = [(round(interval[0] * fps), round(interval[1] * fps)) for interval in clues_time_intervals]
+     clue_durations = [interval[1] - interval[0] for interval in clues_frame_intervals]
+     total_duration = sum(clue_durations)
+     # If frame_num is at least the total number of frames, return all frames directly
+     if frame_num >= total_duration:
+         return [frame for interval in clues_frame_intervals for frame in range(interval[0], interval[1])]
+     frames_per_clue = [int(frame_num * (duration / total_duration)) for duration in clue_durations]
+     frame_indices = []
+     for i, (interval, num_frames) in enumerate(zip(clues_frame_intervals, frames_per_clue)):
+         num_frames = max(1, num_frames)
+         seg_size = (interval[1] - interval[0]) / num_frames
+         clue_frame_indices = [int(interval[0] + seg_size / 2 + seg_size * idx) for idx in range(num_frames)]
+         frame_indices.extend(clue_frame_indices)
+     return frame_indices
+
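+ # Illustrative example (hypothetical values): clues [(0.0, 2.0), (4.0, 5.0)] at
+ # fps=1 with frame_num=2 allot one frame per clue, each taken at the center of
+ # its segment, giving indices [1, 4].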
+
+ def merge_intervals(intervals):
+     """
+     Merge overlapping intervals in a list.
+     Assumes each interval is a list [start, end].
+     """
+     if not intervals:
+         return []
+
+     # Sort intervals by start time
+     intervals.sort(key=lambda x: x[0])
+
+     merged = [intervals[0]]
+
+     for current in intervals[1:]:
+         last_merged = merged[-1]
+
+         # Check if there is an overlap
+         if current[0] <= last_merged[1]:
+             # Merge the current interval with the last one
+             last_merged[1] = max(last_merged[1], current[1])
+         else:
+             # No overlap, add current interval
+             merged.append(current)
+
+     return merged
+
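+ # e.g., merge_intervals([[1, 4], [3, 6], [8, 9]]) -> [[1, 6], [8, 9]]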
+
+ def calculate_intervals_iou(intervals1, intervals2):
+     """
+     Calculate the IoU of two lists of intervals.
+     Each list contains intervals represented as [start, end].
+     """
+     # Merge overlapping intervals in both lists
+     merged1 = merge_intervals(intervals1)
+     merged2 = merge_intervals(intervals2)
+
+     # Calculate total length of intervals for both lists
+     def total_length(merged_intervals):
+         return sum(end - start for start, end in merged_intervals)
+
+     length1 = total_length(merged1)
+     length2 = total_length(merged2)
+
+     # Calculate intersection length
+     intersection_length = 0
+     for interval1 in merged1:
+         for interval2 in merged2:
+             intersection_start = max(interval1[0], interval2[0])
+             intersection_end = min(interval1[1], interval2[1])
+             intersection_length += max(0, intersection_end - intersection_start)
+     # Calculate union length
+     union_length = length1 + length2 - intersection_length
+     # IoU is intersection divided by union
+     iou = intersection_length / union_length if union_length > 0 else 0
+     return iou
+
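+ # e.g., calculate_intervals_iou([[0, 10]], [[5, 15]]) -> 5 / 15 ≈ 0.3333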
+
+ def post_process(response, right_answer, task_mode, duration):
+     result = -1
+
+     if response:
+         # Locate the ```json and ``` markers
+         json_start = response.find("```json")
+         json_end = response.find("```", json_start + len("```json"))
+
+         # If JSON content was found
+         if json_start != -1 and json_end != -1:
+             json_content = response[json_start + len("```json"):json_end].strip()
+         else:
+             json_content = ""
+
+         if json_content:
+             if task_mode in ["long_acc", "clue_acc"]:
+                 json_content = re.sub(r"(?<=:\s)([A-Za-z_]\w*)", r'"\1"', json_content)
+
+             try:
+                 model_result = json.loads(json_content)["result"]
+
+                 if task_mode in ["long_acc", "clue_acc"]:
+                     result = 1 if right_answer == model_result else 0
+                 elif task_mode == "miou":
+                     if not isinstance(model_result, list):
+                         return -1
+                     if not isinstance(model_result[0], list):
+                         model_result = [model_result]
+
+                     need_duration = all(interval[0] <= 1 and interval[1] <= 1 for interval in model_result)
+
+                     if need_duration:
+                         model_result = [[interval[0] * duration, interval[1] * duration] for interval in model_result]
+
+                     right_answer = eval(right_answer)
+
+                     result = calculate_intervals_iou(right_answer, model_result)
+
+             except Exception as e:
+                 print(f"Error in parsing JSON: {e}, {json_content}")
+
+     if result == -1:
+         if task_mode in ["long_acc", "clue_acc"]:
+             # Look for capital letters A-H and treat them as the model's answer
+             matches = re.findall(r"\b[A-H]\b", response)
+             if matches:
+                 result = 1 if right_answer in matches else 0
+         elif task_mode == "miou":
+             # Extract all real numbers and pair them up
+             numbers = re.findall(r"-?\d+\.?\d*", response)
+             if len(numbers) < 2:
+                 result = -1
+             else:
+                 if len(numbers) % 2 != 0:
+                     numbers = numbers[:-1]
+                 model_result = [[float(numbers[i]), float(numbers[i + 1])] for i in range(0, len(numbers), 2)]
+
+                 if type(right_answer) is str:
+                     right_answer = eval(right_answer)
+
+                 result = calculate_intervals_iou(right_answer, model_result)
+
+     return result
+
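+ # Illustrative example (hypothetical values): for task_mode='long_acc', a response
+ # of '```json\n{"result": B}\n```' is first quoted to {"result": "B"}, then scored
+ # 1 when right_answer == 'B'; without a JSON block, a bare capital A-H is matched.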
+
+ def get_timestampes(frame_indices, fps):
+     seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices))
+     timestamps = ", ".join(seconds)
+     return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format(
+         frame_num=len(frame_indices), timestamps=timestamps
+     )
+
+
+ def post_process_open(response):
+     model_result = -1
+
+     if response and response != FAIL_MSG:
+         json_start = response.find("```json")
+         json_end = response.find("```", json_start + len("```json"))
+
+         # If JSON content was found
+         if json_start != -1 and json_end != -1:
+             json_content = response[json_start + len("```json"):json_end].strip()
+         else:
+             json_content = ""
+
+         if json_content:
+             try:
+                 model_result = json.loads(json_content)["result"]
+             except Exception as e:
+                 print(f"Error in parsing JSON: {e}, {json_content}")
+
+     if model_result == -1:
+         model_result = response
+
+     return model_result
+
+
+ def post_process_eval_open(response, step):
+
+     model_result = -1
+
+     if response and response != FAIL_MSG:
+
+         json_start = response.find("```json")
+         json_end = response.find("```", json_start + len("```json"))
+
+         if json_start != -1 and json_end != -1:
+             json_content = response[json_start + len("```json"):json_end].strip()
+         else:
+             json_content = ""
+
+         if json_content:
+             try:
+                 model_result = json.loads(json_content)["result"]
+             except Exception as e:
+                 print(f"Error in parsing JSON: {e}, {json_content}")
+                 return -1
+     if model_result == -1:
+         if step == 1:
+             match = re.search(r"[012]", response)
+             if match:
+                 model_result = int(match.group())
+         else:
+             match = re.search(r"[01]", response)
+             if match:
+                 model_result = int(match.group())
+
+     return model_result
+
+
+ def eval_open_first(model, line):
+
+     user_prompt = ""
+
+     user_prompt += f"Question: {line['question']}\n\n"
+
+     user_prompt += f"The ground truth answer is '{line['answer']}'\n\n"
+
+     user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+     result = model.generate(user_prompt)
+
+     return result
+
+
+ def save_step_1_steps(data, step_1_results):
+
+     # Process all results
+     data["step_1_result"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 1))
+
+     # Conditional update: verdicts of -1/0/1 are final; 2 means step 2 is needed
+     mask = data["step_1_result"].isin([-1, 0, 1])
+     data.loc[mask, "step_2_result"] = data.loc[mask, "step_1_result"]
+     data.loc[mask, "score"] = data.loc[mask, "step_1_result"]
+
+     return data
+
+
+ def eval_open_second(model, line, frame_paths):
+
+     user_prompt = ""
+
+     user_prompt += f"Question: {line['question']}\n\n"
+
+     user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+     result = model.generate([user_prompt] + frame_paths)
+
+     return result
+
+
+ def save_step_2_steps(data, step_1_results):
+
+     # Process all results
+     data["score"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 2))
+
+     return data
+
+
+ def clue_frame_paths(clue_frame_root, qid, num_frames=8):
+     frame_root = osp.join(clue_frame_root, str(qid))
+     os.makedirs(frame_root, exist_ok=True)
+     return [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
+
+
+ def save_clue_video_frames(data_root, clue_frame_root, video, uid, clue_intervals=None, num_frames=8, fps=-1):
+
+     if not isinstance(uid, str):
+         uid = str(uid)
+
+     vid_path = osp.join(data_root, video)
+     vid = decord.VideoReader(vid_path)
+     vid_fps = vid.get_avg_fps()
+
+     if clue_intervals is not None:
+         # 1. Merge overlapping intervals
+         merged_intervals = merge_intervals(clue_intervals)
+
+         if num_frames > 0 and fps < 0:
+             # 2. Sample frames uniformly within the clue intervals
+             indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
+             frame_paths = clue_frame_paths(clue_frame_root, uid, len(indices))
+
+             # Save the frames
+             flag = np.all([osp.exists(p) for p in frame_paths])
+             if not flag:
+                 images = [vid[i].asnumpy() for i in indices]
+                 images = [Image.fromarray(arr) for arr in images]
+                 for im, pth in zip(images, frame_paths):
+                     if not osp.exists(pth):
+                         im.save(pth)
+
+             return frame_paths, indices, vid_fps
+
+
+ def get_chunk_number(filename):
+     try:
+         num = filename.split("chunk_")[1].split(".zip")[0]
+         return int(num)
+     except Exception:
+         return float('inf')
+
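+ # e.g., get_chunk_number('videos_chunk_3.zip') -> 3; names without a chunk number
+ # sort last via float('inf').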
+
+ def unzip_hf_zip(pth):
+
+     import zipfile
+
+     target_dir = pth
+
+     if os.path.exists(f"{target_dir}/cg_videos_720p") and os.path.exists(f"{target_dir}/cg_subtitles")\
+             and os.path.exists(f"{target_dir}/cg_clue_videos"):
+         print("all exists")
+         return
+
+     video_zip_files = [
+         os.path.join(target_dir, file)
+         for file in os.listdir(target_dir)
+         if file.endswith(".zip") and file.startswith("video")
+     ]
+
+     video_zip_files = sorted(video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
+
+     videos_temp_zip = os.path.join(target_dir, "videos_merged.zip")
+
+     print("Merging video files ...")
+
+     with open(videos_temp_zip, "wb") as outfile:
+         for video_zip_file in tqdm(video_zip_files, desc="Merging videos"):
+             with open(video_zip_file, "rb") as infile:
+                 outfile.write(infile.read())
+
+     print("Extracting video files...")
+
+     try:
+         with zipfile.ZipFile(videos_temp_zip, "r") as zip_ref:
+
+             total_files = len(zip_ref.namelist())
+
+             for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
+                 zip_ref.extract(file, target_dir)
+
+         print(f"Successfully extracted to {target_dir}")
+     except Exception as e:
+         print(f"Error during extraction: {e}")
+     finally:
+
+         if os.path.exists(videos_temp_zip):
+             os.remove(videos_temp_zip)
+             print("Cleaned up temporary video file")
+
+     clue_video_zip_files = [
+         os.path.join(target_dir, file)
+         for file in os.listdir(target_dir)
+         if file.endswith(".zip") and file.startswith("clue_video")
+     ]
+
+     clue_video_zip_files = sorted(clue_video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
+
+     clue_videos_temp_zip = os.path.join(target_dir, "clue_videos_merged.zip")
+
+     print("Merging clue video files ...")
+
+     with open(clue_videos_temp_zip, "wb") as outfile:
+         for clue_video_zip_file in tqdm(clue_video_zip_files, desc="Merging clue_videos"):
+             with open(clue_video_zip_file, "rb") as infile:
+                 outfile.write(infile.read())
+
+     print("Extracting clue video files...")
+
+     try:
+         with zipfile.ZipFile(clue_videos_temp_zip, "r") as zip_ref:
+
+             total_files = len(zip_ref.namelist())
+
+             for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
+                 zip_ref.extract(file, target_dir)
+
+         print(f"Successfully extracted to {target_dir}")
+     except Exception as e:
+         print(f"Error during extraction: {e}")
+     finally:
+
+         if os.path.exists(clue_videos_temp_zip):
+             os.remove(clue_videos_temp_zip)
+             print("Cleaned up temporary clue video file")
+
+     print("Extracting subtitle files ...")
+
+     subtitles_zip = os.path.join(target_dir, "subtitles.zip")
+
+     try:
+         with zipfile.ZipFile(subtitles_zip, "r") as zip_ref:
+
+             total_files = len(zip_ref.namelist())
+
+             for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
+                 zip_ref.extract(file, target_dir)
+
+         print(f"Successfully extracted to {target_dir}")
+     except Exception as e:
+         print(f"Error during extraction: {e}")
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/crpe.py ADDED
@@ -0,0 +1,13 @@
+ import json
+ import argparse
+ from collections import defaultdict
+
+
+ def is_correct(predict, answer):
+     # predict is the ground-truth answer; answer is the model's prediction
+     if len(answer) == 1:
+         return answer[0] == predict[0]
+     elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']:
+         return answer[0] == predict[0]
+     elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']:
+         return predict[4:].lower() in answer.lower()
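+ # e.g., is_correct('A. cat', 'A') -> True (single-letter predictions are matched on
+ # the first character); free-form predictions are checked by substring against the
+ # ground truth with its first four characters (the option prefix) stripped.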
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/hrbench.py ADDED
@@ -0,0 +1,54 @@
+ from ...smp import *
+ import os
+
+
+ def report_acc_hrbench(df):
+     cycle_group = df.groupby('cycle_category')
+     result_dic = defaultdict(list)
+     avg_dic = defaultdict(int)
+
+     count = 0
+     for key, data_value in cycle_group:
+         count += 1
+         _, resp_dic = hrbench_score(data_value)
+
+         for task_type, accuracy in resp_dic.items():
+             result_dic['cycle'].append(key)
+             result_dic['type'].append(task_type)
+             result_dic['accuracy'].append(accuracy)
+
+             avg_dic[task_type] += accuracy
+     for task_type, accuracy in avg_dic.items():
+         result_dic['cycle'].append('Average')
+         result_dic['type'].append(task_type)
+         result_dic['accuracy'].append(accuracy / count)
+     result_pd = pd.DataFrame(result_dic)
+
+     return result_pd
+
+
+ def hrbench_score(data):
+     ret = defaultdict(list)
+     resp_dic = {}
+     category_list = set(data['category'])
+     score_dict = defaultdict(list)
+
+     for i in range(len(data)):
+         d = data.iloc[i]
+         category = d['category']
+         gpt_score = d['hit']
+         score_dict[category].append(gpt_score)
+         score_dict['all'].append(gpt_score)
+
+     all_acc = np.mean(score_dict['all'])
+     ret['type'].append('all')
+     ret['acc'].append(all_acc)
+     resp_dic['all'] = all_acc
+     for cate in category_list:
+         acc = np.mean(score_dict[cate])
+         ret['type'].append(cate)
+         ret['acc'].append(acc)
+
+         resp_dic[cate] = acc
+
+     return pd.DataFrame(ret), resp_dic
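+ # Note: the 'Average' row in report_acc_hrbench is the mean accuracy over
+ # cycle_category groups, not over individual samples.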
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ from ...smp import load_env
+
+ INTERNAL = os.environ.get('INTERNAL', 0)
+
+
+ def build_judge(**kwargs):
+     from ...api import OpenAIWrapper, SiliconFlowAPI
+     model = kwargs.pop('model', None)
+     kwargs.pop('nproc', None)
+     load_env()
+     LOCAL_LLM = os.environ.get('LOCAL_LLM', None)
+     if LOCAL_LLM is None:
+         model_map = {
+             'gpt-4-turbo': 'gpt-4-1106-preview',
+             'gpt-4-0613': 'gpt-4-0613',
+             'gpt-4-0125': 'gpt-4-0125-preview',
+             'gpt-4-0409': 'gpt-4-turbo-2024-04-09',
+             'chatgpt-1106': 'gpt-3.5-turbo-1106',
+             'chatgpt-0125': 'gpt-3.5-turbo-0125',
+             'gpt-4o': 'gpt-4o-2024-05-13',
+             'gpt-4o-0806': 'gpt-4o-2024-08-06',
+             'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
+             'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct',
+             'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct',
+             'deepseek': 'deepseek-ai/DeepSeek-V2.5',
+         }
+         model_version = model_map[model]
+     else:
+         model_version = LOCAL_LLM
+
+     if model in ['qwen-7b', 'qwen-72b', 'deepseek']:
+         model = SiliconFlowAPI(model_version, **kwargs)
+     else:
+         model = OpenAIWrapper(model_version, **kwargs)
+     return model
+
+
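+ # Typical usage: judge = build_judge(model='gpt-4o-mini', temperature=0); any
+ # extra kwargs are forwarded to the underlying API wrapper.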
+ DEBUG_MESSAGE = """
+ To debug the OpenAI API, you can try the following script in Python:
+ ```python
+ from vlmeval.api import OpenAIWrapper
+ model = OpenAIWrapper('gpt-4o', verbose=True)
+ msgs = [dict(type='text', value='Hello!')]
+ code, answer, resp = model.generate_inner(msgs)
+ print(code, answer, resp)
+ ```
+ You can see the specific error if the API call fails.
+ """
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/llavabench.py ADDED
@@ -0,0 +1,65 @@
+ import numpy as np
+ import pandas as pd
+ from ...smp import *
+
+ rule_dict = {
+     'llava_bench_conv': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'},  # noqa: E501
+     'llava_bench_detail': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'},  # noqa: E501
+     'llava_bench_complex': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}  # noqa: E501
+ }
+
+
+ def get_eval(judge, content):
+     return judge.generate(content)
+
+
+ def parse_score(review):
+     logger = get_logger('Evaluation')
+     try:
+         score_pair = review.split('\n')[0]
+         score_pair = score_pair.replace(',', ' ')
+         sp = score_pair.split(' ')
+         if len(sp) == 2:
+             return [float(sp[0]), float(sp[1])]
+         else:
+             logger.error(f'Failed to parse score pair from review: {review}')
+             return [-1, -1]
+     except Exception as e:
+         logger.error(f'{e}: failed to parse review: {review}')
+         return [-1, -1]
+
+
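+ # e.g., parse_score('8 7\nBoth responses are helpful ...') -> [8.0, 7.0]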
+ def build_prompt(line):
+     cap_str = line['caption']
+     question = line['question']
+     ans1 = line['gpt4_ans']
+     ans2 = line['prediction']
+     category = 'llava_bench_' + line['category']
+     rule = rule_dict[category]
+     role, prompt = rule['role'], rule['prompt']
+
+     content = (f'[Context]\n{cap_str}\n\n'
+                f'[Question]\n{question}\n\n'
+                f'[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n'
+                f'[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n'
+                f'[System]\n{prompt}\n\n')
+     return content
+
+
+ def LLaVABench_atomeval(model, prompt):
+     review = get_eval(model, prompt)
+     scores = parse_score(review)
+     return scores
+
+
+ def LLaVABench_score(data):
+     cates = ['overall'] + list(set(data['category']))
+     ret = defaultdict(list)
+
+     for c in cates:
+         ret['split'].append(c)
+         sub = data[data['category'] == c] if c != 'overall' else data
+         ret['Relative Score (main)'].append(np.mean(sub['score']) / np.mean(sub['gpt4_score']) * 100)
+         ret['VLM Score'].append(np.mean(sub['score']) * 10)
+         ret['GPT4 Score'].append(np.mean(sub['gpt4_score']) * 10)
+     return pd.DataFrame(ret)
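+ # 'Relative Score (main)' follows the LLaVA-Bench convention: the VLM's mean judge
+ # score expressed as a percentage of the GPT-4 reference's mean judge score.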
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/logicvista.py ADDED
@@ -0,0 +1,150 @@
+ import pandas as pd
+
+ # from colorama import Fore, Back, Style
+ from ...smp import *
+
+
+ FAIL_MSG = 'Failed to obtain answer via API.'
+
+
+ def build_prompt_logicvista(line):
+     question = line['question']
+     prediction = str(line['prediction'])
+     tmpl = (
+         "You are an information extractor that extracts multiple-choice letter answer choices "
+         "from a paragraph that contains the answer choice and sometimes an explanation of why that "
+         "choice is correct for the given question.\n"
+         "What letter did the following answer choose? If the answer did not select a letter answer choice, "
+         "first try to infer the answer based off the given choices.\n"
+         "If it does not seem like the given answer corresponds to an answer choice OR if there is no selected answer, please just respond with Z.\n"
+         "Make sure you answer with ONLY the letters chosen.\n"
+         'Example 1: \n'
+         'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
+         'Answer: <start>\na cute teddy bear\n<end>\nYour output: A\n'
+         'Example 2: \n'
+         'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
+         'Answer: <start>\nSpider\n<end>\nYour output: Z\n'
+         'Example 3: \n'
+         'Question: <start>\nWhich figure is a rotation of the object?\n<end>\n'
+         'Answer: <start>\nThe figure on the right, labeled "D," is a rotation of the object shown in the top left corner.\n<end>\nYour output: D\n'
+         'Example 4: \n'
+         'Question: <start>\nWhich of the boxes comes next in the sequence? Select from A-E\n<end>\n'
+         'Answer: <start>\nThe sequence of the boxes is A, B, C, D, E.\n<end>\nYour output: ABCDE\n'
+         'Example 5: \n'
+         'Question: <start>\n{}\n<end>\nAnswer: <start>\n{}\n<end>\nYour output: '
+     )
+
+     return tmpl.format(question, prediction)
+
+
+ def LogicVista_auxeval(model, line):
+     prompt = build_prompt_logicvista(line)
+     print(prompt)
+     log = ''
+     retry = 5
+
+     for i in range(retry):
+         prediction = line['prediction']
+         res = model.generate(prompt, temperature=i * 0.5)
+         answer = line['answer'].split(", ")
+         for j in range(0, len(answer)):
+             answer[j] = answer[j].lower()
+         answer.sort()
+         answer = ''.join(answer)
+
+         if FAIL_MSG in res:
+             log += f'Try {i}: output is {prediction}, failed to parse.\n'
+         elif not res.isupper() or not res.isalpha():
+             log += f'Try {i}: output is {prediction}, failed to parse.\n'
+         else:
+             log += 'Succeed'
+             hit = 0
+             extracted = [alpha.lower() for alpha in res]
+             extracted.sort()
+             extracted = ''.join(extracted)
+             if extracted == answer:
+                 hit = 1
+             return dict(log=log, res=res, hit=hit)
+     log += 'All 5 retries failed.\n'
+     return dict(log=log, res='', hit=0)
+
+
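+ # Multi-letter answers are compared order-insensitively: a response of 'CA' and a
+ # ground truth of 'A, C' both normalize to 'ac' before the comparison above.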
+ cat = ["diagram", "ocr", "patterns", "graphs", "tables", "3d shapes", "puzzles", "sequences", "physics"]
+
+
+ def evaluate_logicvista(file_path):
+     df = pd.read_excel(file_path)
+
+     tot = defaultdict(lambda: 0)
+     hit = defaultdict(lambda: 0)
+     acc = defaultdict(lambda: 0)
+
+     lt = len(df)
+     skill_list = []
+
+     df_tot = df
+
+     df_inductive = df[df["skill"].str.contains("inductive")]
+     df_deductive = df[df["skill"].str.contains("deductive")]
+     df_numerical = df[df["skill"].str.contains("numerical")]
+     df_spatial = df[df["skill"].str.contains("spatial")]
+     df_mechanical = df[df["skill"].str.contains("mechanical")]
+
+     tot_correct = df_tot["hit"].sum()
+     tot_acc = (tot_correct / df_tot.shape[0]) * 100
+     tot['Overall'] = df_tot.shape[0]
+     hit['Overall'] = tot_correct
+     acc['Overall'] = tot_acc
+
+     inductive_correct = df_inductive["hit"].sum()
+     inductive_acc = (inductive_correct / df_inductive.shape[0]) * 100
+
+     tot["inductive"] = df_inductive.shape[0]
+     hit["inductive"] = inductive_correct
+     acc["inductive"] = inductive_acc
+
+     deductive_correct = df_deductive["hit"].sum()
+     deductive_acc = (deductive_correct / df_deductive.shape[0]) * 100
+
+     tot["deductive"] = df_deductive.shape[0]
+     hit["deductive"] = deductive_correct
+     acc["deductive"] = deductive_acc
+
+     numerical_correct = df_numerical["hit"].sum()
+     numerical_acc = (numerical_correct / df_numerical.shape[0]) * 100
+
+     tot["numerical"] = df_numerical.shape[0]
+     hit["numerical"] = numerical_correct
+     acc["numerical"] = numerical_acc
+
+     spatial_correct = df_spatial["hit"].sum()
+     spatial_acc = (spatial_correct / df_spatial.shape[0]) * 100
+
+     tot["spatial"] = df_spatial.shape[0]
+     hit["spatial"] = spatial_correct
+     acc["spatial"] = spatial_acc
+
+     mechanical_correct = df_mechanical["hit"].sum()
+     mechanical_acc = (mechanical_correct / df_mechanical.shape[0]) * 100
+
+     tot["mechanical"] = df_mechanical.shape[0]
+     hit["mechanical"] = mechanical_correct
+     acc["mechanical"] = mechanical_acc
+
+     # capability dimension, the official data json does not contain 'capability' column, so it is now ignored
+     # for i in cat:
+     #     curr = df[df["capability"].str.contains(i.replace(" ", ""))]
+     #     correct = curr["hit"].sum()
+     #     accuracy = (correct / curr.shape[0]) * 100
+     #     tot[i] = curr.shape[0]
+     #     hit[i] = correct
+     #     acc[i] = accuracy
+
+     res = defaultdict(list)
+     for k in tot.keys():
+         res['Task&Skill'].append(k)
+         res['tot'].append(tot[k])
+         res['hit'].append(hit[k])
+         res['acc'].append(acc[k])
+     res = pd.DataFrame(res)
+     return res
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/longvideobench.py ADDED
@@ -0,0 +1,80 @@
+ from ...smp import *
+ from .multiple_choice import extract_answer_from_item
+ import numpy as np
+ import re
+
+ FAIL_MSG = 'Failed to obtain answer via API.'
+
+ DURATIONS = [15, 60, 600, 3600]
+ TASK_CATEGORIES = [
+     "S2E", "S2O", "S2A",
+     "E2O", "O2E", "T2E",
+     "T2O", "T2A", "E3E",
+     "O3O", "SSS", "SOS",
+     "SAA", "T3E", "T3O",
+     "TOS", "TAA"
+ ]
+
+
+ def get_dimension_rating(data_path):
+     data = load(data_path)
+     print(data.iloc[0])
+
+     duration_rating = {k: {} for k in DURATIONS}
+     for duration in DURATIONS + ['overall']:
+         duration_rating[duration] = {
+             'overall': '',
+             'question_category': {k: [] for k in TASK_CATEGORIES}
+         }
+
+     for i in range(len(data)):
+
+         task_ctg = data.iloc[i]['question_category']
+
+         duration = data.iloc[i]['duration_group']
+         duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score'])
+
+         duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score'])
+
+     for duration in DURATIONS + ['overall']:
+         overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}'  # noqa: E501
+         duration_rating[duration]['overall'] = overall_res_dur
+         for task_ctg in TASK_CATEGORIES:
+             task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}'  # noqa: E501
+             duration_rating[duration]['question_category'][task_ctg] = task_res_dur
+
+     return duration_rating
+
+
+ def extract_option(model, input_item, dataset_name):
+     options = input_item['question'].split('\n')[1:]
+     for id, option in enumerate(options):
+         option_id = chr(ord('A') + id) + '.'
+         if option.find(option_id) >= 0:
+             input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n')
+     return extract_answer_from_item(model, input_item, dataset_name)['opt']
+
+
+ def extract_characters_regex(s):
+     s = s.strip()
+     answer_prefixes = [
+         'The best answer is',
+         'The correct answer is',
+         'The answer is',
+         'The answer',
+         'The best option is',
+         'The correct option is',
+         'Best answer:',
+         'Best option:',
+         'Answer:',
+         'Option:',
+     ]
+     for answer_prefix in answer_prefixes:
+         s = s.replace(answer_prefix, '')
+
+     if len(s.split()) > 10 and not re.search('[ABCDE]', s):
+         return ''
+     matches = re.search(r'[ABCDE]', s)
+     if matches is None:
+         return ''
+     return matches[0]
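+ # e.g., extract_characters_regex('The best answer is (B).') -> 'B'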
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py ADDED
@@ -0,0 +1,171 @@
+ from ...smp import *
+ from ...utils import can_infer
+ try:
+     from latex2sympy2 import latex2sympy
+ except Exception as e:
+     logging.critical(f'{type(e)}: {e}')
+     logging.critical('Please install latex2sympy2 by running "pip install latex2sympy2"')
+
+ FAIL_MSG = 'Failed to obtain answer via API.'
+
+
+ def is_equal(asw: str, gt_asw: str) -> bool:
+     if not isinstance(asw, str) or not isinstance(gt_asw, str):
+         print('Warning: input is not string')
+         print(asw, gt_asw)
+     asw = str(asw).lower().strip()
+     gt_asw = str(gt_asw).lower().strip()
+     if gt_asw == asw:
+         return True
+     try:
+         a = eval(gt_asw)
+         b = eval(asw)
+         if abs(a - b) < 1e-6:
+             return True
+     except Exception:
+         pass
+     try:
+         a = latex2sympy(gt_asw)
+         b = latex2sympy(asw)
+         if abs(eval(str(a)) - eval(str(b))) < 1e-6:
+             return True
+         if abs(a - b) < 1e-6:
+             return True
+     except Exception:
+         pass
+     return False
+
+
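+ # e.g., is_equal('0.50', '1/2') -> True via the numeric branch, since
+ # abs(eval('1/2') - eval('0.50')) < 1e-6.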
+ def get_gpt4_ICE():
+     example_1 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: Which number is missing?\n
+ Model response: The number missing in the sequence is 14.\n
+ Extracted answer: 14
+ """
+
+     example_2 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: What is the fraction of females facing the camera?\n
+ Model response: The fraction of females facing the camera is 0.6,
+ which means that six out of ten females in the group are facing the camera.\n
+ Extracted answer: 0.6
+ """
+
+     example_3 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
+ Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
+ Extracted answer: 1.45
+ """
+
+     example_4 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: Between which two years does the line graph saw its maximum peak?\n
+ Model response: The line graph saw its maximum peak between 2007 and 2008.\n
+ Extracted answer: [2007, 2008]
+ """
+
+     example_5 = """
+ Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
+ Question: What fraction of the shape is blue?\n
+ Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
+ Model response: The correct answer is (B) 8/11.\n
+ Extracted answer: B
+ """
+
+     return [example_1, example_2, example_3, example_4, example_5]
+
+
+ def build_mathv_gpt4_prompt(line):
+     task_description = """
+ Please read the following example.
+ Then extract the answer from the model response and type it at the end of the prompt.\n
+ """
+     question = line['question']
+     prediction = str(line['prediction'])
+     prompt = task_description
+     examples = get_gpt4_ICE()
+     for example in examples:
+         prompt += example + '\n'
+     prompt += question + '\n'
+     prompt += 'Model response: ' + prediction + '\n'
+     prompt += 'Extracted answer:'
+     return prompt
+
+
+ def list_to_dict(lst):
+     return {chr(65 + i): val for i, val in enumerate(lst)}
+
+
+ def post_check(line, prefetch=False):
+     res = None
+     ans = line['answer']
+     response = line['prediction'] if prefetch else line['res']
+     try:
+         if len(eval(line['choices'])) > 0:
+             ans = line['answer']
+             choices = list_to_dict(eval(line['choices']))
+             res = can_infer(response, choices)
+             if prefetch:
+                 return res
+         else:
+             res = str(response)
+             ans = str(ans)
+     except ValueError:
+         pass
+
+     if is_equal(res, ans):
+         return res if prefetch else True
+     else:
+         return False
+
+
+ def MATH_V_auxeval(model, line):
+     prompt = build_mathv_gpt4_prompt(line)
+     log = ''
+     retry = 5
+     if post_check(line, prefetch=True):
+         res = post_check(line, prefetch=True)
+         return dict(log='Prefetch succeed', res=res)
+     for i in range(retry):
+         prediction = line['prediction']
+         res = model.generate(prompt, temperature=i * 0.5)
+
+         if FAIL_MSG in res:
+             log += f'Try {i}: output is {prediction}, failed to parse.\n'
+         else:
+             log += 'Succeed'
+             return dict(log=log, res=res)
+     log += 'All 5 retries failed.\n'
+     return dict(log=log, res='')
+
+
+ def MATH_V_acc(result_file):
+     data = load(result_file)
+     tot = defaultdict(lambda: 0)
+     fetch = defaultdict(lambda: 0)
+     hit = defaultdict(lambda: 0)
+     lt = len(data)
+     for i in range(lt):
+         item = data.iloc[i]
+         cate = item['category']
+         tot['Overall'] += 1
+         tot[cate] += 1
+         if item['log'] == 'Prefetch succeed':
+             fetch['Overall'] += 1
+             fetch[cate] += 1
+         if post_check(item, prefetch=False):
+             hit['Overall'] += 1
+             hit[cate] += 1
+
+     res = defaultdict(list)
+     for k in tot.keys():
+         res['Subject'].append(k)
+         res['tot'].append(tot[k])
+         res['prefetch'].append(fetch[k])
+         res['hit'].append(hit[k])
+         res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
+         res['acc'].append(hit[k] / tot[k] * 100)
+     res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
+     return res
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathverse.py ADDED
@@ -0,0 +1,193 @@
1
+ from ...smp import *
2
+ from ...utils import can_infer
3
+
4
+
5
+ FAIL_MSG = 'Failed to obtain answer via API.'
6
+
7
+
8
+ def get_gpt4_extract_ICE():
9
+ example_1 = """
10
+ 1.
11
+ Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
12
+ Extracted Answer: (-2, 1)
13
+ """ # noqa
14
+
15
+ example_2 = """
16
+ 2.
17
+ Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
18
+ Extracted Answer: D
19
+ """ # noqa
20
+
21
+ example_3 = """
22
+ 3.
23
+ Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
24
+ Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
25
+ """ # noqa
26
+
27
+ example_4 = """
28
+ 4.
29
+ Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
30
+ Extracted Answer: null
31
+ """ # noqa
32
+
33
+ example_5 = """
34
+ 5.
35
+ Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
36
+ Extracted answer: 22.3
37
+ """ # noqa
38
+
39
+ example_6 = """
40
+ 6.
41
+ Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
42
+ Extracted answer: f(x) = -x^2 - 2x + 1
43
+ """ # noqa
44
+
45
+ return [example_1, example_2, example_3, example_4, example_5, example_6]
46
+
47
+
48
+ def get_gpt4_score_ICE():
49
+ example_1 = """
50
+ [Question]: Write the set of numbers represented on the number line in interval notation.
51
+ [Standard Answer]: (-2,1]
52
+ [Model_answer] : Extracted Answer: \\((-2, 1)\\)
53
+ Judgement: 0
54
+ """ # noqa
55
+
56
+ example_2 = """
57
+ [Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
58
+ [Standard Answer]: C
59
+ [Model_answer] : B:2\u221a{{3}}
60
+ Judgement: 0
61
+ """ # noqa
62
+
63
+ example_3 = """
64
+ [Question]: Find the domain and range of the function f using interval notation.
65
+ [Standard Answer]: domain: [-4, 0) and range: (-3, 1]
66
+ [Model_answer] : Range: \\((-4, 1]\\)
67
+ Judgement: 0
68
+ """ # noqa
69
+
70
+ example_4 = """
71
+ [Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
72
+ [Standard Answer]: C
73
+ [Model_answer] : null
74
+ Judgement: 0
75
+ """ # noqa
76
+
77
+ return [example_1, example_2, example_3, example_4]
78
+
79
+
80
+ def build_mathverse_gpt4_extract_prompt(line):
81
+ task_description = """
82
+ I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.\n\n
83
+ """ # noqa
84
+ prediction = str(line['prediction'])
85
+ demo_prompt = task_description
86
+ examples = get_gpt4_extract_ICE()
87
+ for example in examples:
88
+ demo_prompt += example + '\n\n'
89
+ test_prompt = f"Model response: '{prediction}'\nExtracted Answer: "
90
+ full_prompt = f'{demo_prompt}7.\n{test_prompt}'
91
+
92
+ return full_prompt
93
+
94
+
95
+ def build_mathverse_gpt4_score_prompt(line):
96
+ task_description = """
97
+ Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
98
+ Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
99
+ If they are consistent, Judement is 1; if they are different, Judement is 0.\n\n
100
+ """ # noqa
101
+ question_for_eval = line['question_for_eval']
102
+ extract = line['extract']
103
+ answer = line['answer']
104
+ demo_prompt = task_description
105
+ examples = get_gpt4_score_ICE()
106
+ for example in examples:
107
+ demo_prompt += example + '\n\n'
108
+ test_prompt = f"""
109
+ [Question]: {question_for_eval}
110
+ [Standard Answer]: {answer}
111
+ [Model_answer] : {extract}
112
+ Judgement:"""
113
+ full_prompt = f'{demo_prompt}{test_prompt}'
114
+
115
+ return full_prompt
116
+
117
+
118
+ def post_check_score(line, prefetch=False):
119
+ ans = str(line['answer']).strip()
120
+ response = str(line['extract']).strip()
121
+
122
+ if response == ans:
123
+ return response if prefetch else True
124
+ else:
125
+ return False
126
+
127
+
128
+ def MathVerse_auxeval_extract(model, line):
129
+ prompt = build_mathverse_gpt4_extract_prompt(line)
130
+ log = ''
131
+ retry = 5
132
+ for i in range(retry):
133
+ prediction = line['prediction']
134
+ res = model.generate(prompt, temperature=i * 0.5)
135
+
136
+ if FAIL_MSG in res:
137
+ log += f'Try {i}: output is {prediction}, failed to parse.\n'
138
+ else:
139
+ log += 'Succeed'
140
+ return dict(log_extract=log, extract=res)
141
+ log += 'All 5 retries failed.\n'
142
+ return dict(log_extract=log, extract='')
143
+
144
+
145
+ def MathVerse_auxeval_score(model, line):
146
+ prompt = build_mathverse_gpt4_score_prompt(line)
147
+ log = ''
148
+ retry = 5
149
+ res = post_check_score(line, prefetch=True)
150
+ if res:
151
+ return dict(log_score='Prefetch succeed', score=True)
152
+ for i in range(retry):
153
+ prediction = line['prediction']
154
+ res = model.generate(prompt, temperature=i * 0.5)
155
+
156
+ if FAIL_MSG in res or res.strip() not in ['0', '1']:
157
+ log += f'Try {i}: output is {prediction}, res is {res}, failed to parse.\n'
158
+ else:
159
+ log += 'Succeed'
160
+ return dict(log_score=log, score=int(res) == 1)
161
+ log += 'All 5 retries failed.\n'
162
+ return dict(log_score=log, score=False)
163
+
164
+
165
+ def MathVerse_acc(result_file):
166
+ df = load(result_file)
167
+
168
+ df['metadata'] = df['metadata'].apply(lambda x: x.replace("'", '"'))
169
+ df['metadata'] = df['metadata'].apply(json.loads)
170
+ df_metadata = pd.json_normalize(df['metadata'])
171
+ df = pd.concat([df.drop('metadata', axis=1), df_metadata], axis=1)
172
+
173
+ subset = list(set(df['problem_version'])) + ['Overall']
174
+
175
+ res = defaultdict(list)
176
+ for p in subset:
177
+ if p != 'Overall':
178
+ sub = df[df['problem_version'] == p]
179
+ else:
180
+ sub = cp.deepcopy(df)
181
+ res['split'].append(p)
182
+ # Overall Acc
183
+ res['Overall'].append(np.mean(sub['score']) * 100)
184
+ # Subject
185
+ subjects = set(df['subject'])
186
+ for k in subjects:
187
+ res[k].append(np.mean(sub[sub['subject'] == k]['score']) * 100)
188
+ # Subfield
189
+ subfields = set(df['subfield'])
190
+ for k in subfields:
191
+ res[k].append(np.mean(sub[sub['subfield'] == k]['score']) * 100)
192
+
193
+ return pd.DataFrame(res)
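
Taken together, the helpers above implement a two-stage judge: the extraction prompt first distills a free-form model response into a short answer, and the scoring prompt then compares that answer with the ground truth. The sketch below shows how the two stages chain over a single record; the `judge` object (anything exposing `.generate(prompt, temperature=...)`, e.g. the result of `build_judge`) and the field values are assumptions for illustration, not part of this file.

# A minimal sketch of the two-stage judging flow (hypothetical `judge` object).
line = {
    'prediction': 'Therefore, the distance d is approximately 22.3 meters.',
    'question_for_eval': 'Find the distance d between Ned and Bart.',
    'answer': '22.3',
}

# Stage 1: extract a short answer from the raw response.
extract_out = MathVerse_auxeval_extract(judge, line)
line['extract'] = extract_out['extract']   # e.g. '22.3'

# Stage 2: judge consistency between the extracted and standard answers.
score_out = MathVerse_auxeval_score(judge, line)
print(score_out['score'])                  # True when the judge outputs '1'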
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathvista.py ADDED
@@ -0,0 +1,164 @@
1
+ from ...smp import *
2
+ from ...utils import can_infer
3
+
4
+
5
+ FAIL_MSG = 'Failed to obtain answer via API.'
6
+
7
+
8
+ def get_gpt4_ICE():
9
+ example_1 = """
10
+ Hint: Please answer the question requiring an integer answer and provide the final value,
11
+ e.g., 1, 2, 3, at the end.\n
12
+ Question: Which number is missing?\n
13
+ Model response: The number missing in the sequence is 14.\n
14
+ Extracted answer: 14
15
+ """
16
+
17
+ example_2 = """
18
+ Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value,
19
+ e.g., 1.2, 1.3, 1.4, at the end.\n
20
+ Question: What is the fraction of females facing the camera?\n
21
+ Model response: The fraction of females facing the camera is 0.6,
22
+ which means that six out of ten females in the group are facing the camera.\n
23
+ Extracted answer: 0.6
24
+ """
25
+
26
+ example_3 = """
27
+ Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value,
28
+ e.g., 1.23, 1.34, 1.45, at the end.\n
29
+ Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
30
+ Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
31
+ Extracted answer: 1.45
32
+ """
33
+
34
+ example_4 = """
35
+ Hint: Please answer the question requiring a Python list as an answer and provide the final list,
36
+ e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.\n
37
+ Question: Between which two years does the line graph saw its maximum peak?\n
38
+ Model response: The line graph saw its maximum peak between 2007 and 2008.\n
39
+ Extracted answer: [2007, 2008]
40
+ """
41
+
42
+ example_5 = """
43
+ Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
44
+ Question: What fraction of the shape is blue?\n
45
+ Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
46
+ Model response: The correct answer is (B) 8/11.\n
47
+ Extracted answer: B
48
+ """
49
+
50
+ return [example_1, example_2, example_3, example_4, example_5]
51
+
52
+
53
+ def build_mathvista_gpt4_prompt(line):
54
+ task_description = """
55
+ Please read the following example.
56
+ Then extract the answer from the model response and type it at the end of the prompt.\n
57
+ """
58
+ question = line['question']
59
+ prediction = str(line['prediction'])
60
+ prompt = task_description
61
+ examples = get_gpt4_ICE()
62
+ for example in examples:
63
+ prompt += example + '\n'
64
+ prompt += question + '\n'
65
+ prompt += 'Model response: ' + prediction + '\n'
66
+ prompt += 'Extracted answer:'
67
+ return prompt
68
+
69
+
70
+ def list_to_dict(lst):
71
+ return {chr(65 + i): val for i, val in enumerate(lst)}
72
+
73
+
74
+ def post_check(line, prefetch=False):
75
+ res = None
76
+ ans = line['answer']
77
+ response = line['prediction'] if prefetch else line['res']
78
+ try:
79
+ if line['question_type'] == 'multi_choice':
80
+ ans = line['answer_option']
81
+ choices = list_to_dict(eval(line['choices']))
82
+ res = can_infer(response, choices)
83
+ if prefetch:
84
+ return res
85
+ else:
86
+ if line['answer_type'] == 'integer':
87
+ res = int(response)
88
+ ans = int(line['answer'])
89
+ elif line['answer_type'] == 'float':
90
+ res = float(response)
91
+ ans = float(line['answer'])
92
+ else:
93
+ res = str(res)
94
+ ans = str(ans)
95
+ except ValueError:
96
+ pass
97
+
98
+ if res == ans:
99
+ return res if prefetch else True
100
+ else:
101
+ return False
102
+
103
+
104
+ def MathVista_auxeval(model, line):
105
+ prompt = build_mathvista_gpt4_prompt(line)
106
+ log = ''
107
+ retry = 5
108
+ if post_check(line, prefetch=True):
109
+ res = post_check(line, prefetch=True)
110
+ return dict(log='Prefetch succeed', res=res)
111
+ for i in range(retry):
112
+ prediction = line['prediction']
113
+ res = model.generate(prompt, temperature=i * 0.5)
114
+
115
+ if FAIL_MSG in res:
116
+ log += f'Try {i}: output is {prediction}, failed to parse.\n'
117
+ else:
118
+ log += 'Succeed'
119
+ return dict(log=log, res=res)
120
+ log += 'All 5 retries failed.\n'
121
+ return dict(log=log, res='')
122
+
123
+
124
+ def MathVista_acc(result_file):
125
+ data = load(result_file)
126
+ tot = defaultdict(lambda: 0)
127
+ fetch = defaultdict(lambda: 0)
128
+ hit = defaultdict(lambda: 0)
129
+ lt = len(data)
130
+ skill_list = []
131
+ for i in range(lt):
132
+ item = data.iloc[i]
133
+ cate = item['task']
134
+ tot['Overall'] += 1
135
+ try:
136
+ skills = eval(item['skills'])
137
+ except SyntaxError:
138
+ skills = [item['skills']]
139
+ for skill in skills:
140
+ if skill not in skill_list:
141
+ skill_list.append(skill)
142
+ tot[skill] += 1
143
+ tot[cate] += 1
144
+ if item['log'] == 'Prefetch succeed':
145
+ fetch['Overall'] += 1
146
+ fetch[cate] += 1
147
+ for skill in skills:
148
+ fetch[skill] += 1
149
+ if post_check(item, prefetch=False):
150
+ hit['Overall'] += 1
151
+ hit[cate] += 1
152
+ for skill in skills:
153
+ hit[skill] += 1
154
+
155
+ res = defaultdict(list)
156
+ for k in tot.keys():
157
+ res['Task&Skill'].append(k)
158
+ res['tot'].append(tot[k])
159
+ res['prefetch'].append(fetch[k])
160
+ res['hit'].append(hit[k])
161
+ res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
162
+ res['acc'].append(hit[k] / tot[k] * 100)
163
+ res = pd.DataFrame(res)
164
+ return res
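
Before any judge call, `MathVista_auxeval` attempts a rule-based prefetch: for multiple-choice items, `list_to_dict` maps the choice list onto option letters and `can_infer` tries to match the raw prediction against them. A small illustration; the values are made up for the sketch:

# Rule-based prefetch on a multiple-choice item; no judge model is needed here.
choices = list_to_dict(['3/11', '8/11', '6/11', '3/5'])
# -> {'A': '3/11', 'B': '8/11', 'C': '6/11', 'D': '3/5'}

line = {
    'question_type': 'multi_choice',
    'choices': "['3/11', '8/11', '6/11', '3/5']",
    'answer_option': 'B',
    'answer': '8/11',
    'prediction': 'The correct answer is (B) 8/11.',
}
print(post_check(line, prefetch=True))   # 'B' (truthy), so the GPT judge is skipped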
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mlvu.py ADDED
@@ -0,0 +1,189 @@
1
+ from ...smp import *
2
+ from .multiple_choice import extract_answer_from_item
3
+ from PIL import Image, ImageOps
4
+ import numpy as np
5
+
6
+ FAIL_MSG = 'Failed to obtain answer via API.'
7
+
8
+ system_prompt_sub_scene = """
9
+ ##TASK DESCRIPTION:
10
+ You are required to evaluate a respondent's answer based on a provided question, some scoring points, and the respondent's answer. You should provide two scores. The first is the accuracy score, which should range from 1 to 5. The second is the relevance score, which should also range from 1 to 5. Below are the criteria for each scoring category.
11
+ ##ACCURACY Scoring Criteria:
12
+ Evaluate the respondent's answer against specific scoring points as follows:
13
+ Score 1: The response completely misses the scoring point.
14
+ Score 3: The response mentions content related to the scoring point but is not entirely correct.
15
+ Score 5: The response accurately addresses the scoring point.
16
+ Calculate the average score across all scoring points to determine the final accuracy score.
17
+ ##RELEVANCE Scoring Criteria:
18
+ Assess how the respondent's answer relates to the original question:
19
+ Score 1: The response is completely off-topic from the question.
20
+ Score 2: The response is partially related to the question but contains a significant amount of irrelevant content.
21
+ Score 3: The response primarily addresses the question, but the respondent seems uncertain about their own answer.
22
+ Score 4: The response mostly addresses the question and the respondent appears confident in their answer.
23
+ Score 5: The response is fully focused on addressing the question with no irrelevant content and demonstrates complete certainty.
24
+ ----
25
+ ##INSTRUCTION:
26
+ 1. Evaluate Accuracy: First, assess and score each scoring point based on the respondent's answer. Calculate the average of these scores to establish the final accuracy score. Provide a detailed rationale before assigning your score.
27
+ 2. Evaluate RELEVANCE: Assess the relevance of the respondent’s answer to the question. Note that when evaluating relevance, the correctness of the answer is not considered; focus solely on how relevant the answer is to the question. Provide a comprehensive rationale before assigning your score.
28
+ 3. Output Scores in JSON Format: Present the scores in JSON format as follows:
29
+ {'score_accuracy': score_acc, 'score_relevance': score_rele, 'total_score': score_acc + score_rele}
30
+ """ # noqa
31
+
32
+ system_prompt_summary = """
33
+ ##TASK DESCRIPTION:
34
+ You are required to evaluate the performance of the respondent in the video summarization task based on the standard answer and the respondent's answer. You should provide two scores. The first is the COMPLETENESS score, which should range from 1 to 5. The second is the RELIABILITY score, which should also range from 1 to 5. Below are the criteria for each scoring category:
35
+ ##COMPLETENESS Scoring Criteria:
36
+ The completeness score focuses on whether the summary covers all key points and main information from the video.
37
+ Score 1: The summary hardly covers any of the main content or key points of the video.
38
+ Score 2: The summary covers some of the main content and key points but misses many.
39
+ Score 3: The summary covers most of the main content and key points.
40
+ Score 4: The summary is very comprehensive, covering most to nearly all of the main content and key points.
41
+ Score 5: The summary completely covers all the main content and key points of the video.
42
+ ##RELIABILITY Scoring Criteria:
43
+ The reliability score evaluates the correctness and clarity of the video summary. It checks for factual errors, misleading statements, and contradictions with the video content. If the respondent's answer includes details that are not present in the standard answer, as long as these details do not conflict with the correct answer and are reasonable, points should not be deducted.
44
+ Score 1: Contains multiple factual errors and contradictions; presentation is confusing.
45
+ Score 2: Includes several errors and some contradictions; needs clearer presentation.
46
+ Score 3: Generally accurate with minor errors; minimal contradictions; reasonably clear presentation.
47
+ Score 4: Very accurate with negligible inaccuracies; no contradictions; clear and fluent presentation.
48
+ Score 5: Completely accurate with no errors or contradictions; presentation is clear and easy to understand.
49
+ ----
50
+ ##INSTRUCTION:
51
+ 1. Evaluate COMPLETENESS: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
52
+ 2. Evaluate RELIABILITY: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
53
+ 3. Output Scores in JSON Format: Present the scores in JSON format as follows:
54
+ {'score_completeness': score_comp, 'score_reliability': score_reli, 'total_score': score_comp + score_reli}
55
+ """ # noqa
56
+
57
+
58
+ def check_ans_with_model(pred, gt, model, item, dataset_name='MLVU_MCQ'):
59
+ flag = False
60
+
61
+ index = gt.index("(") # noqa
62
+ index2 = gt.index(")") # noqa
63
+ gt_option = gt[index + 1: index2]
64
+
65
+ if ")" in pred:
66
+ index3 = pred.index(")")
67
+ pred = pred[index3 - 1: index3]
68
+ if pred == gt_option:
69
+ flag = True
70
+ elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
71
+ flag = True
72
+
73
+ return flag
74
+
75
+
76
+ def extract_scores_summary(text):
77
+ # Define the keys to locate in the text
78
+ keys = ["score_completeness", "score_reliability"]
79
+ scores = []
80
+
81
+ for key in keys:
82
+ # Find the index where each key starts
83
+ start_index = text.find(key)
84
+ if start_index == -1:
85
+ continue # Skip if key is not found
86
+
87
+ # Find the start of the number which is after the colon and space
88
+ start_number_index = text.find(":", start_index) + 2
89
+ end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
90
+
91
+ # Extract and convert the number to float
92
+ score = float(text[start_number_index:end_number_index])
93
+ scores.append(score)
94
+
95
+ return scores
96
+
97
+
98
+ def check_ans_with_model_summary(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
99
+ user_prompt = f"""
100
+ Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
101
+ Standard Answer: {gt}
102
+ Respondent's Answer: {pred}
103
+ """ # noqa
104
+ result = model.generate(user_prompt)
105
+ result = extract_scores_summary(result)
106
+ result = np.sum(result)
107
+ return result
108
+
109
+
110
+ def extract_scores_sub_scene(text):
111
+ # Define the keys to locate in the text
112
+ keys = ["score_accuracy", "score_relevance"]
113
+ scores = []
114
+
115
+ for key in keys:
116
+ # Find the index where each key starts
117
+ start_index = text.find(key)
118
+ if start_index == -1:
119
+ continue # Skip if key is not found
120
+
121
+ # Find the start of the number which is after the colon and space
122
+ start_number_index = text.find(":", start_index) + 2
123
+ end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
124
+
125
+ # Extract and convert the number to float
126
+ score = float(text[start_number_index:end_number_index])
127
+ scores.append(score)
128
+
129
+ return scores
130
+
131
+
132
+ def check_ans_with_model_sub_scene(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
133
+ user_prompt = f"""
134
+ Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
135
+ Question: {item['question']}
136
+ Scoring Points: {item['scoring_points']}
137
+ Respondent's Answer: {pred}
138
+ """ # noqa
139
+ result = model.generate(user_prompt)
140
+ result = extract_scores_sub_scene(result)
141
+ result = np.sum(result)
142
+ return result
143
+
144
+
145
+ def MLVU_OpenEnded_generate(model, line):
146
+ task_type = line['task_type']
147
+ if task_type == 'summary':
148
+ user_prompt = (
149
+ f"Please score the respondent's answer according to the steps in the Instructions. "
150
+ f"You must end with a JSON dict to store the scores.\n"
151
+ f"Standard Answer: {line['answer']}\n"
152
+ f"Respondent's Answer: {line['prediction']}\n"
153
+ )
154
+ elif task_type == 'sub_scene':
155
+ user_prompt = (
156
+ f"Please score the respondent's answer according to the steps in the Instructions. "
157
+ f"You must end with a JSON dict to store the scores.\n"
158
+ f"Question: {line['question']}\n"
159
+ f"Scoring Points: {line['scoring_points']}\n"
160
+ f"Respondent's Answer: {line['prediction']}\n"
161
+ )
162
+ else:
163
+ AssertionError(f'MLVU don\'t have {task_type} open ended task!')
164
+ result = model.generate(user_prompt)
165
+ return result
166
+
167
+
168
+ def MLVU_OpenEnded_extract(gpt_generate_data, org_data):
169
+ extract_func = {
170
+ 'sub_scene': extract_scores_sub_scene,
171
+ 'summary': extract_scores_summary
172
+ }
173
+ for idx, item in org_data.iterrows():
174
+ func = extract_func[item['task_type']]
175
+ text = gpt_generate_data[idx]
176
+ org_data.loc[idx, 'score'] = np.sum(func(text))
177
+
178
+ return org_data
179
+
180
+
181
+ def get_dimension_rating(data_path):
182
+ data = load(data_path)
183
+ result_dict = {}
184
+ for idx, item in data.iterrows():
185
+ if item['task_type'] not in result_dict:
186
+ result_dict[item['task_type']] = [0, 0]
187
+ result_dict[item['task_type']][0] += int(item['score'])
188
+ result_dict[item['task_type']][1] += 1
189
+ return result_dict
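
`get_dimension_rating` returns a `[score_sum, count]` pair per task type rather than a mean. A tiny helper like the following (not part of the file, shown only for illustration) turns the result into per-task averages:

def mean_rating(result_dict):
    """Convert {task_type: [score_sum, count]} into {task_type: mean_score}."""
    return {task: (total / cnt if cnt else 0.0)
            for task, (total, cnt) in result_dict.items()}

# e.g. {'summary': [42, 10], 'sub_scene': [35, 7]} -> {'summary': 4.2, 'sub_scene': 5.0}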
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmbench_video.py ADDED
@@ -0,0 +1,70 @@
1
+ from ...smp import *
2
+ import numpy as np
3
+
4
+ FAIL_MSG = 'Failed to obtain answer via API.'
5
+
6
+ system_prompt = """
7
+ As an AI assistant, your task is to evaluate a candidate answer in comparison to a given correct answer.
8
+ The question itself, the correct 'groundtruth' answer, and the candidate answer will be provided to you.
9
+ Your assessment should range from 0 to 3, \
10
+ based solely on the semantic similarity between the groundtruth and the candidate answer, \
11
+ disregarding any grammatical differences.
12
+ A rating of 0 suggests no similarity, implying the candidate answer is entirely incorrect.
13
+ A rating of 1 suggests low similarity, meaning the candidate answer is largely incorrect.
14
+ A rating of 2 suggests high similarity, meaning the candidate answer is largely correct.
15
+ Lastly, a rating of 3 indicates complete similarity, which means the candidate answer is entirely correct.
16
+ Your response should be a single integer from 0, 1, 2, or 3.
17
+ """
18
+
19
+ MMV_DIMENSIONS = {
20
+ 'CP': ['Video Topic', 'Video Emotion', 'Video Scene', 'Video Style'],
21
+ 'FP-S': ['OCR', 'Object Recognition', 'Attribute Recognition', 'Event Recognition', 'Human Motion', 'Counting'],
22
+ 'FP-C': ['Spatial Relationship', 'Human-object Interaction', 'Human Interaction'],
23
+ 'HL': ['Hallucination'],
24
+ 'LR': ['Structuralized Image-Text Understanding', 'Mathematical Calculation'],
25
+ 'AR': ['Physical Property', 'Function Reasoning', 'Identity Reasoning'],
26
+ 'RR': ['Natural Relation', 'Physical Relation', 'Social Relation'],
27
+ 'CSR': ['Common Sense Reasoning'],
28
+ 'TR': ['Counterfactual Reasoning', 'Causal Reasoning', 'Future Prediction'],
29
+ }
30
+ L3_DIMS = []
31
+ for k, v in MMV_DIMENSIONS.items():
32
+ L3_DIMS.extend(v)
33
+
34
+ MMV_DIMENSIONS['Perception'] = []
35
+ MMV_DIMENSIONS['Reasoning'] = []
36
+ MMV_DIMENSIONS['Overall'] = []
37
+ for k in ['CP', 'FP-C', 'FP-S', 'HL']:
38
+ MMV_DIMENSIONS['Perception'].extend(MMV_DIMENSIONS[k])
39
+ MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
40
+ for k in ['LR', 'AR', 'RR', 'CSR', 'TR']:
41
+ MMV_DIMENSIONS['Reasoning'].extend(MMV_DIMENSIONS[k])
42
+ MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
43
+
44
+
45
+ def get_dimension_rating(data_path):
46
+ data = load(data_path)
47
+ coarse_rating = {k: [] for k in MMV_DIMENSIONS}
48
+ fine_rating = {k: [] for k in L3_DIMS}
49
+
50
+ for i in range(len(data)):
51
+ cate = data.iloc[i]['dimensions']
52
+ cates = eval(cate)
53
+
54
+ for c in cates:
55
+ fine_rating[c].append(data.iloc[i]['score'])
56
+
57
+ for d in MMV_DIMENSIONS:
58
+ if np.any([x in MMV_DIMENSIONS[d] for x in cates]):
59
+ coarse_rating[d].append(data.iloc[i]['score'])
60
+
61
+ coarse_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in coarse_rating.items()}
62
+ coarse_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in coarse_rating.items()}
63
+ fine_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in fine_rating.items()}
64
+ fine_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in fine_rating.items()}
65
+ return dict(coarse_all=coarse_all, coarse_valid=coarse_valid, fine_all=fine_all, fine_valid=fine_valid)
66
+
67
+
68
+ def build_prompt(item):
69
+ tmpl = 'Question: {}\nGroundtruth answer: {}\nCandidate answer: {}\nYour response: '
70
+ return tmpl.format(item['question'], item['answer'], item['prediction'])
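
Note how `get_dimension_rating` treats negative scores as "judge failed": the `*_all` means clamp them to 0, while the `*_valid` means drop them entirely. A quick numeric check of the two conventions:

import numpy as np

scores = [3, 2, -1, 0]                               # -1 marks a failed judgement
all_mean = np.mean([max(x, 0) for x in scores])      # failures count as 0 -> 1.25
valid_mean = np.mean([x for x in scores if x >= 0])  # failures dropped -> ~1.67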
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py ADDED
@@ -0,0 +1,335 @@
1
+ import uuid
2
+ from functools import partial
3
+ from .image_base import ImageBaseDataset
4
+ from ..smp import *
5
+
6
+ rouge = None
7
+ nlp_en = None
8
+ nlp_zh = None
9
+ nlp = None
10
+
11
+
12
+ def initialize():
13
+ import evaluate
14
+ import spacy
15
+
16
+ global rouge, nlp_en, nlp_zh, nlp
17
+
18
+ try:
19
+ rouge = evaluate.load('rouge', experiment_id=str(uuid.uuid4()))
20
+ except Exception as e:
21
+ logging.critical(f'{type(e)}: {e}')
22
+ logging.critical('Please first `pip install rouge_score`.')
23
+
24
+ try:
25
+ nlp_en = spacy.load('en_core_web_sm')
26
+ except Exception as e:
27
+ logging.warning(f'{type(e)}: {e}')
28
+ logging.warning('Will automatically download en_core_web_sm via spacy.')
29
+ spacy.cli.download('en_core_web_sm')
30
+ nlp_en = spacy.load('en_core_web_sm')
31
+
32
+ try:
33
+ nlp_zh = spacy.load('zh_core_web_sm')
34
+ except Exception as e:
35
+ logging.warning(f'{type(e)}: {e}')
36
+ logging.warning('Will automatically download zh_core_web_sm via spacy.')
37
+ spacy.cli.download('zh_core_web_sm')
38
+ nlp_zh = spacy.load('zh_core_web_sm')
39
+
40
+ nlp = {'en': nlp_en, 'zh': nlp_zh}
41
+
42
+
43
+ def rough_filter(answer_text):
44
+ if "I can't" in answer_text:
45
+ return False
46
+ elif 'I cannot' in answer_text:
47
+ return False
48
+ elif 'sorry' in answer_text.lower():
49
+ return False
50
+ if '无法' in answer_text:
51
+ return False
52
+ elif '抱歉' in answer_text:
53
+ return False
54
+ else:
55
+ return True
56
+
57
+
58
+ def zero_template(crossed_text):
59
+ return {
60
+ 'crossed_text': crossed_text,
61
+ 'max_sim_val': 0,
62
+ 'max_sim_string': '',
63
+ 'precision': 0,
64
+ 'recall': 0,
65
+ 'f1': 0,
66
+ 'jaccard': 0,
67
+ 'rouge1': 0,
68
+ 'exact_match': 0,
69
+ }
70
+
71
+
72
+ def tokenize(text, language):
73
+ """
74
+ Tokenize the text and return the tokens.
75
+
76
+ Parameters:
77
+ text (str): The text to tokenize.
78
+ language (str): The language of the text.
79
+
80
+ Returns:
81
+ list: The list of tokens.
82
+ """
83
+ assert language in ['en', 'zh']
84
+ nlp_language = nlp[language]
85
+ processed_text = nlp_language(text)
86
+ return [token.text for token in processed_text]
87
+
88
+
89
+ def find_best_match(needle, hay, language, rouge):
90
+ """
91
+ Finds the best matching n-gram in the haystack for the given needle.
92
+
93
+ Parameters:
94
+ needle (str): The string to find.
95
+ hay (str): The text to search within.
96
+
97
+ Returns:
98
+ tuple: The highest similarity value and the best matching string.
99
+ """
100
+ assert language in ['en', 'zh']
101
+ from nltk.util import ngrams
102
+ from difflib import SequenceMatcher as SM
103
+
104
+ tokens_hay = tokenize(hay, language)
105
+ tokens_needle = tokenize(needle, language)
106
+
107
+ splitter = '' if language == 'zh' else ' '
108
+ ngrams_ = ngrams(tokens_hay, len(tokens_needle))
109
+ max_sim_val = 0
110
+ max_sim_string = ''
111
+ max_sim_ngram = []
112
+ tokens_needle_set = set(tokens_needle)
113
+ ngrams_hasjoint = [
114
+ ngram
115
+ for ngram in ngrams_
116
+ if not set(ngram).isdisjoint(tokens_needle_set)
117
+ ]
118
+
119
+ for ngram in ngrams_hasjoint:
120
+ hay_ngram = splitter.join(ngram)
121
+ similarity = SM(None, hay_ngram, needle).ratio()
122
+ if similarity > max_sim_val:
123
+ max_sim_val = similarity
124
+ max_sim_string = hay_ngram
125
+ max_sim_ngram = ngram
126
+
127
+ # Evaluate
128
+ if len(max_sim_ngram) == 0:
129
+ return {
130
+ 'crossed_text': needle,
131
+ 'max_sim_val': 0,
132
+ 'max_sim_string': '',
133
+ 'precision': 0,
134
+ 'recall': 0,
135
+ 'f1': 0,
136
+ 'jaccard': 0,
137
+ 'rouge1': 0,
138
+ 'exact_match': 0,
139
+ }
140
+ pred_set = set(max_sim_ngram)
141
+ ref_set = set(tokens_needle)
142
+ correct_tokens = pred_set.intersection(ref_set)
143
+ len_correct_tokens = len(correct_tokens)
144
+
145
+ precision = len_correct_tokens / len(pred_set)
146
+ recall = len_correct_tokens / len(ref_set)
147
+ if (precision + recall) == 0:
148
+ f1 = 0
149
+ else:
150
+ f1 = 2 * precision * recall / (precision + recall)
151
+ union = pred_set.union(ref_set)
152
+ jaccard = len_correct_tokens / len(union) if len(union) > 0 else 0
153
+ rouge_1 = rouge.compute(
154
+ predictions=[max_sim_string],
155
+ references=[needle],
156
+ tokenizer=partial(tokenize, language=language),
157
+ rouge_types=['rouge1'],
158
+ )['rouge1']
159
+ exact_match = float(list(max_sim_ngram) == list(tokens_needle))
160
+ out = {
161
+ 'crossed_text': needle,
162
+ 'max_sim_string': max_sim_string,
163
+ 'max_sim_val': max_sim_val,
164
+ 'precision': precision,
165
+ 'recall': recall,
166
+ 'f1': f1,
167
+ 'jaccard': jaccard,
168
+ 'rouge1': rouge_1,
169
+ 'exact_match': exact_match,
170
+ }
171
+ return out
172
+
173
+
174
+ def process_match_single_new(
175
+ image_id, prediction, answer, language, progress
176
+ ):
177
+ """
178
+ process the inference results for a single image and calculate the metrics
179
+
180
+ Parameters:
181
+ image_id (int): The image id (question id).
182
+ prediction (str): The prediction text.
183
+ answer (Union[str, List[str]]): The answer text, or a list of answer texts. The masked n-grams in the image.
184
+ language (str): The language of the text. Can be "en" or "zh".
185
+ rouge (rouge): The rouge metric object.
186
+ progress (multiprocessing.Queue): The progress queue.
187
+
188
+ Returns:
189
+ tuple: The image id (question_id, int) and the result per id (dict of dict of dict).
190
+ """
191
+ result_per_id = {image_id: {}}
192
+ if isinstance(answer, str):
193
+ answer = eval(answer)
194
+ assert isinstance(answer, list)
195
+ result = prediction.split('Assistant: ')[-1]
196
+ for i, crossed_text in enumerate(answer):
197
+ if rough_filter(result):
198
+ find_best_match_result = find_best_match(
199
+ crossed_text, result, language, rouge
200
+ )
201
+ if i == 0:
202
+ result_per_id[image_id] = {str(i): find_best_match_result}
203
+ else:
204
+ result_per_id[image_id][str(i)] = find_best_match_result
205
+ else:
206
+ if i == 0:
207
+ result_per_id[image_id] = {str(i): zero_template(crossed_text)}
208
+ else:
209
+ result_per_id[image_id][str(i)] = zero_template(crossed_text)
210
+ progress.put(1)
211
+ return image_id, result_per_id
212
+
213
+
214
+ class VCRDataset(ImageBaseDataset):
215
+ TYPE = 'VQA'
216
+
217
+ URL_PREFIX = 'https://huggingface.co/datasets/vcr-org'
218
+
219
+ DATASET_URL = {
220
+ 'VCR_EN_EASY_500': f'{URL_PREFIX}/VCR-wiki-en-easy-test-500/resolve/main/VCR-wiki-en-easy-test-500.tsv',
221
+ 'VCR_EN_EASY_100': f'{URL_PREFIX}/VCR-wiki-en-easy-test-100/resolve/main/VCR-wiki-en-easy-test-100.tsv',
222
+ 'VCR_EN_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-en-easy-test/resolve/main/VCR-wiki-en-easy-test.tsv',
223
+ 'VCR_EN_HARD_500': f'{URL_PREFIX}/VCR-wiki-en-hard-test-500/resolve/main/VCR-wiki-en-hard-test-500.tsv',
224
+ 'VCR_EN_HARD_100': f'{URL_PREFIX}/VCR-wiki-en-hard-test-100/resolve/main/VCR-wiki-en-hard-test-100.tsv',
225
+ 'VCR_EN_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-en-hard-test/resolve/main/VCR-wiki-en-hard-test.tsv',
226
+ 'VCR_ZH_EASY_500': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-500/resolve/main/VCR-wiki-zh-easy-test-500.tsv',
227
+ 'VCR_ZH_EASY_100': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-100/resolve/main/VCR-wiki-zh-easy-test-100.tsv',
228
+ 'VCR_ZH_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-zh-easy-test/resolve/main/VCR-wiki-zh-easy-test.tsv',
229
+ 'VCR_ZH_HARD_500': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-500/resolve/main/VCR-wiki-zh-hard-test-500.tsv',
230
+ 'VCR_ZH_HARD_100': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-100/resolve/main/VCR-wiki-zh-hard-test-100.tsv',
231
+ 'VCR_ZH_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-zh-hard-test/resolve/main/VCR-wiki-zh-hard-test.tsv',
232
+ }
233
+
234
+ DATASET_MD5 = {
235
+ 'VCR_EN_EASY_500': 'fd9258db52f8685dc710619a0ea0a261',
236
+ 'VCR_EN_EASY_100': '9df5d7266683458621ecbe122beb72f0',
237
+ 'VCR_EN_EASY_ALL': '8a9b96885f251d1c85f42f84073327f1',
238
+ 'VCR_EN_HARD_500': '0a22a85080b6a1f52b1f95e302d43df4',
239
+ 'VCR_EN_HARD_100': '1b20f5cbcbeae0b0bec77f7a36143958',
240
+ 'VCR_EN_HARD_ALL': '2d8b8b1ee0eba0e0b618fd3aa7d9710e',
241
+ 'VCR_ZH_EASY_500': 'beca5fd54176adf44cf94bd9b50cf048',
242
+ 'VCR_ZH_EASY_100': '4a86a5678a79844d6d22ab0629c51cd5',
243
+ 'VCR_ZH_EASY_ALL': '5050fe7f0027ad2068fd4c7f220edaea',
244
+ 'VCR_ZH_HARD_500': '617e3360f75c54455625cb0a8da5c1e7',
245
+ 'VCR_ZH_HARD_100': 'b0e38c85f5d5e63894a3b881c372a62b',
246
+ 'VCR_ZH_HARD_ALL': '54bbfef448206518b03127ef8b61404c',
247
+ }
248
+
249
+ def __init__(self, dataset='VCR_EN_EASY_500', skip_noimg=True):
250
+ super().__init__(dataset, skip_noimg)
251
+
252
+ initialize()
253
+ self.language = 'en' if 'EN' in dataset else 'zh'
254
+ self.difficulty = 'easy' if 'EASY' in dataset else 'hard'
255
+
256
+ # def build_prompt(self, line):
257
+ # msgs = super().build_prompt(line)
258
+ # assert msgs[-1]['type'] == 'text'
259
+ # if self.language == 'zh':
260
+ # msgs[-1]['value'] += '图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。'
261
+ # else:
262
+ # msgs[-1]['value'] += ('What is the covered texts in the image? '
263
+ # 'Please restore the covered texts without outputting the explanations.')
264
+ # return msgs
265
+
266
+ def evaluate(self, eval_file, **judge_kwargs):
267
+ import multiprocessing
268
+
269
+ vcr_score_list = {'Exact_Match': [], 'Jaccard': []}
270
+ vcr_score = {'Exact_Match': 0, 'Jaccard': 0}
271
+ logger = get_logger('Evaluation')
272
+ data = load(eval_file)
273
+
274
+ lt = len(data)
275
+ lines = [data.iloc[i] for i in range(lt)]
276
+
277
+ pool = multiprocessing.Pool()
278
+ manager = multiprocessing.Manager()
279
+ progress_queue = manager.Queue()
280
+ results = []
281
+
282
+ overall_results = {str(image_id): {} for image_id in range(len(lines))}
283
+
284
+ for instance_id, instance in enumerate(lines):
285
+ results.append(
286
+ pool.apply_async(
287
+ process_match_single_new,
288
+ args=(
289
+ str(instance_id),
290
+ instance['prediction'],
291
+ instance['answer'],
292
+ self.language,
293
+ progress_queue,
294
+ ),
295
+ )
296
+ )
297
+ pool.close()
298
+
299
+ # Display progress bar
300
+ for _ in tqdm(range(len(results))):
301
+ progress_queue.get()
302
+
303
+ pool.join()
304
+
305
+ # Merging results into overall_result
306
+ for result in results:
307
+ image_id, result_per_id = result.get()
308
+ overall_results[str(image_id)].update(result_per_id[image_id])
309
+ for blank_id_str in result_per_id[image_id].keys():
310
+ vcr_score_list['Exact_Match'].append(
311
+ result_per_id[image_id][blank_id_str]['exact_match']
312
+ )
313
+ vcr_score_list['Jaccard'].append(
314
+ result_per_id[image_id][blank_id_str]['jaccard']
315
+ )
316
+ vcr_score['Exact_Match'] = np.mean(vcr_score_list['Exact_Match'])
317
+ vcr_score['Jaccard'] = np.mean(vcr_score_list['Jaccard'])
318
+ results_out = {
319
+ k: v for i in range(len(results)) for k, v in results[i].get()[1].items()
320
+ }
321
+ results_with_metrics = {
322
+ 'Exact_Match': vcr_score['Exact_Match'],
323
+ 'Jaccard': vcr_score['Jaccard'],
324
+ 'Predictions': results_out,
325
+ }
326
+ score_pth = eval_file.replace(
327
+ '.xlsx', f'{self.language}_{self.difficulty}_score.json'
328
+ )
329
+ dump(results_with_metrics, score_pth)
330
+ logger.info(
331
+ f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}'
332
+ )
333
+ logger.info('Score: ')
334
+ for key, value in vcr_score.items():
335
+ logger.info('{}:{}'.format(key, value))
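
The core of `find_best_match` is plain set arithmetic over tokens; spaCy and ROUGE only supply the tokenization and an extra rouge1 signal. Stripped down to a whitespace tokenizer, the metric definitions reduce to this self-contained sketch:

# Token-set metrics over a best-matching n-gram vs. the reference span.
pred_set = set('sour apple candy'.split())      # tokens of the best-matching n-gram
ref_set = set('sour apple candy bar'.split())   # tokens of the crossed-out text

correct = len(pred_set & ref_set)                    # 3
precision = correct / len(pred_set)                  # 1.0
recall = correct / len(ref_set)                      # 0.75
f1 = 2 * precision * recall / (precision + recall)   # ~0.857
jaccard = correct / len(pred_set | ref_set)          # 0.75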
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py ADDED
@@ -0,0 +1,135 @@
1
+ from abc import abstractmethod
2
+ from ..smp import *
3
+
4
+
5
+ class VideoBaseDataset:
6
+
7
+ MODALITY = 'VIDEO'
8
+
9
+ def __init__(self,
10
+ dataset='MMBench-Video',
11
+ pack=False,
12
+ nframe=0,
13
+ fps=-1):
14
+ try:
15
+ import decord
16
+ except Exception as e:
17
+ logging.critical(f'{type(e)}: {e}')
18
+ logging.critical('Please install decord via `pip install decord`.')
19
+
20
+ self.dataset_name = dataset
21
+ ret = self.prepare_dataset(dataset)
22
+ assert ret is not None
23
+ lmu_root = LMUDataRoot()
24
+ self.frame_root = osp.join(lmu_root, 'images', dataset)
25
+ os.makedirs(self.frame_root, exist_ok=True)
26
+ self.frame_tmpl = 'frame-{}-of-{}.jpg'
27
+ self.frame_tmpl_fps = 'frame-{}-of-{}-{}fps.jpg'
28
+
29
+ self.data_root = ret['root']
30
+ self.data_file = ret['data_file']
31
+ self.data = load(self.data_file)
32
+
33
+ assert 'question' in self.data and 'video' in self.data
34
+ videos = list(set(self.data['video']))
35
+ videos.sort()
36
+ self.videos = videos
37
+ self.pack = pack
38
+ self.nframe = nframe
39
+ self.fps = fps
40
+ if self.fps > 0 and self.nframe > 0:
41
+ raise ValueError('fps and nframe should not be set at the same time')
42
+ if self.fps <= 0 and self.nframe <= 0:
43
+ raise ValueError('at least one of fps and nframe should be set to a valid value')
44
+
45
+ def __len__(self):
46
+ return len(self.videos) if self.pack else len(self.data)
47
+
48
+ def __getitem__(self, idx):
49
+ if self.pack:
50
+ assert idx < len(self.videos)
51
+ sub_data = self.data[self.data['video'] == self.videos[idx]]
52
+ return sub_data
53
+ else:
54
+ assert idx < len(self.data)
55
+ return dict(self.data.iloc[idx])
56
+
57
+ def frame_paths(self, video):
58
+ frame_root = osp.join(self.frame_root, video)
59
+ os.makedirs(frame_root, exist_ok=True)
60
+ return [osp.join(frame_root, self.frame_tmpl.format(i, self.nframe)) for i in range(1, self.nframe + 1)]
61
+
62
+ def frame_paths_fps(self, video, num_frames):
63
+ frame_root = osp.join(self.frame_root, video)
64
+ os.makedirs(frame_root, exist_ok=True)
65
+ return [osp.join(frame_root,
66
+ self.frame_tmpl_fps.format(i, num_frames, self.fps)) for i in range(1, num_frames + 1)]
67
+
68
+ def save_video_frames(self, video):
69
+ import decord  # local import; __init__ only checks that decord is importable
+ if self.fps > 0:
70
+ vid_path = osp.join(self.data_root, video + '.mp4')
71
+ vid = decord.VideoReader(vid_path)
72
+
73
+ # Compute the total frame count and total duration of the video
74
+ total_frames = len(vid)
75
+ video_fps = vid.get_avg_fps()
76
+ total_duration = total_frames / video_fps
77
+
78
+ # Compute the total number of frames to extract
79
+ required_frames = int(total_duration * self.fps)
80
+
81
+ # Compute the sampling interval between extracted frames
82
+ step_size = video_fps / self.fps
83
+
84
+ # Compute the indices of the frames to extract
85
+ indices = [int(i * step_size) for i in range(required_frames)]
86
+
87
+ # Extract the frames and save them to disk
88
+ frame_paths = self.frame_paths_fps(video, len(indices))
89
+ flag = np.all([osp.exists(p) for p in frame_paths])
90
+ if flag:
91
+ return frame_paths
92
+
93
+ images = [vid[i].asnumpy() for i in indices]
94
+ images = [Image.fromarray(arr) for arr in images]
95
+ for im, pth in zip(images, frame_paths):
96
+ if not osp.exists(pth):
97
+ im.save(pth)
98
+ return frame_paths
99
+
100
+ else:
101
+ frame_paths = self.frame_paths(video)
102
+ flag = np.all([osp.exists(p) for p in frame_paths])
103
+ if flag:
104
+ return frame_paths
105
+ vid_path = osp.join(self.data_root, video + '.mp4')
106
+ vid = decord.VideoReader(vid_path)
107
+ step_size = len(vid) / (self.nframe + 1)
108
+ indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
109
+ images = [vid[i].asnumpy() for i in indices]
110
+ images = [Image.fromarray(arr) for arr in images]
111
+ for im, pth in zip(images, frame_paths):
112
+ if not osp.exists(pth):
113
+ im.save(pth)
114
+ return frame_paths
115
+
116
+ # Return a list of dataset names that are supported by this class, can override
117
+ @classmethod
118
+ def supported_datasets(cls):
119
+ return ['MMBench-Video', 'Video-MME', 'MVBench', 'MVBench_MP4', 'LongVideoBench']
120
+
121
+ # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
122
+ @abstractmethod
123
+ def evaluate(self, eval_file, **judge_kwargs):
124
+ pass
125
+
126
+ @abstractmethod
127
+ def build_prompt(self, idx):
128
+ pass
129
+
130
+ @abstractmethod
131
+ def prepare_dataset(self, dataset):
132
+ # The prepare_dataset function should return a dictionary containing:
133
+ # `root` (directory that containing video files)
134
+ # `data_file` (the TSV dataset file)
135
+ pass
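
The two sampling modes in `save_video_frames` differ only in how frame indices are computed: `nframe` spreads a fixed count evenly across the clip (skipping the endpoints), while `fps` resamples at a target rate. The index arithmetic in isolation, with made-up video properties:

total_frames, video_fps = 300, 30.0   # a hypothetical 10-second clip

# nframe mode: 8 evenly spaced frames, excluding the very first and last positions.
nframe = 8
step = total_frames / (nframe + 1)
nframe_indices = [int(i * step) for i in range(1, nframe + 1)]   # [33, 66, ..., 266]

# fps mode: resample at 1 frame per second.
target_fps = 1.0
duration = total_frames / video_fps        # 10.0 seconds
required = int(duration * target_fps)      # 10 frames
step = video_fps / target_fps              # 30.0
fps_indices = [int(i * step) for i in range(required)]           # [0, 30, ..., 270]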
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_concat_dataset.py ADDED
@@ -0,0 +1,85 @@
1
+ from ..smp import *
2
+ from .video_base import VideoBaseDataset
3
+
4
+
5
+ class ConcatVideoDataset(VideoBaseDataset):
6
+ # This dataset takes multiple dataset names as input and aggregate them into a single dataset.
7
+ # Each single dataset should not have a field named `SUB_DATASET`
8
+
9
+ DATASET_SETS = {}
10
+
11
+ def __init__(self, dataset, **kwargs):
12
+ from . import build_dataset
13
+ datasets = self.DATASET_SETS[dataset]
14
+ self.dataset_map = {}
15
+ # The name of the compilation
16
+ self.dataset_name = dataset
17
+ self.datasets = datasets
18
+ self.nframe = kwargs.get('nframe', 0)
19
+ self.fps = kwargs.get('fps', -1)
20
+ for dname in datasets:
21
+ dataset = build_dataset(dname, **kwargs)
22
+ assert dataset is not None, dataset
23
+ self.dataset_map[dname] = dataset
24
+ TYPES = [x.TYPE for x in self.dataset_map.values()]
25
+ MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
26
+ # assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
27
+ assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
28
+ self.TYPE = TYPES
29
+ self.MODALITY = MODALITIES[0]
30
+ data_all = []
31
+ for dname in datasets:
32
+ data = self.dataset_map[dname].data
33
+ data['SUB_DATASET'] = [dname] * len(data)
34
+ data_all.append(data)
35
+
36
+ data = pd.concat(data_all)
37
+ data['original_index'] = data.pop('index')
38
+ data['index'] = np.arange(len(data))
39
+ self.data = data
40
+
41
+ def build_prompt(self, line, video_llm):
42
+ if isinstance(line, int):
43
+ line = self.data.iloc[line]
44
+ idx = line['original_index']
45
+ dname = line['SUB_DATASET']
46
+ org_data = self.dataset_map[dname].data
47
+ org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
48
+ return self.dataset_map[dname].build_prompt(org_line, video_llm)
49
+
50
+ def dump_image(self, line):
51
+ # Assert all images are pre-dumped
52
+ assert 'image' not in line
53
+ assert 'image_path' in line
54
+ tgt_path = toliststr(line['image_path'])
55
+ return tgt_path
56
+
57
+ @classmethod
58
+ def supported_datasets(cls):
59
+ return [] # list(cls.DATASET_SETS)
60
+
61
+ def evaluate(self, eval_file, **judge_kwargs):
62
+ suffix = eval_file.split('.')[-1]
63
+ # First, split the eval_file by dataset
64
+ data_all = load(eval_file)
65
+ for dname in self.datasets:
66
+ tgt = eval_file.replace(self.dataset_name, dname)
67
+ data_sub = data_all[data_all['SUB_DATASET'] == dname]
68
+ data_sub.pop('index')
69
+ data_sub['index'] = data_sub.pop('original_index')
70
+ data_sub.pop('SUB_DATASET')
71
+ dump(data_sub, tgt)
72
+ # Then, evaluate each dataset separately
73
+ results_all = {}
74
+ for dname in self.datasets:
75
+ tgt = eval_file.replace(self.dataset_name, dname)
76
+ res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
77
+ results_all.update(res)
78
+
79
+ result = pd.DataFrame(results_all, index=['success', 'overall'])
80
+ result = result.T
81
+ for idx, item in result.iterrows():
82
+ result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1)
83
+ score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
84
+ dump(result, score_file)
85
+ return result
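
`ConcatVideoDataset` relies on a reversible re-indexing trick: each sub-dataset's own `index` is stashed in `original_index`, a fresh global `index` is assigned, and `evaluate` later inverts the mapping so each sub-dataset receives a file that looks exactly like its own output. The round trip in miniature, on toy frames for illustration only:

import numpy as np
import pandas as pd

a = pd.DataFrame({'index': [0, 1], 'video': ['a0', 'a1'], 'SUB_DATASET': 'dsA'})
b = pd.DataFrame({'index': [0, 1], 'video': ['b0', 'b1'], 'SUB_DATASET': 'dsB'})

merged = pd.concat([a, b])
merged['original_index'] = merged.pop('index')
merged['index'] = np.arange(len(merged))             # global ids 0..3

# Inverting the mapping for one sub-dataset, as evaluate() does:
sub = merged[merged['SUB_DATASET'] == 'dsB'].copy()
sub.pop('index')
sub['index'] = sub.pop('original_index')             # back to local ids 0..1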
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_dataset_config.py ADDED
@@ -0,0 +1,103 @@
1
+ from vlmeval.dataset import *
2
+ from functools import partial
3
+
4
+ mmbench_video_dataset = {
5
+ 'MMBench_Video_8frame_nopack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=8, pack=False),
6
+ 'MMBench_Video_8frame_pack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=8, pack=True),
7
+ 'MMBench_Video_16frame_nopack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=16, pack=False),
8
+ 'MMBench_Video_1fps_nopack': partial(MMBenchVideo, dataset='MMBench-Video', fps=1.0, pack=False),
9
+ 'MMBench_Video_1fps_pack': partial(MMBenchVideo, dataset='MMBench-Video', fps=1.0, pack=True)
10
+ }
11
+
12
+ mvbench_dataset = {
13
+ 'MVBench_8frame': partial(MVBench, dataset='MVBench', nframe=8),
14
+ # MVBench not support fps, but MVBench_MP4 does
15
+ 'MVBench_MP4_8frame': partial(MVBench_MP4, dataset='MVBench_MP4', nframe=8),
16
+ 'MVBench_MP4_1fps': partial(MVBench_MP4, dataset='MVBench_MP4', fps=1.0),
17
+ }
18
+
19
+ videomme_dataset = {
20
+ 'Video-MME_8frame': partial(VideoMME, dataset='Video-MME', nframe=8),
21
+ 'Video-MME_8frame_subs': partial(VideoMME, dataset='Video-MME', nframe=8, use_subtitle=True),
22
+ 'Video-MME_1fps': partial(VideoMME, dataset='Video-MME', fps=1.0),
23
+ 'Video-MME_0.5fps': partial(VideoMME, dataset='Video-MME', fps=0.5),
24
+ 'Video-MME_0.5fps_subs': partial(VideoMME, dataset='Video-MME', fps=0.5, use_subtitle=True),
25
+ }
26
+
27
+ longvideobench_dataset = {
28
+ 'LongVideoBench_8frame': partial(LongVideoBench, dataset='LongVideoBench', nframe=8),
29
+ 'LongVideoBench_8frame_subs': partial(LongVideoBench, dataset='LongVideoBench', nframe=8, use_subtitle=True),
30
+ 'LongVideoBench_1fps': partial(LongVideoBench, dataset='LongVideoBench', fps=1.0),
31
+ 'LongVideoBench_0.5fps': partial(LongVideoBench, dataset='LongVideoBench', fps=0.5),
32
+ 'LongVideoBench_0.5fps_subs': partial(LongVideoBench, dataset='LongVideoBench', fps=0.5, use_subtitle=True)
33
+ }
34
+
35
+ mlvu_dataset = {
36
+ 'MLVU_8frame': partial(MLVU, dataset='MLVU', nframe=8),
37
+ 'MLVU_1fps': partial(MLVU, dataset='MLVU', fps=1.0)
38
+ }
39
+
40
+ tempcompass_dataset = {
41
+ 'TempCompass_8frame': partial(TempCompass, dataset='TempCompass', nframe=8),
42
+ 'TempCompass_1fps': partial(TempCompass, dataset='TempCompass', fps=1.0),
43
+ 'TempCompass_0.5fps': partial(TempCompass, dataset='TempCompass', fps=0.5)
44
+ }
45
+
46
+ # To reproduce the experimental results in the CG-Bench paper,
47
+ # use_subtitle, use_subtitle_time and use_frame_time all need to be set to True.
48
+ # When measuring clue-related results, if more than 32 frames are requested,
49
+ # the frame budget is capped at 32. A reproduction-style entry is sketched after the dict below.
50
+ cgbench_dataset = {
51
+ 'CGBench_MCQ_Grounding_Mini_8frame_subs_subt': partial(
52
+ CGBench_MCQ_Grounding_Mini,
53
+ dataset='CG-Bench_MCQ_Grounding_Mini',
54
+ nframe=8,
55
+ use_subtitle=True,
56
+ use_subtitle_time=True
57
+ ),
58
+ 'CGBench_OpenEnded_Mini_8frame_subs_subt_ft': partial(
59
+ CGBench_OpenEnded_Mini,
60
+ dataset='CG-Bench_OpenEnded_Mini',
61
+ nframe=8,
62
+ use_subtitle=True,
63
+ use_subtitle_time=True,
64
+ use_frame_time=True
65
+ ),
66
+ 'CGBench_MCQ_Grounding_32frame_subs': partial(
67
+ CGBench_MCQ_Grounding,
68
+ dataset='CG-Bench_MCQ_Grounding',
69
+ nframe=32,
70
+ use_subtitle=True
71
+ ),
72
+ 'CGBench_OpenEnded_8frame': partial(
73
+ CGBench_OpenEnded,
74
+ dataset='CG-Bench_OpenEnded',
75
+ nframe=8
76
+ ),
77
+ 'CGBench_MCQ_Grounding_16frame_subs_subt_ft': partial(
78
+ CGBench_MCQ_Grounding,
79
+ dataset='CG-Bench_MCQ_Grounding',
80
+ nframe=16,
81
+ use_subtitle=True,
82
+ use_subtitle_time=True,
83
+ use_frame_time=True
84
+ ),
85
+ 'CGBench_OpenEnded_16frame_subs_subt_ft': partial(
86
+ CGBench_OpenEnded,
87
+ dataset='CG-Bench_OpenEnded',
88
+ nframe=16,
89
+ use_subtitle=True,
90
+ use_subtitle_time=True,
91
+ use_frame_time=True
92
+ )
93
+ }
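
Following the note above the dict, a paper-faithful CG-Bench entry would enable all three flags while keeping the frame budget at 32 or below. The key below is illustrative, not one of the shipped names:

# Hypothetical entry reproducing the CG-Bench paper setting (<= 32 frames, all flags on).
cgbench_dataset['CGBench_MCQ_Grounding_32frame_subs_subt_ft'] = partial(
    CGBench_MCQ_Grounding,
    dataset='CG-Bench_MCQ_Grounding',
    nframe=32,
    use_subtitle=True,
    use_subtitle_time=True,
    use_frame_time=True,
)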
94
+
95
+ supported_video_datasets = {}
96
+
97
+ dataset_groups = [
98
+ mmbench_video_dataset, mvbench_dataset, videomme_dataset, longvideobench_dataset,
99
+ mlvu_dataset, tempcompass_dataset, cgbench_dataset
100
+ ]
101
+
102
+ for grp in dataset_groups:
103
+ supported_video_datasets.update(grp)
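
Each registry value is a zero-argument constructor (a `functools.partial` with the dataset options pre-bound), so instantiating a configured dataset is a single call. A usage sketch; it assumes the underlying dataset files are available locally or downloadable:

dataset = supported_video_datasets['Video-MME_8frame']()
print(len(dataset))   # number of questions
sample = dataset[0]   # dict with 'question', 'video', ... fields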
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py ADDED
@@ -0,0 +1,283 @@
1
+ from huggingface_hub import snapshot_download
2
+ from ..smp import *
3
+ from .video_base import VideoBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+
6
+ FAIL_MSG = 'Failed to obtain answer via API.'
7
+
8
+
9
+ def unwrap_hf_pkl(pth, suffix='.mp4'):
10
+ base_dir = os.path.join(pth, 'video_pkl/')
11
+ target_dir = os.path.join(pth, 'video/')
12
+ pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
13
+ pickle_files.sort()
14
+
15
+ if not os.path.exists(target_dir):
16
+ os.makedirs(target_dir, exist_ok=True)
17
+ for pickle_file in pickle_files:
18
+ with open(pickle_file, 'rb') as file:
19
+ video_data = pickle.load(file)
20
+ # For each video file in the pickle file, write its contents to a new mp4 file
21
+ for video_name, video_content in video_data.items():
22
+ output_path = os.path.join(target_dir, f'{video_name}{suffix}')
23
+ with open(output_path, 'wb') as output_file:
24
+ output_file.write(video_content)
25
+ print('The video files have been restored from the pickle files.')
26
+ else:
27
+ print('The video file already exists.')
28
+
29
+
30
+ class VideoMME(VideoBaseDataset):
31
+
32
+ MD5 = '85bdd91f9b29a99354c23b97ab7c113c'
33
+ SYS = ''
34
+
35
+ FRAMES_TMPL_NOSUB = """
36
+ These are the frames of a video. \
37
+ Select the best answer to the following multiple-choice question based on the video. \
38
+ Respond with only the letter (A, B, C, or D) of the correct option.
39
+ """
40
+
41
+ FRAMES_TMPL_SUB = """
42
+ These are the frames of a video. \
43
+ This video's subtitles are listed below:
44
+ {}
45
+ Select the best answer to the following multiple-choice question based on the video. \
46
+ Respond with only the letter (A, B, C, or D) of the correct option.
47
+ """
48
+
49
+ TYPE = 'Video-MCQ'
50
+
51
+ def __init__(self, dataset='Video-MME', use_subtitle=False, nframe=0, fps=-1):
52
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
53
+ self.use_subtitle = use_subtitle
54
+ self.dataset_name = dataset
55
+
56
+ @classmethod
57
+ def supported_datasets(cls):
58
+ return ['Video-MME']
59
+
60
+ def prepare_dataset(self, dataset_name='Video-MME', repo_id='lmms-lab/Video-MME'):
61
+
62
+ def check_integrity(pth):
63
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
64
+
65
+ if not os.path.exists(data_file):
66
+ return False
67
+
68
+ if md5(data_file) != self.MD5:
69
+ return False
70
+ data = load(data_file)
71
+ for video_pth in data['video_path']:
72
+ if not osp.exists(osp.join(pth, video_pth)):
73
+ return False
74
+ return True
75
+
76
+ cache_path = get_cache_path(repo_id)
77
+ if cache_path is not None and check_integrity(cache_path):
78
+ dataset_path = cache_path
79
+ else:
80
+
81
+ def unzip_hf_zip(pth):
82
+ import zipfile
83
+ base_dir = pth
84
+ target_dir = os.path.join(pth, 'video/')
85
+ zip_files = [
86
+ os.path.join(base_dir, file) for file in os.listdir(base_dir)
87
+ if file.endswith('.zip') and file.startswith('video')
88
+ ]
89
+ zip_files.sort()
90
+
91
+ if not os.path.exists(target_dir):
92
+ os.makedirs(target_dir, exist_ok=True)
93
+ for zip_file in zip_files:
94
+ with zipfile.ZipFile(zip_file, 'r') as zip_ref:
95
+ for member in zip_ref.namelist():
96
+ # Check if the member is a file (not a directory)
97
+ if not member.endswith('/'):
98
+ # Extract the file to the specified directory
99
+ source = zip_ref.open(member)
100
+ target = open(os.path.join(target_dir, os.path.basename(member)), 'wb')
101
+ with source, target:
102
+ target.write(source.read())
103
+ print('The video files have been restored from the zip files.')
104
+ else:
105
+ print('The video file already exists.')
106
+
107
+ subtitle_zip_file = os.path.join(base_dir, 'subtitle.zip')
108
+ subtitle_target_dir = os.path.join(base_dir, 'subtitle')
109
+
110
+ if not os.path.exists(subtitle_target_dir):
111
+ os.makedirs(subtitle_target_dir, exist_ok=True)
112
+ with zipfile.ZipFile(subtitle_zip_file, 'r') as zip_ref:
113
+ for member in zip_ref.namelist():
114
+ # Check if the member is a file (not a directory)
115
+ if not member.endswith('/'):
116
+ # Extract the file to the specified directory
117
+ source = zip_ref.open(member)
118
+ target = open(os.path.join(subtitle_target_dir, os.path.basename(member)), 'wb')
119
+ with source, target:
120
+ target.write(source.read())
121
+ print('The subtitle files have been restored from the zip file.')
122
+ else:
123
+ print('The subtitle file already exists.')
124
+
125
+ def generate_tsv(pth):
126
+
127
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
128
+ if os.path.exists(data_file) and md5(data_file) == self.MD5:
129
+ return
130
+
131
+ data_file = pd.read_parquet(os.path.join(pth, 'videomme/test-00000-of-00001.parquet'))
132
+ data_file = data_file.assign(index=range(len(data_file)))
133
+ data_file['video'] = data_file['videoID']
134
+ data_file['video_path'] = data_file['videoID'].apply(lambda x: f'./video/{x}.mp4')
135
+ data_file['subtitle_path'] = data_file['videoID'].apply(lambda x: f'./subtitle/{x}.srt')
136
+ data_file['candidates'] = data_file['options'].apply(lambda x: x.tolist())
137
+
138
+ data_file = data_file[['index', 'video', 'video_path', 'duration', 'domain', 'candidates',
139
+ 'sub_category', 'task_type', 'subtitle_path', 'question', 'answer']]
140
+
141
+ data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)
142
+
143
+ if modelscope_flag_set():
144
+ from modelscope import dataset_snapshot_download
145
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id)
146
+ else:
147
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
148
+ unzip_hf_zip(dataset_path)
149
+ generate_tsv(dataset_path)
150
+
151
+ data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
152
+
153
+ return dict(data_file=data_file, root=dataset_path)
154
+
155
+ def save_video_frames(self, video, video_llm=False):
156
+ import decord  # local import; availability was already checked in VideoBaseDataset.__init__
157
+ vid_path = osp.join(self.data_root, 'video', video + '.mp4')
158
+ vid = decord.VideoReader(vid_path)
159
+ video_info = {
160
+ 'fps': vid.get_avg_fps(),
161
+ 'n_frames': len(vid),
162
+ }
163
+ if self.nframe > 0 and self.fps < 0:
164
+ step_size = len(vid) / (self.nframe + 1)
165
+ indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
166
+ frame_paths = self.frame_paths(video)
167
+ elif self.fps > 0:
168
+ # not constrained by num_frames, get frames by fps
169
+ total_duration = video_info['n_frames'] / video_info['fps']
170
+ required_frames = int(total_duration * self.fps)
171
+ step_size = video_info['fps'] / self.fps
172
+ indices = [int(i * step_size) for i in range(required_frames)]
173
+ frame_paths = self.frame_paths_fps(video, len(indices))
174
+
175
+ flag = np.all([osp.exists(p) for p in frame_paths])
176
+
177
+ if not flag:
178
+ images = [vid[i].asnumpy() for i in indices]
179
+ images = [Image.fromarray(arr) for arr in images]
180
+ for im, pth in zip(images, frame_paths):
181
+ if not osp.exists(pth) and not video_llm:
182
+ im.save(pth)
183
+
184
+ return frame_paths, indices, video_info
185
+
186
+ def build_prompt(self, line, video_llm):
187
+ if isinstance(line, int):
188
+ assert line < len(self)
189
+ line = self.data.iloc[line]
190
+
191
+ frames, indices, video_info = self.save_video_frames(line['video'], video_llm)
192
+
193
+ if self.use_subtitle and os.path.exists(osp.join(self.data_root, line['subtitle_path'])):
194
+ import pysubs2
195
+ subs = pysubs2.load(osp.join(self.data_root, line['subtitle_path']), encoding='utf-8')
196
+ subtitles = []
197
+
198
+ for selected_frame_id in indices:
199
+ sub_text = ''
200
+ cur_time = pysubs2.make_time(fps=video_info['fps'], frames=selected_frame_id)
201
+ for sub in subs:
202
+ if sub.start < cur_time and sub.end > cur_time:
203
+ sub_text = sub.text.replace('\\N', ' ')
204
+ break
205
+ if sub_text.strip():
206
+ subtitles.append(sub_text)
207
+ subtitles = '\n'.join(subtitles)
208
+ else:
209
+ subtitles = ''
210
+
211
+ message = [dict(type='text', value=self.SYS)]
212
+ if video_llm:
213
+ message.append(dict(type='video', value=osp.join(self.data_root, 'video', line['video'] + '.mp4')))
214
+ else:
215
+ for im in frames:
216
+ message.append(dict(type='image', value=im))
217
+
218
+ text_prompt = self.FRAMES_TMPL_NOSUB if not self.use_subtitle else self.FRAMES_TMPL_SUB.format(subtitles)
219
+ message.append(dict(type='text', value=text_prompt))
220
+ line['question'] += '\n' + '\n'.join(eval(line['candidates']))
221
+ prompt = 'Question: {}\nAnswer: '.format(line['question'])
222
+ message.append(dict(type='text', value=prompt))
223
+ return message
224
+
225
+ # It returns a dictionary
226
+ @classmethod
227
+ def evaluate(self, eval_file, **judge_kwargs):
228
+ from .utils.videomme import get_dimension_rating, extract_characters_regex, extract_option
229
+
230
+ assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
231
+
232
+ tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
233
+ tgt_file = eval_file.replace('.xlsx', '_rating.json')
234
+ score_file = eval_file.replace('.xlsx', '_score.xlsx')
235
+
236
+ if not osp.exists(score_file):
237
+ model = judge_kwargs.get('model', 'exact_matching')
238
+ assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
239
+
240
+ if model == 'exact_matching':
241
+ model = None
242
+ elif gpt_key_set():
243
+ model = build_judge(**judge_kwargs)
244
+ if not model.working():
245
+ warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
246
+ warnings.warn(DEBUG_MESSAGE)
247
+ model = None
248
+ else:
249
+ warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
250
+ model = None
251
+ res = {} if not osp.exists(tmp_file) else load(tmp_file)
252
+ res = {k: v for k, v in res.items() if FAIL_MSG not in v}
253
+
254
+ data = load(eval_file)
255
+ data_un = data[~pd.isna(data['prediction'])]
256
+
257
+ for idx in data['index']:
258
+ ans = data.loc[data['index'] == idx, 'answer'].values[0]
259
+ pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])
260
+
261
+ if extract_characters_regex(pred) == '':
262
+ extract_pred = extract_option(
263
+ model,
264
+ data.loc[data['index'] == idx].to_dict(orient='records')[0],
265
+ 'Video-MME'
266
+ )
267
+ data.loc[idx, 'score'] = int(extract_pred == ans)
268
+ else:
269
+ data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)
270
+
271
+ rejected = [x for x in data['score'] if x == -1]
272
+
273
+ print(
274
+ f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
275
+ f'failed to obtain the score for another {len(rejected)} questions. '
276
+ f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
277
+ )
278
+
279
+ dump(data, score_file)
280
+
281
+ rating = get_dimension_rating(score_file)
282
+ dump(rating, tgt_file)
283
+ return rating
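For readers skimming the diff: the two branches in `save_video_frames` reduce to simple index arithmetic. Below is a minimal standalone sketch (the helper name and the toy frame counts are illustrative only, not part of the commit) that reproduces both sampling modes:

# Illustrative sketch of the two sampling modes in save_video_frames; not part of the commit.
def sample_indices(n_frames, native_fps, nframe=-1, target_fps=-1.0):
    if nframe > 0 and target_fps < 0:
        # uniform mode: nframe interior points, both endpoints excluded
        step = n_frames / (nframe + 1)
        return [int(i * step) for i in range(1, nframe + 1)]
    elif target_fps > 0:
        # fixed-rate mode: one frame every native_fps / target_fps source frames
        duration = n_frames / native_fps
        step = native_fps / target_fps
        return [int(i * step) for i in range(int(duration * target_fps))]
    raise ValueError('set either nframe or target_fps')

print(sample_indices(300, 30.0, nframe=4))        # [60, 120, 180, 240]
print(sample_indices(300, 30.0, target_fps=1.0))  # [0, 30, 60, ..., 270]

Note the asymmetry: uniform mode never samples frame 0 or the final frame, while fps mode always starts at frame 0.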
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py ADDED
@@ -0,0 +1,174 @@
+ from ast import literal_eval
+
+ from .image_base import ImageBaseDataset
+ from .utils import build_judge, DEBUG_MESSAGE
+ from ..smp import *
+ from ..utils import track_progress_rich
+
+
+ LLM_PARSE_ANSWER_PROMPT = '''
+ You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
+ Return the Answer X ONLY. e.g., Answer 1 or Answer 2.
+
+ Judgement: {judgement}
+ '''
+
+
+ PROMPT_TEMPLATE = '''\
+ You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions.
+ Please analyze the following image and question, then determine which of the two provided answers is better.
+
+ Question: {query}
+
+ Answer 1: {answer_0}
+
+ Answer 2: {answer_1}
+
+ Please evaluate both answers based on the following criteria:
+ 1. Accuracy: How well does the answer align with the visual information in the image?
+ 2. Completeness: Does the answer fully address all aspects of the question?
+ 3. Clarity: Is the answer easy to understand and well-articulated?
+ 4. Relevance: Does the answer directly relate to the question and the image?
+
+ After your evaluation, please:
+ 1. Explain your reasoning for each criterion.
+ 2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2). \
+ For example: Overall Judgment: Answer X is better.
+
+ Your response should be structured and detailed, \
+ demonstrating your understanding of both the visual and textual elements of the task.'''
+
+
+ def get_score(line, parsed_response, random_number):
+     gt_ans = line['human_ranking'].index(0 if random_number == 0 else 1) + 1
+     if 'Answer 1'.lower() in parsed_response.lower():
+         pred = 1
+     elif 'Answer 2'.lower() in parsed_response.lower():
+         pred = 2
+     else:  # failed to parse a verdict
+         pred = 'None'  # random.choice([1, 2])
+
+     if pred == gt_ans:
+         return 1.0
+     else:
+         return 0.0
+
+
+ def VLRewardBench_eval_answer(model, line):
+     response = toliststr(line['response'])
+     random_number = sum(len(res) for res in response) % 2
+
+     prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line['prediction'])
+     messages = [dict(type='text', value=prompt)]
+
+     resp = model.generate(messages)
+     score = get_score(line, resp, random_number)
+
+     if score is None:
+         return 'Unknown'
+     return score
+
+
+ class VLRewardBench(ImageBaseDataset):
+     TYPE = 'VQA'
+     DATASET_URL = {
+         'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv'
+     }
+     DATASET_MD5 = {'VL-RewardBench': '1d2676f4ab4a5f755019ec0af2b28189'}
+
+     # Given one data record, return the built prompt (a multi-modal message), can override
+     def build_prompt(self, line):
+         if isinstance(line, int):
+             line = self.data.iloc[line]
+         tgt_path = self.dump_image(line)  # save image to local
+         question = line['question']
+         msgs = []
+         if isinstance(tgt_path, list):
+             msgs.extend([dict(type='image', value=p) for p in tgt_path])
+         else:
+             msgs = [dict(type='image', value=tgt_path)]
+
+         response = toliststr(line['response'])
+         random_number = sum(len(res) for res in response) % 2
+         if random_number == 1:
+             # swap the response order; the length parity acts as a fixed pseudo-random bit per sample
+             response = response[::-1]
+         query_prompt = PROMPT_TEMPLATE.format(
+             query=question, answer_0=response[0], answer_1=response[1]
+         )
+         msgs = msgs + [dict(type='text', value=query_prompt)]
+         return msgs
+
+     # It returns a DataFrame
+     @classmethod
+     def evaluate(self, eval_file, **judge_kwargs):
+         suffix = eval_file.split('.')[-1]
+         model = judge_kwargs['model']
+         storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+         score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
+         tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+         nproc = judge_kwargs.pop('nproc', 4)
+
+         if not osp.exists(storage):
+             raw_data = VLRewardBench('VL-RewardBench').data
+             data = load(eval_file)
+             data['prediction'] = [str(x) for x in data['prediction']]
+             data['human_ranking'] = [literal_eval(x) for x in raw_data['answer']]
+
+             judge_kwargs['temperature'] = 0
+             judge_kwargs['timeout'] = 60
+             model = build_judge(max_tokens=128, **judge_kwargs)
+
+             assert model.working(), (
+                 'VLRewardBench evaluation requires a working OPENAI API\n'
+                 + DEBUG_MESSAGE
+             )
+
+             lt = len(data)
+             lines = [data.iloc[i] for i in range(lt)]
+             tups = [(model, line) for line in lines]
+             indices = [line['index'] for line in lines]
+
+             ans = load(tmp_file) if osp.exists(tmp_file) else {}
+             tups = [x for x, i in zip(tups, indices) if i not in ans]
+             indices = [i for i in indices if i not in ans]
+
+             if len(indices):
+                 new_results = track_progress_rich(
+                     VLRewardBench_eval_answer,
+                     tups,
+                     nproc=nproc,
+                     chunksize=nproc,
+                     keys=indices,
+                     save=tmp_file,
+                 )
+                 ans = load(tmp_file)
+                 for k, v in zip(indices, new_results):
+                     ans[k] = v
+
+             data['score'] = [ans[idx] for idx in data['index']]
+             # data.pop('image')
+             dump(data, storage)
+
+         data = load(storage)
+         lt = len(data)
+
+         category_scores = defaultdict(lambda: 0)
+         category_cnt = defaultdict(lambda: 0)
+         scores = defaultdict(lambda: 0)
+         for i in range(lt):
+             item = data.iloc[i]
+             category_scores[item['category']] += item['score']
+             category_cnt[item['category']] += 1
+         # calculate the average score for each category
+         for k, v in category_scores.items():
+             scores[k] = v / category_cnt[k]
+         # calculate category macro accuracy (unweighted average across categories)
+         scores['Macro Accuracy'] = sum(scores.values()) / len(scores)
+         # calculate the overall average score across all samples
+         scores['Overall Consistency'] = sum(category_scores.values()) / lt
+
+         scores = {k: [v] for k, v in scores.items()}
+         scores = pd.DataFrame(scores)
+         dump(scores, score_file)
+         return scores
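One subtlety worth calling out in the file above: the answer-order swap in `build_prompt` is deterministic, not random. The parity of the total response length decides whether the pair is shown in original or reversed order, and `get_score` recomputes the same parity to recover which displayed slot holds the human-preferred answer. A toy walkthrough (the response strings and ranking below are invented for illustration, not real dataset rows):

# Invented example values; mirrors the parity logic in build_prompt / get_score above.
responses = ['A short answer.', 'A much longer, more detailed answer.']
human_ranking = [1, 0]  # per-response rank; 0 marks the human-preferred response

swap = sum(len(r) for r in responses) % 2   # fixed per sample, so prompt and scoring agree
shown = responses[::-1] if swap == 1 else responses

# displayed slot (1 or 2) that holds the preferred response
gt_slot = human_ranking.index(0 if swap == 0 else 1) + 1
assert shown[gt_slot - 1] == responses[human_ranking.index(0)]
print(gt_slot, shown[gt_slot - 1])  # the slot holding the preferred answer, whatever the parity

Because the swap bit is a pure function of the sample, the evaluation needs no stored random seed: the judge prompt and the scorer always agree on which response sits in which slot.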
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/wildvision.py ADDED
@@ -0,0 +1,222 @@
+ import re
+ from functools import partial
+
+ from .image_base import ImageBaseDataset
+ from .utils import build_judge, DEBUG_MESSAGE
+ from ..smp import *
+ from ..utils import track_progress_rich
+
+
+ SYSTEM_PROMPT = """\
+ Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user \
+ prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate \
+ which assistant's answer is better.
+
+ Begin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any \
+ answers.
+
+ When evaluating the assistants' answers, compare both assistants' answers with your answer. \
+ You must identify and correct any mistakes or inaccurate information.
+
+ Then consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly \
+ responds to the prompt or follows the instructions. Note that when the user prompt has any ambiguity or more than one \
+ interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than \
+ providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate \
+ to what is being asked. Concise means the response is clear and not verbose or excessive.
+
+ Then consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing \
+ important information in the assistants' answers that would be beneficial to include when responding to the user \
+ prompt.
+
+ After providing your explanation, you must output only one of the following choices as your final verdict with a label:
+
+ 1. Assistant A is significantly better: [[A>>B]]
+ 2. Assistant A is slightly better: [[A>B]]
+ 3. Tie, relatively the same: [[A=B]]
+ 4. Assistant B is slightly better: [[B>A]]
+ 5. Assistant B is significantly better: [[B>>A]]
+
+ Example output: "My final verdict is tie: [[A=B]]".\
+ """
+
+
+ PROMPT_TEMPLATE = """\
+ <|User Prompt|>\n{question}
+
+ <|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>
+
+ <|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>
+ """
+
+
+ REGEX_PATTERN = re.compile(r"\[\[([AB<>=]+)\]\]")
+
+
+ def get_score(judgement, pattern=REGEX_PATTERN):
+     matches = pattern.findall(judgement)
+     matches = [m for m in matches if m != ""]
+     if len(set(matches)) == 0:
+         # no verdict label found: signal the caller to retry
+         return None, True
+     elif len(set(matches)) == 1:
+         return matches[0].strip("\n"), False
+     else:
+         # conflicting verdict labels: also retry
+         return None, True
+
+
+ def WildVision_auxeval(model, line):
+     config = dict(question=line['question'], answer_1=line['A'], answer_2=line['B'])
+     prompt = PROMPT_TEMPLATE.format(**config)
+
+     prefix = 'data:image/jpeg;base64,'
+     img = prefix + line['image']
+
+     messages = [
+         dict(type='text', value=prompt),
+         dict(type='image', value=img)
+     ]
+
+     retry = 2
+     while retry:
+         resp = model.generate(messages)
+         score, try_again = get_score(resp)
+         if not try_again:
+             break
+         retry -= 1
+
+     if score is None:
+         return 'Unknown'
+     return score
+
+
+ class WildVision(ImageBaseDataset):
+     TYPE = 'VQA'
+     DATASET_URL = {
+         'WildVision': 'https://opencompass.openxlab.space/utils/VLMEval/WildVision.tsv'
+     }
+     DATASET_MD5 = {'WildVision': 'b38f80156d49411c594772866b0d0b52'}
+
+     score_map = {
+         'A>>B': -2,
+         'A>B': -1,
+         'A=B': 0,
+         'B>A': 1,
+         'B>>A': 2
+     }
+
+     # Given one data record, return the built prompt (a multi-modal message), can override
+     def build_prompt(self, line):
+         if isinstance(line, int):
+             line = self.data.iloc[line]
+
+         if self.meta_only:
+             tgt_path = toliststr(line['image_path'])
+         else:
+             tgt_path = self.dump_image(line)
+
+         question = line['question']
+
+         msgs = []
+         if isinstance(tgt_path, list):
+             msgs.extend([dict(type='image', value=p) for p in tgt_path])
+         else:
+             msgs = [dict(type='image', value=tgt_path)]
+         # WildVision adopts text first
+         msgs = [dict(type='text', value=question)] + msgs
+         return msgs
+
+     @classmethod
+     def gen_eval_base(self, eval_file, b64_map):
+         data = load(eval_file)
+         data['B'] = data.pop('prediction')
+         data['A'] = data.pop('claude3_sonnet')
+         data['image'] = [b64_map[x] for x in data['index']]
+         return data
+         # rev = cp.deepcopy(data)
+         # rev['A'] = data['B']
+         # rev['B'] = data['A']
+         # rev['index'] = [x + '_rev' for x in data['index']]
+         # return pd.concat([data, rev], ignore_index=True)
+
+     # It returns a DataFrame
+     @classmethod
+     def evaluate(self, eval_file, **judge_kwargs):
+         # Pairwise evaluation against a fixed reference; the reversed-order second pass
+         # (the commented-out code in gen_eval_base) is currently disabled
+         suffix = eval_file.split('.')[-1]
+         model = judge_kwargs['model']
+         storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+         score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
+         tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+         nproc = judge_kwargs.pop('nproc', 4)
+
+         if not osp.exists(storage):
+             raw_data = WildVision('WildVision').data
+             b64_map = {x: y for x, y in zip(raw_data['index'], raw_data['image'])}
+             data = self.gen_eval_base(eval_file, b64_map)
+
+             judge_kwargs['system_prompt'] = SYSTEM_PROMPT
+             judge_kwargs['temperature'] = 0
+             judge_kwargs['img_detail'] = 'high'
+             judge_kwargs['timeout'] = 300
+             model = build_judge(max_tokens=4096, **judge_kwargs)
+
+             assert model.working(), ('WildVision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
+
+             lt = len(data)
+             lines = [data.iloc[i] for i in range(lt)]
+             tups = [(model, line) for line in lines]
+             indices = [line['index'] for line in lines]
+
+             ans = load(tmp_file) if osp.exists(tmp_file) else {}
+             tups = [x for x, i in zip(tups, indices) if i not in ans]
+             indices = [i for i in indices if i not in ans]
+
+             if len(indices):
+                 new_results = track_progress_rich(
+                     WildVision_auxeval,
+                     tups,
+                     nproc=nproc,
+                     chunksize=nproc,
+                     keys=indices,
+                     save=tmp_file,
+                 )
+                 ans = load(tmp_file)
+                 for k, v in zip(indices, new_results):
+                     ans[k] = v
+
+             data['score'] = [ans[idx] for idx in data['index']]
+             data.pop('image')
+             dump(data, storage)
+
+         data = load(storage)
+         lt = len(data)
+
+         scores = defaultdict(lambda: 0)
+         for i in range(lt):
+             item = data.iloc[i]
+             if item['score'] not in self.score_map:
+                 score = 0
+             else:
+                 score = self.score_map[item['score']]
+             if '_rev' in item['index']:
+                 score = -score
+             scores[score] += 1
+         name_map = {
+             2: 'Much Better',
+             1: 'Better',
+             0: 'Tie',
+             -1: 'Worse',
+             -2: 'Much Worse'
+         }
+         scores = {name_map[k]: v for k, v in scores.items()}
+         much_better = scores.get('Much Better', 0)
+         better = scores.get('Better', 0)
+         worse = scores.get('Worse', 0)
+         much_worse = scores.get('Much Worse', 0)
+         scores['Reward'] = (
+             100 * much_better + 50 * better - 50 * worse - 100 * much_worse
+         ) / lt
+         scores['Win Rate'] = (better + much_better) / lt
+         scores = {k: [v] for k, v in scores.items()}
+         scores = pd.DataFrame(scores)
+         dump(scores, score_file)
+         return scores
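The aggregation at the end of `evaluate` maps each verdict through `score_map`, where positive values favor slot B, i.e. the evaluated model, since `gen_eval_base` places the model's prediction in slot B and the claude3_sonnet reference in slot A. The counts then collapse into a reward in [-100, 100] and a win rate. A toy computation (the verdict counts below are invented for illustration):

# Invented verdict counts; mirrors the Reward / Win Rate arithmetic in evaluate above.
counts = {'Much Better': 20, 'Better': 30, 'Tie': 25, 'Worse': 15, 'Much Worse': 10}
n = sum(counts.values())  # 100 judged pairs

reward = (100 * counts['Much Better'] + 50 * counts['Better']
          - 50 * counts['Worse'] - 100 * counts['Much Worse']) / n
win_rate = (counts['Much Better'] + counts['Better']) / n
print(reward, win_rate)  # 17.5 0.5

Ties and unparseable verdicts contribute zero to the reward but still count in the denominator, so a model that mostly ties the reference lands near a reward of 0 rather than near the extremes.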