Add files using upload-large-folder tool
Browse files- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py +240 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py +172 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py +75 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py +197 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py +904 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mt.py +128 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py +1475 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py +95 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py +328 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py +167 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py +455 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py +256 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py +69 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py +584 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py +446 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py +666 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/slidevqa.py +189 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py +639 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_base.py +88 -0
- r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_mcq.py +123 -0
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
import sympy as sp
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from sympy import simplify, Eq, sympify, Pow, pi
|
| 7 |
+
from sympy.parsing.latex import parse_latex
|
| 8 |
+
import sys
|
| 9 |
+
import math
|
| 10 |
+
import os
|
| 11 |
+
import os.path as osp
|
| 12 |
+
import argparse
|
| 13 |
+
|
| 14 |
+
from .image_base import ImageBaseDataset
|
| 15 |
+
from .utils import build_judge
|
| 16 |
+
from ..utils import track_progress_rich
|
| 17 |
+
from ..smp import load, dump, d2df, toliststr
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def preprocess(str1):
    """Trim *str1* down to its outermost {...} span (if any) and strip backslashes."""
    start, end = str1.find("{"), str1.rfind("}")
    if 0 <= start < end:
        str1 = str1[start:end + 1]
    cleaned = str1.replace("\\", "")
    # NOTE(review): every backslash was removed on the line above, so no
    # literal "\n" escape sequence can remain and this replace is a no-op.
    cleaned = cleaned.replace("\\n", "\n")
    return cleaned
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def transfer(str1):
    """Parse *str1* as a float; a trailing 'π' multiplies the leading number by pi."""
    if "\u03c0" not in str1:
        return float(str1)
    # Keep only the coefficient before the first 'π', e.g. "1.5π" -> 1.5 * pi.
    coeff = str1.split("\u03c0")[0]
    return float(coeff) * np.pi
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def parse_answer(answer, answer_type="multiple choice"):
    """Normalize a raw short answer into a comparable value.

    Parameters
    ----------
    answer : str
        Raw short-answer text extracted from a model response.
    answer_type : str
        'float', 'multiple choice', or anything else (treated as free-form).

    Returns
    -------
    tuple
        ``(success, value)`` — ``value`` is ``None`` when parsing fails.
    """
    if answer_type == "float":
        if answer.isdigit():
            return True, float(answer)
        # Keep only the first whitespace-separated token (drops trailing units).
        parts = answer.split(' ')
        answer = parts[0]
        try:
            answer = transfer(answer)
            return True, answer
        except (ValueError, TypeError):
            # BUGFIX: was a bare `except:` that also swallowed KeyboardInterrupt /
            # SystemExit; transfer() only raises ValueError/TypeError on bad input.
            return False, None
    elif answer_type == "multiple choice":
        if len(answer) == 1:
            return True, answer.upper()
        # Accept a longer answer only if exactly one option letter occurs in it.
        in_flag = [ch in answer.upper() for ch in 'ABCDE']
        if sum(in_flag) == 1:
            for ch in 'ABCDE':
                if ch in answer.upper():
                    return True, ch
        return False, None
    else:
        # Free-form answers are accepted verbatim.
        return True, answer
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def DynaMath_auxeval(model, line):
    """Extract and grade the short answer for one DynaMath prediction.

    First tries to parse the prediction as JSON with a 'short answer' key;
    if that fails, falls back to direct parsing of the raw text and, as a
    last resort, asks the judge *model* to reformat the answer.

    Parameters
    ----------
    model : object
        Judge model exposing ``generate(prompt) -> str``; only invoked on
        the fallback path when direct parsing fails.
    line : mapping
        Record with 'prediction', 'answer', and 'answer_type' fields.

    Returns
    -------
    dict
        ``dict(parse=bool, extracted=value_or_None, correct=bool)``.
    """
    pred = line['prediction']
    pred = preprocess(pred)

    succeed, short_answer = None, None
    try:
        dj = json.loads(pred, strict=False)
        short_answer = dj.get("short answer")
        assert short_answer is not None
        # BUGFIX: was line['anwser_type'] (typo) — raised KeyError and sent
        # every record down the fallback path even when the JSON was valid.
        succeed, short_answer = parse_answer(short_answer, answer_type=line['answer_type'])
        assert succeed
    except Exception:
        # Failed to parse the JSON, use an auxiliary LLM to get the short answer
        if line['answer_type'] == 'multiple choice':
            inst = "Output the corresponing choice option, such as 'A', 'B', 'C', 'D', in a single line."
        elif line['answer_type'] == 'float':
            inst = "Output a three-digit floating-point number in a single line."
        else:
            inst = (
                "Output a short answer in a single line. Any float numbers in the answer "
                "should be formatted as three-digit floating-point numbers."
            )

        prompt = f"Free-form answer: {pred}\nInstruction: {inst}"
        # Try parsing the raw prediction directly first; only call the judge
        # model when that fails.
        response = pred
        succeed, short_answer = parse_answer(response, line['answer_type'])
        if not succeed:
            response = model.generate(prompt)
            succeed, short_answer = parse_answer(response, line['answer_type'])

    if line['answer_type'] == 'float':
        if succeed:
            # Graded with a fixed absolute tolerance of 1e-3.
            diff = float(short_answer) - float(line['answer'])
            if abs(diff) <= 0.001:
                return dict(parse=True, extracted=short_answer, correct=True)
            else:
                return dict(parse=True, extracted=short_answer, correct=False)
        else:
            return dict(parse=False, extracted=None, correct=False)
    elif line['answer_type'] == 'multiple choice':
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer']))
        else:
            # Last-chance heuristic: the option letter may lead the raw text.
            if line['answer'] in pred[:3].upper():
                return dict(parse=False, extracted=None, correct=True)
            else:
                return dict(parse=False, extracted=None, correct=False)
    else:
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower()))
        else:
            # BUGFIX: original computed short_answer.lower() here, which raised
            # AttributeError because short_answer is None when parsing fails.
            return dict(parse=False, extracted=None, correct=False)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class Dynamath(ImageBaseDataset):
    """DynaMath benchmark: math VQA graded from JSON-formatted model answers."""

    TYPE = 'VQA'
    DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'}
    DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'}
    # Prompt suffix appended to every question; {INST} is filled per answer type.
    GUIDE = """
## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \
to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \
detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST}

Example of expected JSON response format:

"""
    # Example JSON object shown to the model, serialized below.
    EXAMPLE = {
        "solution": "[Detailed step-by-step explanation]",
        "short answer": "[Concise Answer]"
    }
    TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4)

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Build the multi-modal message (images + instruction text) for one record."""
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        prompt = f"## Question\n {line['question']}"
        # Pick the per-answer-type formatting instruction injected into GUIDE.
        if line['answer_type'] == 'multiple choice':
            inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'."
        elif line['answer_type'] == 'float':
            inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key."
        else:
            inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers."

        prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions in *eval_file* and return average / worst-case accuracy.

        Results are cached: per-record judgments go to a .pkl, the judged
        table to an .xlsx, and the score table to a .csv next to *eval_file*.
        """
        judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 6)  # noqa: F841

        # Resume from previously-saved per-record judgments, dropping failures.
        res = load(tmp_file) if os.path.exists(tmp_file) else {}
        res = {k: v for k, v in res.items() if v is not None}

        model.system_prompt = """\
You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction.
"""
        if not osp.exists(storage):
            data = load(eval_file)
            lt = len(data)
            # Only judge records that are not already in the cache.
            payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res]
            keys = [idx for idx in data['index'] if idx not in res]

            if len(keys):
                results = track_progress_rich(DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys)
                for k, r in zip(keys, results):
                    res[k] = r

            data['parse'] = [res[idx]['parse'] for idx in data['index']]
            data['extracted'] = [res[idx]['extracted'] for idx in data['index']]
            data['correct'] = [res[idx]['correct'] for idx in data['index']]
            dump(data, storage)

        data = load(storage)
        # Calculate Average Accuracy
        score_avg = {}
        score_avg['Overall'] = np.mean(data['correct'])

        subs = set(data['subject'])
        for sub in subs:
            data_sub = data[data['subject'] == sub]
            score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data['knowledge_level'])
        for lvl in lvls:
            data_lvl = data[data['knowledge_level'] == lvl]
            score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        # Calculate the Worst Case Accuracy
        # A question counts as correct only if ALL of its variants are correct.
        score_worst = {}
        data_worst = data[data['varid'] == 1]
        # NOTE(review): qid2corr is initialized with data_worst['index'] values
        # but read and written below with 'qid' values — this assumes index of
        # the varid==1 rows equals their qid; confirm against the TSV schema.
        qid2corr = {idx: True for idx in data_worst['index']}
        lt = len(data)
        for i in range(lt):
            item = data.iloc[i]
            qid2corr[item['qid']] *= item['correct']
        data_worst['correct'] = [qid2corr[idx] for idx in data_worst['qid']]
        score_worst['Overall'] = np.mean(data_worst['correct'])

        subs = set(data_worst['subject'])
        for sub in subs:
            data_sub = data_worst[data_worst['subject'] == sub]
            score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data_worst['knowledge_level'])
        for lvl in lvls:
            data_lvl = data_worst[data_worst['knowledge_level'] == lvl]
            score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        d1 = {'Setting': 'Average'}
        d1.update(score_avg)
        d2 = {'Setting': 'Worst Case'}
        d2.update(score_worst)
        score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True)

        dump(score, score_file)
        return score
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from abc import abstractmethod
|
| 3 |
+
from ..smp import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def img_root_map(dataset):
    """Return the name of the image sub-directory used for *dataset*."""
    # Substring-based mappings, checked in the original priority order.
    for needle, root in (('MM_NIAH', 'MMNIAH'), ('CRPE', 'CRPE'), ('OCRVQA', 'OCRVQA')):
        if needle in dataset:
            return root
    if dataset == 'COCO_VAL':
        return 'COCO'
    if 'MMMU' in dataset:
        return 'MMMU'
    if 'QSpatial' in dataset:
        return 'QSpatial'

    # All MMBench variants share a couple of image folders.
    mmbench_root_map = {
        'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
        'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
        'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
        'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
        'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
        'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
    }
    # Unknown datasets map to a folder of the same name.
    return mmbench_root_map.get(dataset, dataset)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ImageBaseDataset:
    """Base class for image evaluation datasets backed by a downloadable TSV.

    Subclasses declare DATASET_URL / DATASET_MD5 and typically override
    build_prompt, post_build, and evaluate.
    """

    MODALITY = 'IMAGE'
    DATASET_URL = {}
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', skip_noimg=True):
        """Load the dataset TSV and normalize its image / index columns.

        Parameters:
            dataset: dataset name, a key of DATASET_URL.
            skip_noimg: drop rows whose 'image' cell is NaN.
        """
        ROOT = LMUDataRoot()
        # You can override this variable to save image files to a different directory
        self.dataset_name = dataset
        self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))

        data = self.load_data(dataset)
        self.skip_noimg = skip_noimg
        if skip_noimg and 'image' in data:
            data = data[~pd.isna(data['image'])]

        # Indices are stringified so they can be used as dict keys uniformly.
        data['index'] = [str(x) for x in data['index']]

        # meta_only stays True when the TSV carries no inline image payloads.
        self.meta_only = True

        # The image field can store the base64 encoded image or another question index (for saving space)
        if 'image' in data:
            data['image'] = [str(x) for x in data['image']]
            image_map = {x: y for x, y in zip(data['index'], data['image'])}
            for k in image_map:
                if len(image_map[k]) <= 64:
                    # Short values are treated as a reference to another row's
                    # index; resolve the reference to the actual payload.
                    idx = image_map[k]
                    assert idx in image_map and len(image_map[idx]) > 64
                    image_map[k] = image_map[idx]

            images = [toliststr(image_map[k]) for k in data['index']]
            # Single-image rows store a plain string; multi-image rows a list.
            data['image'] = [x[0] if len(x) == 1 else x for x in images]
            self.meta_only = False

        if 'image_path' in data:
            paths = [toliststr(x) for x in data['image_path']]
            data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]

        # Restore integer indices when every index is numeric.
        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record *idx* as a plain dict."""
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        """Download (if needed) and load the dataset TSV from *url*.

        Re-downloads when the cached file is missing or its md5 mismatches.
        TSVs larger than 1 GB are localized (images extracted to disk) first.
        """
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            # Large TSVs are converted once into a '_local' variant.
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def dump_image(self, line):
        """Materialize the record's image(s) under img_root; return path list.

        Base64 payloads are decoded to files on first use; existing readable
        files are reused.
        """
        os.makedirs(self.img_root, exist_ok=True)

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        return tgt_path

    def display(self, line):
        """Render one record (by index or as a row) for interactive inspection."""
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, pd.Series) or isinstance(line, dict)
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        url = self.DATASET_URL[dataset]
        file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
        return self.prepare_tsv(url, file_md5)

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        return msgs

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .image_base import ImageBaseDataset
|
| 2 |
+
from ..smp import *
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class COCO_Caption_Scorer():
    """Compute BLEU-1..4, ROUGE-L, and CIDEr for captioning predictions.

    *ref* maps item ids to a list with the predicted caption; *gt* maps the
    same ids to the list of ground-truth captions.
    """

    def __init__(self, ref, gt):
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.rouge.rouge import Rouge

        self.ref = ref
        self.gt = gt
        print('setting up scorers...')
        # Each entry pairs a scorer with its metric label(s).
        self.scorers = [
            (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
            (Rouge(), 'ROUGE_L'),
            (Cider(), 'CIDEr'),
        ]

    def compute_scores(self):
        """Run every scorer and return a dict of percentage-scaled metrics."""
        aggregated = {}
        for engine, label in self.scorers:
            print('computing %s score...' % (engine.method()))
            corpus_score, per_item = engine.compute_score(self.gt, self.ref)
            if isinstance(label, list):
                # BLEU yields one corpus-level score per n-gram order.
                for single, _, name in zip(corpus_score, per_item, label):
                    print('%s: %0.3f' % (name, single * 100))
                aggregated['Bleu'] = [value * 100 for value in corpus_score]
            else:
                print('%s: %0.3f' % (label, corpus_score * 100))
                aggregated[label] = corpus_score * 100

        print('*****DONE*****')
        for metric, value in aggregated.items():
            print('{}:{}'.format(metric, value))
        return aggregated
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class ImageCaptionDataset(ImageBaseDataset):
    """Image-captioning dataset (COCO val) scored with BLEU / ROUGE-L / CIDEr."""

    TYPE = 'Caption'

    DATASET_URL = {
        'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
    }

    DATASET_MD5 = {
        'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
    }

    def load_data(self, dataset):
        """Load the TSV and inject a default captioning prompt when absent."""
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                'Please describe this image in general. Directly provide the description, '
                'do not include prefix like "This image depicts". '
            )] * len(data)
        return data

    # It returns a dictionary of scores
    @classmethod
    def evaluate(cls, eval_file, **kwargs):
        """Score predictions in *eval_file* and dump the metrics beside it.

        Parameters:
            eval_file: xlsx prediction file with 'prediction' and 'answer' columns.

        Returns:
            dict of metric name -> percentage-scaled score.
        """
        # BUGFIX: first parameter was named `self` on a @classmethod, which is
        # misleading — it receives the class. Renamed to `cls`; callers are
        # unaffected since the first argument is bound implicitly.
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        ref, gt = {}, {}
        for i, line in enumerate(lines):
            ref[str(i)] = [str(line['prediction'])]
            # SECURITY: eval() on dataset-provided text — the 'answer' column is
            # expected to hold a Python list literal of ground-truth captions.
            # ast.literal_eval would be safer if the data format allows it.
            gt[str(i)] = eval(line['answer'])

        scorer = COCO_Caption_Scorer(ref, gt)
        coco_caption_score_dict = scorer.compute_scores()
        # Assumes the prediction file is an .xlsx.
        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(coco_caption_score_dict, score_pth)
        return coco_caption_score_dict
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# flake8: noqa
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import tempfile
|
| 6 |
+
from functools import partial
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
from .image_base import ImageBaseDataset
|
| 10 |
+
from ..smp import *
|
| 11 |
+
|
| 12 |
+
# should be the same as FAIL_MSG definded in vlmeval/inference.py
|
| 13 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CCOCRDataset(ImageBaseDataset):
    """CC-OCR benchmark dataset (doc parsing / KIE / multi-language OCR / multi-scene OCR).

    Each sub-task is a separate TSV; the ModelScope and HuggingFace mirrors host the
    same files under identical relative paths, so both URL tables are derived from a
    single relative-path table to avoid copy-paste drift.
    """

    TYPE = 'VQA'

    # Relative TSV path of every sub-task, shared by both mirrors.
    _REL_PATHS = {
        "CCOCR_DocParsing_DocPhotoChn": "doc_parsing/doc/doc_photo_chn_75.tsv",
        "CCOCR_DocParsing_DocPhotoEng": "doc_parsing/doc/doc_photo_eng_75.tsv",
        "CCOCR_DocParsing_DocScanChn": "doc_parsing/doc/doc_scan_chn_75.tsv",
        "CCOCR_DocParsing_DocScanEng": "doc_parsing/doc/doc_scan_eng_75.tsv",
        "CCOCR_DocParsing_TablePhotoChn": "doc_parsing/table/table_photo_chn_75.tsv",
        "CCOCR_DocParsing_TablePhotoEng": "doc_parsing/table/table_photo_eng_75.tsv",
        "CCOCR_DocParsing_TableScanChn": "doc_parsing/table/table_scan_chn_75.tsv",
        "CCOCR_DocParsing_TableScanEng": "doc_parsing/table/table_scan_eng_75.tsv",
        "CCOCR_DocParsing_MolecularHandwriting": "doc_parsing/molecular/molecular_handwriting_100.tsv",
        "CCOCR_DocParsing_FormulaHandwriting": "doc_parsing/formula/formula_handwriting_100.tsv",
        "CCOCR_Kie_Sroie2019Word": "kie/constrained_category/sroie2019_word_347.tsv",
        "CCOCR_Kie_Cord": "kie/constrained_category/CORD_100.tsv",
        "CCOCR_Kie_EphoieScut": "kie/constrained_category/EPHOIE_SCUT_311.tsv",
        "CCOCR_Kie_Poie": "kie/constrained_category/POIE_250.tsv",
        "CCOCR_Kie_ColdSibr": "kie/open_category/COLD_SIBR_400.tsv",
        "CCOCR_Kie_ColdCell": "kie/open_category/COLD_CELL_600.tsv",
        "CCOCR_MultiLanOcr_Arabic": "multi_lan_ocr/Arabic/Arabic_150.tsv",
        "CCOCR_MultiLanOcr_French": "multi_lan_ocr/French/French_150.tsv",
        "CCOCR_MultiLanOcr_German": "multi_lan_ocr/German/German_150.tsv",
        "CCOCR_MultiLanOcr_Italian": "multi_lan_ocr/Italian/Italian_150.tsv",
        "CCOCR_MultiLanOcr_Japanese": "multi_lan_ocr/Japanese/Japanese_150.tsv",
        "CCOCR_MultiLanOcr_Korean": "multi_lan_ocr/Korean/Korean_150.tsv",
        "CCOCR_MultiLanOcr_Portuguese": "multi_lan_ocr/Portuguese/Portuguese_150.tsv",
        "CCOCR_MultiLanOcr_Russian": "multi_lan_ocr/Russian/Russian_150.tsv",
        "CCOCR_MultiLanOcr_Spanish": "multi_lan_ocr/Spanish/Spanish_150.tsv",
        "CCOCR_MultiLanOcr_Vietnamese": "multi_lan_ocr/Vietnamese/Vietnamese_150.tsv",
        "CCOCR_MultiSceneOcr_Cord": "multi_scene_ocr/document_text/CORD_100.tsv",
        "CCOCR_MultiSceneOcr_Funsd": "multi_scene_ocr/document_text/FUNSD_50.tsv",
        "CCOCR_MultiSceneOcr_Iam": "multi_scene_ocr/document_text/IAM_50.tsv",
        "CCOCR_MultiSceneOcr_ZhDoc": "multi_scene_ocr/document_text/zh_doc_100.tsv",
        "CCOCR_MultiSceneOcr_ZhHandwriting": "multi_scene_ocr/document_text/zh_handwriting_50.tsv",
        "CCOCR_MultiSceneOcr_Hieragent": "multi_scene_ocr/scene_text/Hieragent_100.tsv",
        "CCOCR_MultiSceneOcr_Ic15": "multi_scene_ocr/scene_text/IC15_500.tsv",
        "CCOCR_MultiSceneOcr_Inversetext": "multi_scene_ocr/scene_text/InverseText_500.tsv",
        "CCOCR_MultiSceneOcr_Totaltext": "multi_scene_ocr/scene_text/TotalText_300.tsv",
        "CCOCR_MultiSceneOcr_ZhScene": "multi_scene_ocr/scene_text/zh_scene_450.tsv",
        "CCOCR_MultiSceneOcr_UgcLaion": "multi_scene_ocr/ugc_text/ugc_laion_400.tsv",
        "CCOCR_MultiSceneOcr_ZhDense": "multi_scene_ocr/ugc_text/zh_dense_50.tsv",
        "CCOCR_MultiSceneOcr_ZhVertical": "multi_scene_ocr/ugc_text/zh_vertical_100.tsv",
    }

    # NOTE: the outermost iterable of a class-body comprehension is evaluated in
    # class scope, so referencing _REL_PATHS here is legal.
    DATASET_URL_MODELSCOPE = {
        name: "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/" + rel
        for name, rel in _REL_PATHS.items()
    }

    DATASET_URL_HUGGINGFACE = {
        name: "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/" + rel
        for name, rel in _REL_PATHS.items()
    }

    # Default data source; switch to DATASET_URL_HUGGINGFACE when ModelScope is unreachable.
    DATASET_URL = DATASET_URL_MODELSCOPE

    DATASET_MD5 = {
        "CCOCR_DocParsing_DocPhotoChn": "9039dcbb31830d413261a95cfa29d97f",
        "CCOCR_DocParsing_DocPhotoEng": "2ca0824881e1d7317626f2a19d902989",
        "CCOCR_DocParsing_DocScanChn": "9e265c8aa760ebdf5c3bf9e892d55492",
        "CCOCR_DocParsing_DocScanEng": "77d04637be3def86dbc2ce37ba64a704",
        "CCOCR_DocParsing_TablePhotoChn": "c4dc85252ddad2b43a03a67b1d1ae983",
        "CCOCR_DocParsing_TablePhotoEng": "02ab75d6169da0cd2ece9ce0ae14a479",
        "CCOCR_DocParsing_TableScanChn": "f1f79959fdd01127df7377c9d46722f2",
        "CCOCR_DocParsing_TableScanEng": "794903c7acf52bfe956eefba2166d14b",
        "CCOCR_DocParsing_MolecularHandwriting": "30b7f7679b713ce000a939eca7b4078f",
        "CCOCR_DocParsing_FormulaHandwriting": "e03047776ce5e79a61ae1c057e2a348e",
        "CCOCR_Kie_Sroie2019Word": "3287d99a8e86a99b74171fa5a70f9acb",
        "CCOCR_Kie_Cord": "ab297cadcbc7158884a301c366f3330a",
        "CCOCR_Kie_EphoieScut": "bb8fa3ba7ea91cbf17be0904956ad3f3",
        "CCOCR_Kie_Poie": "882b64317989ecbfed6518051cdffb14",
        "CCOCR_Kie_ColdSibr": "109d5dad8b7081fb6a2f088e963196d4",
        "CCOCR_Kie_ColdCell": "7b44c45b4d7d768d1dbdc08872fe7d3a",
        "CCOCR_MultiLanOcr_Arabic": "e9a3f2bb9298d0b882ebc7a98980c3f3",
        "CCOCR_MultiLanOcr_French": "729407ed2036c22e602eff645eddd40c",
        "CCOCR_MultiLanOcr_German": "96fc2edae747f0ec95b0a6f9bf723022",
        "CCOCR_MultiLanOcr_Italian": "29a508fa5d5a5e767497dd69e2430ebb",
        "CCOCR_MultiLanOcr_Japanese": "bbcca96ccf25fff63597c2ab4f3ebb1f",
        "CCOCR_MultiLanOcr_Korean": "0f55dbd24eba5edc189c91e124411641",
        "CCOCR_MultiLanOcr_Portuguese": "a6fcf8831775a61aa631c0cf1c422ae7",
        "CCOCR_MultiLanOcr_Russian": "19d2f84062a1699d3e9333912bd6b303",
        "CCOCR_MultiLanOcr_Spanish": "f5a0cfa9f2ae4115c91c7b362034e591",
        "CCOCR_MultiLanOcr_Vietnamese": "bf1cd4e83d91767f4906f81550cec8b9",
        "CCOCR_MultiSceneOcr_Cord": "92943f0ccb4c5a196c574222e76759a0",
        "CCOCR_MultiSceneOcr_Funsd": "229cc38d193edd00f4383610e98ee873",
        "CCOCR_MultiSceneOcr_Iam": "d897a6d6c3880c65e752ec11b211204c",
        "CCOCR_MultiSceneOcr_ZhDoc": "303682cc16c8bb51b2b896f8ceb8bd38",
        "CCOCR_MultiSceneOcr_ZhHandwriting": "faa298d366bc05e5cfb39e334afb8eff",
        "CCOCR_MultiSceneOcr_Hieragent": "6f132cdd0473d7cc145c3e3a08957dd6",
        "CCOCR_MultiSceneOcr_Ic15": "3d94869f312a41d53d0578a06a2fb1f2",
        "CCOCR_MultiSceneOcr_Inversetext": "e141d424a0c4cf9579064428a270f13d",
        "CCOCR_MultiSceneOcr_Totaltext": "ca1daf81d49eeb57ef844b72a23c2e62",
        "CCOCR_MultiSceneOcr_ZhScene": "9295152a66e6f117db8bfbb20a9013e6",
        "CCOCR_MultiSceneOcr_UgcLaion": "8e9ea1fbf9d56532157e807eabf39b21",
        "CCOCR_MultiSceneOcr_ZhDense": "de8f48ee0c8a2cf8ed7f2b3a81e6322d",
        "CCOCR_MultiSceneOcr_ZhVertical": "4892b4aec6e7fd11e39aaea23712709b",
    }

    def evaluate(self, eval_file, **judge_kwargs):
        """Score the predictions in ``eval_file`` with the CC-OCR task evaluator.

        Parameters
        ----------
        eval_file : str
            Path to the prediction file produced by inference; must contain exactly
            one CC-OCR sub-dataset.
        **judge_kwargs
            Unused here; accepted for interface compatibility with other datasets.

        Returns
        -------
        The ``"summary"`` entry of the evaluator output (may be ``None`` if the
        evaluator produced no summary).

        Raises
        ------
        ImportError
            If the CC-OCR evaluator dependency is not installed.
        ValueError
            If no evaluator is registered for the dataset's category.
        """
        df = load(eval_file)
        dict_list = df.to_dict(orient='records')

        required_columns = ['answer', 'prediction', "category", "image_name", "l2-category", "split"]
        for required_column in required_columns:
            assert required_column in df, "required_column: {} NOT found".format(required_column)

        gt_info, ptd_info = {}, {}
        for data_info in dict_list:
            image_name = data_info['image_name']
            gt_info[image_name] = data_info['answer']
            # Samples whose inference failed are kept in gt_info but dropped from
            # ptd_info, so they count against the model during evaluation.
            if data_info['prediction'] != FAIL_MSG:
                ptd_info[image_name] = data_info['prediction']

        # eval_file must contain a single sub-dataset; enforce what the original
        # code only assumed (set.pop() would otherwise pick an arbitrary member).
        group_names = {str(x) for x in df['category']}
        op_names = {str(x) for x in df['l2-category']}
        data_names = {str(x) for x in df['split']}
        assert len(group_names) == 1 and len(op_names) == 1 and len(data_names) == 1, \
            "eval_file should contain exactly one CC-OCR sub-dataset"
        group_name, op_name, data_name = group_names.pop(), op_names.pop(), data_names.pop()

        data_info = {"op": op_name, "group": group_name, "dataset": data_name, "num": len(gt_info)}
        try:
            from .utils.ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map
        except ImportError as err:
            warnings.warn('The dependency of CCOCR evaluator is not properly installed')
            warnings.warn(f'{type(err)}: {err}')
            # Without the evaluator map we cannot continue; re-raise instead of
            # falling through to an undefined name (the original code crashed
            # here with a confusing NameError).
            raise

        eval_func = ccocr_evaluator_map.get(group_name, None)
        if eval_func is None:
            raise ValueError("error: evaluator not defined for: {}".format(group_name))
        meta_info, eval_info = eval_func(ptd_info, gt_info, **data_info)

        output_info = {"meta": meta_info, "evaluation": eval_info, "config": data_info}
        result_file = os.path.splitext(os.path.abspath(eval_file))[0] + "_eval.json"
        dump(output_info, result_file)

        # Update the global status file used for the cross-dataset summary.
        # WARNING: this read-modify-write is not atomic, so evaluate() must NOT
        # run in parallel for datasets sharing one work directory.
        all_status_info = {}
        global_status_path = os.path.join(os.path.dirname(eval_file), "status.json")
        if os.path.exists(global_status_path):
            with open(global_status_path, "r") as f:
                all_status_info = json.load(f)
        all_status_info[data_name] = output_info
        with open(global_status_path, "w") as f:
            json.dump(all_status_info, f, ensure_ascii=False, indent=4)
        return eval_info.get("summary")
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py
ADDED
|
@@ -0,0 +1,904 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
|
| 3 |
+
from .image_base import ImageBaseDataset
|
| 4 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 5 |
+
from ..smp import *
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
# Multilingual MMB / MMBench TSVs hosted on the AIDC-AI Parrot dataset repo.
_PARROT_BASE = 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main'

MMMB_URLS = {
    'MMMB_ar': f'{_PARROT_BASE}/mmmb/mmmb_ar.tsv',
    'MMMB_cn': f'{_PARROT_BASE}/mmmb/mmmb_cn.tsv',
    'MMMB_en': f'{_PARROT_BASE}/mmmb/mmmb_en.tsv',
    'MMMB_pt': f'{_PARROT_BASE}/mmmb/mmmb_pt.tsv',
    'MMMB_ru': f'{_PARROT_BASE}/mmmb/mmmb_ru.tsv',
    'MMMB_tr': f'{_PARROT_BASE}/mmmb/mmmb_tr.tsv',
}

MTL_MMBench_URLS = {
    'MMBench_dev_ar': f'{_PARROT_BASE}/mmbench/mmbench_dev_ar.tsv',
    'MMBench_dev_cn': f'{_PARROT_BASE}/mmbench/mmbench_dev_cn.tsv',
    'MMBench_dev_en': f'{_PARROT_BASE}/mmbench/mmbench_dev_en.tsv',
    'MMBench_dev_pt': f'{_PARROT_BASE}/mmbench/mmbench_dev_pt.tsv',
    'MMBench_dev_tr': f'{_PARROT_BASE}/mmbench/mmbench_dev_tr.tsv',
    'MMBench_dev_ru': f'{_PARROT_BASE}/mmbench/mmbench_dev_ru.tsv',
}

# MD5 checksums for the TSVs above, one entry per file.
MMMB_MD5 = {
    'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead',
    'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
    'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01',
    'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
    'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984',
    'MMMB_tr': '0733739d43090327975294292bc5cd67',
}

MTL_MMBench_MD5 = {
    'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4',
    'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
    'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2',
    'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
    'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11',
    'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5',
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class ImageMCQDataset(ImageBaseDataset):
|
| 40 |
+
|
| 41 |
+
TYPE = 'MCQ'
|
| 42 |
+
|
| 43 |
+
DATASET_URL = {
|
| 44 |
+
# MMBench v1.0
|
| 45 |
+
'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN.tsv',
|
| 46 |
+
'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN.tsv',
|
| 47 |
+
'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN.tsv',
|
| 48 |
+
'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN.tsv',
|
| 49 |
+
'MMBench': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench.tsv', # Internal
|
| 50 |
+
'MMBench_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN.tsv', # Internal
|
| 51 |
+
# MMBench v1.1
|
| 52 |
+
'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN_V11.tsv',
|
| 53 |
+
'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN_V11.tsv',
|
| 54 |
+
'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN_V11.tsv',
|
| 55 |
+
'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN_V11.tsv',
|
| 56 |
+
'MMBench_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_V11.tsv', # Internal
|
| 57 |
+
'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN_V11.tsv', # Internal
|
| 58 |
+
# SEEDBench Series
|
| 59 |
+
'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench_IMG.tsv',
|
| 60 |
+
'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
|
| 61 |
+
'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench2_Plus.tsv',
|
| 62 |
+
# ScienceQA Series
|
| 63 |
+
'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv',
|
| 64 |
+
'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv',
|
| 65 |
+
# MMT-Bench
|
| 66 |
+
'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL_MI.tsv',
|
| 67 |
+
'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL.tsv',
|
| 68 |
+
'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL_MI.tsv',
|
| 69 |
+
'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL.tsv',
|
| 70 |
+
# AesBench
|
| 71 |
+
'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
|
| 72 |
+
'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
|
| 73 |
+
# Q-Bench1
|
| 74 |
+
'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
|
| 75 |
+
'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
|
| 76 |
+
# A-Bench
|
| 77 |
+
'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
|
| 78 |
+
'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
|
| 79 |
+
# R-Bench
|
| 80 |
+
'R-Bench-Dis': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-dis.tsv',
|
| 81 |
+
'R-Bench-Ref': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-ref.tsv',
|
| 82 |
+
# Other Benchmarks
|
| 83 |
+
'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
|
| 84 |
+
'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
|
| 85 |
+
'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
|
| 86 |
+
'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
|
| 87 |
+
'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
|
| 88 |
+
'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
|
| 89 |
+
'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
|
| 90 |
+
'TaskMeAnything_v1_imageqa_random': (
|
| 91 |
+
'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
|
| 92 |
+
'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
|
| 93 |
+
),
|
| 94 |
+
'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv',
|
| 95 |
+
'WorldMedQA-V': 'https://opencompass.openxlab.space/utils/VLMEval/WorldMedQA-V.tsv',
|
| 96 |
+
'VisOnlyQA-VLMEvalKit': (
|
| 97 |
+
'https://huggingface.co/datasets/ryokamoi/VisOnlyQA_Eval_Real/'
|
| 98 |
+
'resolve/main/visonlyqa_vlmevalkit.tsv'
|
| 99 |
+
),
|
| 100 |
+
'3DSRBench': (
|
| 101 |
+
'https://huggingface.co/datasets/ccvl/3DSRBench/'
|
| 102 |
+
'resolve/main/3dsrbench_v1_vlmevalkit_circular.tsv'
|
| 103 |
+
),
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
DATASET_MD5 = {
|
| 107 |
+
# MMBench v1.0
|
| 108 |
+
'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
|
| 109 |
+
'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
|
| 110 |
+
'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
|
| 111 |
+
'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
|
| 112 |
+
'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only
|
| 113 |
+
'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only
|
| 114 |
+
# MMBench v1.1
|
| 115 |
+
'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
|
| 116 |
+
'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
|
| 117 |
+
'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
|
| 118 |
+
'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
|
| 119 |
+
'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only
|
| 120 |
+
'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only
|
| 121 |
+
# SEEDBench
|
| 122 |
+
'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
|
| 123 |
+
'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
|
| 124 |
+
'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
|
| 125 |
+
# ScienceQA
|
| 126 |
+
'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
|
| 127 |
+
'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
|
| 128 |
+
# MMT-Bench
|
| 129 |
+
'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
|
| 130 |
+
'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
|
| 131 |
+
'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
|
| 132 |
+
'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
|
| 133 |
+
# AesBench
|
| 134 |
+
'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
|
| 135 |
+
'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
|
| 136 |
+
# Q-Bench1
|
| 137 |
+
'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
|
| 138 |
+
'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
|
| 139 |
+
# A-Bench
|
| 140 |
+
'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
|
| 141 |
+
'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
|
| 142 |
+
# R-Bench
|
| 143 |
+
'R-Bench-Dis': 'd6e961dbfc43350688af2560226830b4',
|
| 144 |
+
'R-Bench-Ref': '270c1cb555acb523f3fdb178ed57021d',
|
| 145 |
+
# Other Benchmarks
|
| 146 |
+
'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
|
| 147 |
+
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
|
| 148 |
+
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
|
| 149 |
+
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
|
| 150 |
+
'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7',
|
| 151 |
+
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
|
| 152 |
+
'BLINK': '3b6649b6a662184ea046908e5506260e',
|
| 153 |
+
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
|
| 154 |
+
'WorldMedQA-V': '441e63875e30c87f5750528b57b41285',
|
| 155 |
+
"VisOnlyQA-VLMEvalKit": 'cf460a31d2acb8d3a7cecd0e69298bfa',
|
| 156 |
+
'3DSRBench': '13a99f33164dc1b9faf0e8b8b01fd6f2',
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
DATASET_URL.update(MMMB_URLS)
|
| 160 |
+
DATASET_URL.update(MTL_MMBench_URLS)
|
| 161 |
+
DATASET_MD5.update(MMMB_MD5)
|
| 162 |
+
DATASET_MD5.update(MTL_MMBench_MD5)
|
| 163 |
+
|
| 164 |
+
def build_prompt(self, line):
|
| 165 |
+
|
| 166 |
+
if isinstance(line, int):
|
| 167 |
+
line = self.data.iloc[line]
|
| 168 |
+
|
| 169 |
+
if self.meta_only:
|
| 170 |
+
tgt_path = toliststr(line['image_path'])
|
| 171 |
+
else:
|
| 172 |
+
tgt_path = self.dump_image(line)
|
| 173 |
+
|
| 174 |
+
question = line['question']
|
| 175 |
+
options = {
|
| 176 |
+
cand: line[cand]
|
| 177 |
+
for cand in string.ascii_uppercase
|
| 178 |
+
if cand in line and not pd.isna(line[cand])
|
| 179 |
+
}
|
| 180 |
+
options_prompt = 'Options:\n'
|
| 181 |
+
for key, item in options.items():
|
| 182 |
+
options_prompt += f'{key}. {item}\n'
|
| 183 |
+
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
|
| 184 |
+
prompt = ''
|
| 185 |
+
if hint is not None:
|
| 186 |
+
prompt += f'Hint: {hint}\n'
|
| 187 |
+
prompt += f'Question: {question}\n'
|
| 188 |
+
if len(options):
|
| 189 |
+
prompt += options_prompt
|
| 190 |
+
prompt += 'Please select the correct answer from the options above. \n'
|
| 191 |
+
|
| 192 |
+
msgs = []
|
| 193 |
+
if isinstance(tgt_path, list):
|
| 194 |
+
msgs.extend([dict(type='image', value=p) for p in tgt_path])
|
| 195 |
+
else:
|
| 196 |
+
msgs = [dict(type='image', value=tgt_path)]
|
| 197 |
+
msgs.append(dict(type='text', value=prompt))
|
| 198 |
+
|
| 199 |
+
return msgs
|
| 200 |
+
|
| 201 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
| 202 |
+
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
|
| 203 |
+
# assert dataset is not None
|
| 204 |
+
dataset_map = {
|
| 205 |
+
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
|
| 206 |
+
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
|
| 207 |
+
}
|
| 208 |
+
dataset = self.dataset_name
|
| 209 |
+
if dataset in dataset_map:
|
| 210 |
+
dataset = dataset_map[dataset]
|
| 211 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
| 212 |
+
|
| 213 |
+
circular = False
|
| 214 |
+
if listinstr(['mmbench', 'ccbench'], dataset.lower()):
|
| 215 |
+
data = load(eval_file)
|
| 216 |
+
data['index'] = [int(x) for x in data['index']]
|
| 217 |
+
dump(data, eval_file)
|
| 218 |
+
circular = True
|
| 219 |
+
|
| 220 |
+
suffix = eval_file.split('.')[-1]
|
| 221 |
+
model = judge_kwargs.get('model', 'exact_matching')
|
| 222 |
+
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
|
| 223 |
+
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
|
| 224 |
+
name_str = name_str_map[model] if model in name_str_map else model
|
| 225 |
+
|
| 226 |
+
if model == 'exact_matching':
|
| 227 |
+
model = None
|
| 228 |
+
elif gpt_key_set():
|
| 229 |
+
model = build_judge(**judge_kwargs)
|
| 230 |
+
if not model.working():
|
| 231 |
+
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
|
| 232 |
+
warnings.warn(DEBUG_MESSAGE)
|
| 233 |
+
model = None
|
| 234 |
+
else:
|
| 235 |
+
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
|
| 236 |
+
model = None
|
| 237 |
+
|
| 238 |
+
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
|
| 239 |
+
|
| 240 |
+
data = load(eval_file)
|
| 241 |
+
data = data.sort_values(by='index')
|
| 242 |
+
data['prediction'] = [str(x) for x in data['prediction']]
|
| 243 |
+
# If not choice label, then use lower case
|
| 244 |
+
for k in data.keys():
|
| 245 |
+
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
|
| 246 |
+
|
| 247 |
+
meta = self.data
|
| 248 |
+
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
|
| 249 |
+
data_map = {x: y for x, y in zip(data['index'], data['question'])}
|
| 250 |
+
for k in data_map:
|
| 251 |
+
assert k in meta_q_map, (
|
| 252 |
+
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
if circular:
|
| 256 |
+
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
|
| 257 |
+
else:
|
| 258 |
+
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
|
| 259 |
+
|
| 260 |
+
# load split
|
| 261 |
+
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
|
| 262 |
+
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
|
| 263 |
+
|
| 264 |
+
# May have different report acc functions for different datasets
|
| 265 |
+
if 'MMT' in dataset:
|
| 266 |
+
acc = report_acc_MMT(data)
|
| 267 |
+
else:
|
| 268 |
+
acc = report_acc(data)
|
| 269 |
+
|
| 270 |
+
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
|
| 271 |
+
dump(acc, score_file)
|
| 272 |
+
|
| 273 |
+
if dataset == 'AesBench_VAL':
|
| 274 |
+
warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
|
| 275 |
+
please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
|
| 276 |
+
larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
|
| 277 |
+
if dataset == 'VisOnlyQA-VLMEvalKit':
|
| 278 |
+
warnings.warn('Note that the results on VisOnlyQA-VLMEvalKit are different from the results on \
|
| 279 |
+
the original VisOnlyQA. VisOnlyQA-VLMEvalKit does not include the \
|
| 280 |
+
chemistry__shape_multi split and uses a different evaluation prompt. Please \
|
| 281 |
+
explicitly specify the version of the dataset when you report results.')
|
| 282 |
+
|
| 283 |
+
return acc
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
class MMMUDataset(ImageMCQDataset):
|
| 287 |
+
|
| 288 |
+
DATASET_URL = {
|
| 289 |
+
'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
|
| 290 |
+
'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
DATASET_MD5 = {
|
| 294 |
+
'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64',
|
| 295 |
+
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
@staticmethod
|
| 299 |
+
def split_MMMU(msgs):
|
| 300 |
+
text, images = None, []
|
| 301 |
+
for s in msgs:
|
| 302 |
+
if s['type'] == 'image':
|
| 303 |
+
images.append(s['value'])
|
| 304 |
+
elif s['type'] == 'text':
|
| 305 |
+
assert text is None
|
| 306 |
+
text = s['value']
|
| 307 |
+
text_segs = text.split('<image ')
|
| 308 |
+
if len(text_segs) == 1:
|
| 309 |
+
return msgs
|
| 310 |
+
|
| 311 |
+
segs = [dict(type='text', value=text_segs[0])]
|
| 312 |
+
for i, seg in enumerate(text_segs):
|
| 313 |
+
if i == 0:
|
| 314 |
+
continue
|
| 315 |
+
assert istype(seg[0], int) and seg[1] == '>'
|
| 316 |
+
image_idx = int(seg[0]) - 1
|
| 317 |
+
segs.append(dict(type='image', value=images[image_idx]))
|
| 318 |
+
segs.append(dict(type='text', value=seg[2:]))
|
| 319 |
+
return segs
|
| 320 |
+
|
| 321 |
+
def build_prompt(self, line):
|
| 322 |
+
msgs = super().build_prompt(line)
|
| 323 |
+
msgs = self.split_MMMU(msgs)
|
| 324 |
+
return msgs
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
class MUIRDataset(ImageMCQDataset):
|
| 328 |
+
|
| 329 |
+
DATASET_URL = {
|
| 330 |
+
'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
DATASET_MD5 = {
|
| 334 |
+
'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
@staticmethod
|
| 338 |
+
def split_MUIR(msgs):
|
| 339 |
+
text, images = None, []
|
| 340 |
+
|
| 341 |
+
# Separate images and text from msgs
|
| 342 |
+
for s in msgs:
|
| 343 |
+
if s['type'] == 'image':
|
| 344 |
+
images.append(s['value'])
|
| 345 |
+
elif s['type'] == 'text':
|
| 346 |
+
assert text is None # Ensure only one text entry is expected
|
| 347 |
+
text = s['value']
|
| 348 |
+
|
| 349 |
+
# Split text by <image> tags
|
| 350 |
+
text_segs = text.split('<image>')
|
| 351 |
+
|
| 352 |
+
# Initialize the segments list
|
| 353 |
+
segs = []
|
| 354 |
+
|
| 355 |
+
# Iterate through the text segments and images
|
| 356 |
+
for i, seg in enumerate(text_segs):
|
| 357 |
+
# Append the image if this is not the first segment and there are still images left
|
| 358 |
+
if i > 0 and i - 1 < len(images):
|
| 359 |
+
segs.append(dict(type='image', value=images[i - 1]))
|
| 360 |
+
# Append the text segment (if it's non-empty)
|
| 361 |
+
if len(seg) > 0:
|
| 362 |
+
segs.append(dict(type='text', value=seg))
|
| 363 |
+
|
| 364 |
+
return segs
|
| 365 |
+
|
| 366 |
+
def build_prompt(self, line):
|
| 367 |
+
|
| 368 |
+
if isinstance(line, int):
|
| 369 |
+
line = self.data.iloc[line]
|
| 370 |
+
|
| 371 |
+
if self.meta_only:
|
| 372 |
+
tgt_path = toliststr(line['image_path'])
|
| 373 |
+
else:
|
| 374 |
+
tgt_path = self.dump_image(line)
|
| 375 |
+
|
| 376 |
+
question = line['question']
|
| 377 |
+
options = {
|
| 378 |
+
cand: line[cand]
|
| 379 |
+
for cand in string.ascii_uppercase
|
| 380 |
+
if cand in line and not pd.isna(line[cand])
|
| 381 |
+
}
|
| 382 |
+
# options_prompt = ''
|
| 383 |
+
options_prompt = '\n'.join([f'{key}. {item}' for key, item in options.items()])
|
| 384 |
+
# for key, item in options.items():
|
| 385 |
+
# options_prompt += f'{key}. {item}\n'
|
| 386 |
+
|
| 387 |
+
prompt = ''
|
| 388 |
+
|
| 389 |
+
prompt += f'{question}\n'
|
| 390 |
+
if len(options):
|
| 391 |
+
prompt += options_prompt
|
| 392 |
+
prompt += "\nAnswer with the option's letter from the given choices directly."
|
| 393 |
+
|
| 394 |
+
msgs = []
|
| 395 |
+
if isinstance(tgt_path, list):
|
| 396 |
+
msgs.extend([dict(type='image', value=p) for p in tgt_path])
|
| 397 |
+
else:
|
| 398 |
+
msgs = [dict(type='image', value=tgt_path)]
|
| 399 |
+
msgs.append(dict(type='text', value=prompt))
|
| 400 |
+
|
| 401 |
+
msgs = self.split_MUIR(msgs)
|
| 402 |
+
return msgs
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
class GMAIMMBenchDataset(ImageMCQDataset):
|
| 406 |
+
|
| 407 |
+
DATASET_URL = {
|
| 408 |
+
'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv',
|
| 409 |
+
'GMAI_mm_bench_TEST_part_1': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_1.tsv', # noqa: E501
|
| 410 |
+
'GMAI_mm_bench_TEST_part_2': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_2.tsv', # noqa: E501
|
| 411 |
+
'GMAI_mm_bench_TEST_part_3': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_3.tsv', # noqa: E501
|
| 412 |
+
'GMAI_mm_bench_TEST_part_4': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_4.tsv', # noqa: E501
|
| 413 |
+
'GMAI_mm_bench_TEST_part_5': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_5.tsv', # noqa: E501
|
| 414 |
+
'GMAI_mm_bench_TEST_part_6': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_6.tsv', # noqa: E501
|
| 415 |
+
'GMAI_mm_bench_TEST_part_7': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_7.tsv', # noqa: E501
|
| 416 |
+
'GMAI_mm_bench_TEST_part_8': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_8.tsv', # noqa: E501
|
| 417 |
+
'GMAI_mm_bench_TEST_part_9': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_9.tsv', # noqa: E501
|
| 418 |
+
'GMAI_mm_bench_TEST_part_10': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_10.tsv', # noqa: E501
|
| 419 |
+
'GMAI_mm_bench_TEST_part_11': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_11.tsv', # noqa: E501
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
DATASET_MD5 = {
|
| 423 |
+
'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324',
|
| 424 |
+
'GMAI_mm_bench_TEST_part_1': '900d735231230a63f4ed45665c078ef4',
|
| 425 |
+
'GMAI_mm_bench_TEST_part_2': '1b27ab621386945d7e4a765ad2d22b0e',
|
| 426 |
+
'GMAI_mm_bench_TEST_part_3': '44bdc2b6267dd505d529b8cad06f0fb2',
|
| 427 |
+
'GMAI_mm_bench_TEST_part_4': '5a04a04fcac9f1466709f242fdb80acb',
|
| 428 |
+
'GMAI_mm_bench_TEST_part_5': 'c70baf8909eda9af0ddeab275c721336',
|
| 429 |
+
'GMAI_mm_bench_TEST_part_6': '825abc39596b644dead9350d0cfa3b96',
|
| 430 |
+
'GMAI_mm_bench_TEST_part_7': 'defb8aed2fb77365a76b6b9abd6a2701',
|
| 431 |
+
'GMAI_mm_bench_TEST_part_8': 'ff490d60b85f2bb0abb67a435b298c65',
|
| 432 |
+
'GMAI_mm_bench_TEST_part_9': 'ff67c86f40da93b09139ac1d1ba5dc6b',
|
| 433 |
+
'GMAI_mm_bench_TEST_part_10': '3dae94627b9ac0fe00180d4780fbf6dc',
|
| 434 |
+
'GMAI_mm_bench_TEST_part_11': 'd08dc813f0eb6bbab63cae2a9d113c4b',
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
@classmethod
|
| 438 |
+
def supported_datasets(cls):
|
| 439 |
+
return ['GMAI-MMBench_VAL', 'GMAI-MMBench_TEST']
|
| 440 |
+
|
| 441 |
+
def load_data(self, dataset):
|
| 442 |
+
if dataset == 'GMAI-MMBench_VAL':
|
| 443 |
+
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
|
| 444 |
+
if file_size(data_path, 'GB') > 1:
|
| 445 |
+
local_path = data_path.replace('.tsv', '_local.tsv')
|
| 446 |
+
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
|
| 447 |
+
from ..tools import LOCALIZE
|
| 448 |
+
LOCALIZE(data_path, local_path)
|
| 449 |
+
data_path = local_path
|
| 450 |
+
return load(data_path)
|
| 451 |
+
elif dataset == 'GMAI-MMBench_TEST':
|
| 452 |
+
dfs = []
|
| 453 |
+
for part_num in range(1, 12):
|
| 454 |
+
part_name = f'GMAI_mm_bench_TEST_part_{part_num}'
|
| 455 |
+
url = self.DATASET_URL[part_name]
|
| 456 |
+
file_md5 = self.DATASET_MD5.get(part_name)
|
| 457 |
+
tsv_path = osp.join(LMUDataRoot(), f'{part_name}.tsv')
|
| 458 |
+
if not osp.exists(tsv_path) or (file_md5 and md5(tsv_path) != file_md5):
|
| 459 |
+
download_file(url, filename=tsv_path)
|
| 460 |
+
local_path = tsv_path.replace('.tsv', '_local.tsv')
|
| 461 |
+
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
|
| 462 |
+
from ..tools import LOCALIZE
|
| 463 |
+
LOCALIZE(tsv_path, local_path)
|
| 464 |
+
tsv_path = local_path
|
| 465 |
+
# 加载数据
|
| 466 |
+
df = load(tsv_path)
|
| 467 |
+
dfs.append(df)
|
| 468 |
+
# 合并所有数据
|
| 469 |
+
data = pd.concat(dfs, ignore_index=True)
|
| 470 |
+
return data
|
| 471 |
+
else:
|
| 472 |
+
raise ValueError(f"未知的数据集:{dataset}")
|
| 473 |
+
|
| 474 |
+
def report_acc_by_groups(self, df, group_column):
|
| 475 |
+
res = defaultdict(list)
|
| 476 |
+
|
| 477 |
+
# Check for the 'split' column
|
| 478 |
+
if 'split' in df:
|
| 479 |
+
splits = list(set(df['split']))
|
| 480 |
+
res['split'] = splits
|
| 481 |
+
else:
|
| 482 |
+
df['split'] = ['none'] * len(df)
|
| 483 |
+
res['split'] = ['none']
|
| 484 |
+
|
| 485 |
+
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
|
| 486 |
+
|
| 487 |
+
if group_column not in df:
|
| 488 |
+
raise ValueError(f"Column '{group_column}' not found in dataframe.") # noqa: E713
|
| 489 |
+
|
| 490 |
+
abilities = list(set(df[group_column]))
|
| 491 |
+
abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
|
| 492 |
+
abilities.sort()
|
| 493 |
+
|
| 494 |
+
for ab in abilities:
|
| 495 |
+
ab_name = ab
|
| 496 |
+
sub_df = df[df[group_column] == ab]
|
| 497 |
+
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
|
| 498 |
+
|
| 499 |
+
return pd.DataFrame(res)
|
| 500 |
+
|
| 501 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
| 502 |
+
from .utils.multiple_choice import report_acc, mcq_vanilla_eval
|
| 503 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
| 504 |
+
|
| 505 |
+
suffix = eval_file.split('.')[-1]
|
| 506 |
+
model = judge_kwargs.get('model', 'exact_matching')
|
| 507 |
+
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
|
| 508 |
+
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
|
| 509 |
+
name_str = name_str_map[model] if model in name_str_map else model
|
| 510 |
+
|
| 511 |
+
if model == 'exact_matching':
|
| 512 |
+
model = None
|
| 513 |
+
elif gpt_key_set():
|
| 514 |
+
model = build_judge(**judge_kwargs)
|
| 515 |
+
if not model.working():
|
| 516 |
+
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
|
| 517 |
+
warnings.warn(DEBUG_MESSAGE)
|
| 518 |
+
model = None
|
| 519 |
+
else:
|
| 520 |
+
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
|
| 521 |
+
model = None
|
| 522 |
+
|
| 523 |
+
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
|
| 524 |
+
|
| 525 |
+
data = load(eval_file)
|
| 526 |
+
data = data.sort_values(by='index')
|
| 527 |
+
data['prediction'] = [str(x) for x in data['prediction']]
|
| 528 |
+
# If not choice label, then use lower case
|
| 529 |
+
for k in data.keys():
|
| 530 |
+
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
|
| 531 |
+
|
| 532 |
+
meta = self.data
|
| 533 |
+
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
|
| 534 |
+
data_map = {x: y for x, y in zip(data['index'], data['question'])}
|
| 535 |
+
for k in data_map:
|
| 536 |
+
assert k in meta_q_map, (
|
| 537 |
+
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
|
| 538 |
+
)
|
| 539 |
+
|
| 540 |
+
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
|
| 541 |
+
|
| 542 |
+
# load split
|
| 543 |
+
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
|
| 544 |
+
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
|
| 545 |
+
|
| 546 |
+
acc = report_acc(data)
|
| 547 |
+
|
| 548 |
+
for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
|
| 549 |
+
acc_grouped = self.report_acc_by_groups(data, group_col)
|
| 550 |
+
score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
|
| 551 |
+
dump(acc_grouped, score_file_grouped)
|
| 552 |
+
|
| 553 |
+
return acc
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
class MMERealWorld(ImageMCQDataset):
|
| 557 |
+
|
| 558 |
+
TYPE = 'MMERealWorld'
|
| 559 |
+
|
| 560 |
+
DATASET_MD5 = {
|
| 561 |
+
'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
|
| 562 |
+
'MME-RealWorld-Lite': '4c17057d7d3b6c4a0d4397c3dae0881c',
|
| 563 |
+
'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
|
| 564 |
+
}
|
| 565 |
+
SYS = {
|
| 566 |
+
'MME-RealWorld': (
|
| 567 |
+
'Select the best answer to the above multiple-choice question based on the image. '
|
| 568 |
+
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
|
| 569 |
+
'The best answer is:'
|
| 570 |
+
),
|
| 571 |
+
'MME-RealWorld-Lite': (
|
| 572 |
+
'Select the best answer to the above multiple-choice question based on the image. '
|
| 573 |
+
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
|
| 574 |
+
'The best answer is:'
|
| 575 |
+
),
|
| 576 |
+
'MME-RealWorld-CN': (
|
| 577 |
+
'根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n'
|
| 578 |
+
'最佳答案为:'
|
| 579 |
+
),
|
| 580 |
+
}
|
| 581 |
+
|
| 582 |
+
@classmethod
|
| 583 |
+
def supported_datasets(cls):
|
| 584 |
+
return ['MME-RealWorld', 'MME-RealWorld-CN', 'MME-RealWorld-Lite',]
|
| 585 |
+
|
| 586 |
+
def load_data(
|
| 587 |
+
self, dataset="MME-RealWorld", repo_id="yifanzhang114/MME-RealWorld-Base64"
|
| 588 |
+
):
|
| 589 |
+
|
| 590 |
+
def check_integrity(pth):
|
| 591 |
+
data_file = osp.join(pth, f"{dataset}.tsv")
|
| 592 |
+
|
| 593 |
+
if not os.path.exists(data_file):
|
| 594 |
+
return False
|
| 595 |
+
|
| 596 |
+
if md5(data_file) != self.DATASET_MD5[dataset]:
|
| 597 |
+
return False
|
| 598 |
+
return True
|
| 599 |
+
|
| 600 |
+
def generate_tsv(pth):
|
| 601 |
+
tsv_file = os.path.join(pth, f"{dataset}.tsv")
|
| 602 |
+
|
| 603 |
+
if os.path.exists(tsv_file):
|
| 604 |
+
print(f"{tsv_file} already exists.")
|
| 605 |
+
return
|
| 606 |
+
|
| 607 |
+
json_dir = os.path.join(pth, dataset)
|
| 608 |
+
json_files = [f for f in os.listdir(json_dir) if f.endswith(".json")]
|
| 609 |
+
|
| 610 |
+
data_list = []
|
| 611 |
+
for json_file in json_files:
|
| 612 |
+
with open(os.path.join(json_dir, json_file), "r") as f:
|
| 613 |
+
data = json.load(f)
|
| 614 |
+
for item in tqdm(data):
|
| 615 |
+
choice_prompt = (
|
| 616 |
+
"The choices are listed below:\n"
|
| 617 |
+
if dataset in ["MME-RealWorld", "MME-RealWorld-Lite"]
|
| 618 |
+
else "选项如下所示:\n"
|
| 619 |
+
)
|
| 620 |
+
data_list.append(
|
| 621 |
+
{
|
| 622 |
+
"index": item["index"],
|
| 623 |
+
"image": item["image"],
|
| 624 |
+
"question": item["question"],
|
| 625 |
+
"multi-choice options": choice_prompt
|
| 626 |
+
+ "\n".join(item["multi-choice options"]),
|
| 627 |
+
"A": item["multi-choice options"][0][4:],
|
| 628 |
+
"B": item["multi-choice options"][1][4:],
|
| 629 |
+
"C": item["multi-choice options"][2][4:],
|
| 630 |
+
"D": item["multi-choice options"][3][4:],
|
| 631 |
+
"E": item["multi-choice options"][4][4:],
|
| 632 |
+
"answer": item["answer"],
|
| 633 |
+
"category": item["category"],
|
| 634 |
+
"l2-category": item["l2-category"],
|
| 635 |
+
}
|
| 636 |
+
)
|
| 637 |
+
df = pd.DataFrame(data_list)
|
| 638 |
+
df.to_csv(tsv_file, sep="\t", index=False)
|
| 639 |
+
print(f"TSV file saved to {tsv_file}")
|
| 640 |
+
|
| 641 |
+
# Check if dataset is cached and has integrity
|
| 642 |
+
if dataset == "MME-RealWorld-Lite":
|
| 643 |
+
url = 'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv' # noqa: E501
|
| 644 |
+
file_md5 = (
|
| 645 |
+
self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
|
| 646 |
+
)
|
| 647 |
+
datas = self.prepare_tsv(url, file_md5)
|
| 648 |
+
choice_prompt = "The choices are listed below:\n"
|
| 649 |
+
for index, item in datas.iterrows():
|
| 650 |
+
options = eval(item["multi-choice options"])
|
| 651 |
+
datas.loc[index, "multi-choice options"] = choice_prompt + "\n".join(
|
| 652 |
+
options
|
| 653 |
+
)
|
| 654 |
+
datas.loc[index, "A"] = options[0][4:]
|
| 655 |
+
datas.loc[index, "B"] = options[1][4:]
|
| 656 |
+
datas.loc[index, "C"] = options[2][4:]
|
| 657 |
+
datas.loc[index, "D"] = options[3][4:]
|
| 658 |
+
datas.loc[index, "E"] = options[4][4:]
|
| 659 |
+
return datas
|
| 660 |
+
|
| 661 |
+
update_flag = False
|
| 662 |
+
cache_path = get_cache_path(repo_id)
|
| 663 |
+
if cache_path is not None and check_integrity(cache_path):
|
| 664 |
+
dataset_path = cache_path
|
| 665 |
+
print(f"Using cached dataset from {cache_path}")
|
| 666 |
+
else:
|
| 667 |
+
from huggingface_hub import snapshot_download
|
| 668 |
+
|
| 669 |
+
# Download or find the dataset path
|
| 670 |
+
dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
|
| 671 |
+
generate_tsv(dataset_path)
|
| 672 |
+
update_flag = True
|
| 673 |
+
|
| 674 |
+
data_path = os.path.join(dataset_path, f"{dataset}.tsv")
|
| 675 |
+
if file_size(data_path, "GB") > 1:
|
| 676 |
+
local_path = data_path.replace(".tsv", "_local.tsv")
|
| 677 |
+
if (
|
| 678 |
+
not osp.exists(local_path)
|
| 679 |
+
or os.environ.get("FORCE_LOCAL", None)
|
| 680 |
+
or update_flag
|
| 681 |
+
):
|
| 682 |
+
from vlmeval.tools import LOCALIZE
|
| 683 |
+
|
| 684 |
+
LOCALIZE(data_path, local_path)
|
| 685 |
+
data_path = local_path
|
| 686 |
+
return load(data_path)
|
| 687 |
+
|
| 688 |
+
def post_build(self, dataset):
|
| 689 |
+
self.TYPE = 'MMERealWorld'
|
| 690 |
+
|
| 691 |
+
# Given one data record, return the built prompt (a multi-modal message), can override
|
| 692 |
+
def build_prompt(self, line):
|
| 693 |
+
if isinstance(line, int):
|
| 694 |
+
line = self.data.iloc[line]
|
| 695 |
+
|
| 696 |
+
if self.meta_only:
|
| 697 |
+
tgt_path = toliststr(line['image_path'])
|
| 698 |
+
else:
|
| 699 |
+
tgt_path = self.dump_image(line)
|
| 700 |
+
|
| 701 |
+
question = line['question']
|
| 702 |
+
|
| 703 |
+
choice_prompt = line['multi-choice options'] + '\n'
|
| 704 |
+
question += ' ' + choice_prompt + self.SYS[self.dataset_name]
|
| 705 |
+
|
| 706 |
+
msgs = []
|
| 707 |
+
if isinstance(tgt_path, list):
|
| 708 |
+
msgs.extend([dict(type='image', value=p) for p in tgt_path])
|
| 709 |
+
else:
|
| 710 |
+
msgs = [dict(type='image', value=tgt_path)]
|
| 711 |
+
msgs.append(dict(type='text', value=question))
|
| 712 |
+
return msgs
|
| 713 |
+
|
| 714 |
+
# It returns a dictionary
|
| 715 |
+
@classmethod
|
| 716 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
| 717 |
+
from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
|
| 718 |
+
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
|
| 719 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 720 |
+
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
|
| 721 |
+
tgt_file = eval_file.replace('.xlsx', '_rating.json')
|
| 722 |
+
score_file = eval_file.replace('.xlsx', '_score.xlsx')
|
| 723 |
+
|
| 724 |
+
if not osp.exists(score_file):
|
| 725 |
+
|
| 726 |
+
res = {} if not osp.exists(tmp_file) else load(tmp_file)
|
| 727 |
+
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
|
| 728 |
+
|
| 729 |
+
data = load(eval_file)
|
| 730 |
+
cnt_rejected = 0
|
| 731 |
+
data_un = data[~pd.isna(data['prediction'])]
|
| 732 |
+
|
| 733 |
+
for idx in data['index']:
|
| 734 |
+
ans = data.loc[data['index'] == idx, 'answer'].values[0]
|
| 735 |
+
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
|
| 736 |
+
|
| 737 |
+
extract_pred = extract_characters_regex(pred)
|
| 738 |
+
if extract_pred == '':
|
| 739 |
+
cnt_rejected += 1
|
| 740 |
+
data.loc[data['index'] == idx, 'score'] = 0
|
| 741 |
+
else:
|
| 742 |
+
data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)
|
| 743 |
+
|
| 744 |
+
print(
|
| 745 |
+
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
|
| 746 |
+
f'failed to obtain the score for another {cnt_rejected} questions. '
|
| 747 |
+
f'Those questions will be counted as 0 score in ALL rating.'
|
| 748 |
+
)
|
| 749 |
+
|
| 750 |
+
dump(data, score_file)
|
| 751 |
+
|
| 752 |
+
rating = get_dimension_rating(score_file)
|
| 753 |
+
dump(rating, tgt_file)
|
| 754 |
+
return rating
|
| 755 |
+
|
| 756 |
+
|
| 757 |
+
class HRBenchDataset(ImageMCQDataset):
|
| 758 |
+
|
| 759 |
+
DATASET_URL = {
|
| 760 |
+
'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
|
| 761 |
+
'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
|
| 762 |
+
}
|
| 763 |
+
|
| 764 |
+
DATASET_MD5 = {
|
| 765 |
+
'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
|
| 766 |
+
'HRBench8K': '274c9c7f89329b804a4723178a00219c',
|
| 767 |
+
}
|
| 768 |
+
|
| 769 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
| 770 |
+
assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
|
| 771 |
+
from .utils.multiple_choice import mcq_vanilla_eval
|
| 772 |
+
from .utils.hrbench import report_acc_hrbench
|
| 773 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
| 774 |
+
|
| 775 |
+
suffix = eval_file.split('.')[-1]
|
| 776 |
+
model = judge_kwargs.get('model', 'extract_matching')
|
| 777 |
+
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
|
| 778 |
+
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
|
| 779 |
+
name_str = name_str_map[model] if model in name_str_map else model
|
| 780 |
+
|
| 781 |
+
if model == 'exact_matching':
|
| 782 |
+
model = None
|
| 783 |
+
elif gpt_key_set():
|
| 784 |
+
model = build_judge(**judge_kwargs)
|
| 785 |
+
if not model.working():
|
| 786 |
+
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
|
| 787 |
+
warnings.warn(DEBUG_MESSAGE)
|
| 788 |
+
model = None
|
| 789 |
+
else:
|
| 790 |
+
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
|
| 791 |
+
model = None
|
| 792 |
+
|
| 793 |
+
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
|
| 794 |
+
|
| 795 |
+
data = load(eval_file)
|
| 796 |
+
data = data.sort_values(by='index')
|
| 797 |
+
data['prediction'] = [str(x) for x in data['prediction']]
|
| 798 |
+
# If not choice label, then use lower case
|
| 799 |
+
for k in data.keys():
|
| 800 |
+
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
|
| 801 |
+
|
| 802 |
+
meta = self.data
|
| 803 |
+
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
|
| 804 |
+
data_map = {x: y for x, y in zip(data['index'], data['question'])}
|
| 805 |
+
for k in data_map:
|
| 806 |
+
assert k in meta_q_map, (
|
| 807 |
+
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
|
| 808 |
+
)
|
| 809 |
+
|
| 810 |
+
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
|
| 811 |
+
|
| 812 |
+
if osp.exists(score_file):
|
| 813 |
+
acc = load(score_file)
|
| 814 |
+
return acc
|
| 815 |
+
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
|
| 816 |
+
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
|
| 817 |
+
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
|
| 818 |
+
|
| 819 |
+
acc = report_acc_hrbench(data)
|
| 820 |
+
|
| 821 |
+
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
|
| 822 |
+
dump(acc, score_file)
|
| 823 |
+
|
| 824 |
+
return acc
|
| 825 |
+
|
| 826 |
+
|
| 827 |
+
class CustomMCQDataset(ImageMCQDataset):
|
| 828 |
+
|
| 829 |
+
def load_data(self, dataset):
|
| 830 |
+
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
|
| 831 |
+
|
| 832 |
+
if file_size(data_path, 'GB') > 1:
|
| 833 |
+
local_path = data_path.replace('.tsv', '_local.tsv')
|
| 834 |
+
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
|
| 835 |
+
from ..tools import LOCALIZE
|
| 836 |
+
LOCALIZE(data_path, local_path)
|
| 837 |
+
data_path = local_path
|
| 838 |
+
return load(data_path)
|
| 839 |
+
|
| 840 |
+
|
| 841 |
+
class NaturalBenchDataset(ImageMCQDataset):
    """NaturalBench: paired VQA where each group of 4 rows covers two
    questions crossed with two images (q0/q1 x i0/i1)."""

    DATASET_URL = {
        'NaturalBenchDataset': (
            'https://huggingface.co/datasets/BaiqiL/'
            'NaturalBench/resolve/main/NaturalBenchDataset.tsv'
        ),
    }
    DATASET_MD5 = {
        'NaturalBenchDataset': 'dbe25b044bc35696426381e9ba4fe930',
    }

    def build_prompt(self, line):
        """Compose image message(s) plus the question with a type-specific answer hint."""
        answer_hint = {
            "yes_no": "Please answer Yes or No.",
            "multiple_choice": "Please output the letter corresponding to the correct option."
        }
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        prompt = f"{line['question']} {answer_hint[line['type']]}"
        if isinstance(tgt_path, list):
            msgs = [dict(type='image', value=p) for p in tgt_path]
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions with NaturalBench's grouped metric.

        Rows are consumed in index order, four at a time, in the fixed layout
        (q0,i0), (q0,i1), (q1,i0), (q1,i1); per-group answers are extracted and
        handed to ``get_scores``. The resulting metrics are also dumped to
        ``NaturalBench_acc.csv`` in the current working directory.
        """
        from .utils.naturalbench import extract_answer, get_scores

        data = load(eval_file).sort_values(by='index')
        predictions = [str(x) for x in data['prediction']]
        answers = [str(x) for x in data['answer']]
        indices = [str(x) for x in data['index']]
        types = [str(x) for x in self.data['type']]
        # The benchmark ships exactly 1900 groups of 4 rows each.
        assert len(predictions) == len(answers) == len(indices) == len(types) == (1900 * 4)

        results = {}
        for g in range(len(predictions) // 4):
            base = g * 4
            results[g] = {
                "q0_i0": extract_answer(predictions[base], types[base]),
                "q0_i1": extract_answer(predictions[base + 1], types[base + 1]),
                "q1_i0": extract_answer(predictions[base + 2], types[base + 2]),
                "q1_i1": extract_answer(predictions[base + 3], types[base + 3])
            }

        scores = get_scores(results)
        print(scores)
        score_file = 'NaturalBench_acc.csv'
        df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])
        dump(df, score_file)

        return scores
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mt.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .image_base import ImageBaseDataset
|
| 2 |
+
from .utils.judge_util import build_judge
|
| 3 |
+
from ..smp import *
|
| 4 |
+
from ..utils import track_progress_rich
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ImageMTDataset(ImageBaseDataset):
    """Base class for multi-turn (dialogue) image datasets."""

    TYPE = 'MT'

    def build_prompt(self, line):
        """Build an alternating user/assistant dialogue from one record.

        ``question`` and ``answer`` hold per-turn lists. ``<ImageHere>``
        markers inside a question are replaced by the sample's images,
        consumed left-to-right across the whole dialogue. Answers may not
        contain images.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        questions = toliststr(line['question'])
        if 'answer' in line:
            answers = toliststr(line['answer'])
        else:
            answers = [''] * len(questions)
        assert len(questions) == len(answers)

        dialogue, consumed = [], 0
        for q, a in zip(questions, answers):
            if '<ImageHere>' in q:
                n_tags = q.count('<ImageHere>')
                images = tgt_path[consumed: consumed + n_tags]
                consumed += n_tags
                segments = q.split('<ImageHere>')
                content = []
                # Interleave text segments with the images that replace each tag.
                for t in range(n_tags):
                    seg, img = segments[t], images[t]
                    if seg != '':
                        content.append(dict(type='text', value=seg))
                    content.append(dict(type='image', value=img))
                if segments[-1] != '':
                    content.append(dict(type='text', value=segments[-1]))
            else:
                content = [dict(type='text', value=q)]
            dialogue.append(dict(role='user', content=content))
            assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
            dialogue.append(dict(role='assistant', content=[dict(type='text', value=a)]))
        return dialogue
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class MMDUDataset(ImageMTDataset):
    """MMDU multi-turn dialogue benchmark, scored per dimension by an LLM judge."""

    DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
    DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
    # Judging dimensions; each judged turn receives an integer score per dimension.
    DIMS = [
        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
    ]

    def calculat_metric(self, ans):
        """Aggregate per-turn judge scores into 'all' and 'valid' averages.

        `ans` maps sample index -> dict with a 'res' DataFrame of per-turn
        scores. 'all' divides the score sum by every turn seen; 'valid'
        divides only by turns whose score parsed as an int. Scores are
        clipped to [0, 10] and scaled by 10 (so the output is 0-100).
        """
        # Renamed from `all`/inner `k`: the original shadowed the builtin
        # `all` and reused the outer loop variable `k` for the dimension loop.
        score_sum = defaultdict(lambda: 0)
        tot = defaultdict(lambda: 0)
        valid = defaultdict(lambda: 0)
        for key in ans:
            res = ans[key]['res']
            assert isinstance(res, pd.DataFrame)
            for i in range(len(res)):
                line = res.iloc[i]
                for dim in self.DIMS:
                    tot[dim] += 1
                    if dim in line and line[dim] is not None:
                        try:
                            score = int(line[dim])
                            score = np.clip(score, 0, 10)
                            score_sum[dim] += score
                            valid[dim] += 1
                        except Exception as e:
                            print(f'Failed to parse the score: {str(e)}')
        sp1 = {'set': 'all'}
        sp1.update({dim: score_sum[dim] / tot[dim] * 10 for dim in self.DIMS})
        # NOTE(review): if no turn parsed for a dimension, valid[dim] == 0 and
        # this raises ZeroDivisionError (unchanged from the original behavior).
        sp2 = {'set': 'valid'}
        sp2.update({dim: score_sum[dim] / valid[dim] * 10 for dim in self.DIMS})

        return pd.DataFrame([sp1, sp2])

    def evaluate(self, eval_file, **judge_kwargs):
        """Judge each dialogue with an LLM (resumable via a .pkl cache) and dump a CSV.

        Fix: the original read ``judge_kwargs['model']`` (raising KeyError when
        absent) and only later popped it with a ``'gpt-4o'`` default. Popping
        once up front keeps identical behavior when 'model' is supplied and
        makes the documented default actually reachable.
        """
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.pop('model', 'gpt-4o')

        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')       # resume cache
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)

        data = load(eval_file)
        judge_model = build_judge(model=model, **judge_kwargs)

        lines = [data.iloc[i] for i in range(len(data))]
        tups = [(judge_model, line) for line in lines]
        indices = [line['index'] for line in lines]

        ans = {}
        if osp.exists(tmp_file):
            ans = load(tmp_file)

        # Skip samples already judged in a previous (possibly interrupted) run.
        tups = [x for x, i in zip(tups, indices) if i not in ans]
        indices = [i for i in indices if i not in ans]

        from .utils.mmdu import mmdu_score

        if len(indices):
            track_progress_rich(
                mmdu_score,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=tmp_file,
            )
            ans = load(tmp_file)
            # Sanity check: every freshly judged sample landed in the cache.
            for k in indices:
                assert k in ans

        metric = self.calculat_metric(ans)
        dump(metric, score_file)
        return metric
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py
ADDED
|
@@ -0,0 +1,1475 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import tempfile
|
| 4 |
+
from functools import partial
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from .image_base import ImageBaseDataset
|
| 9 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 10 |
+
from ..smp import *
|
| 11 |
+
from ..utils import track_progress_rich
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ImageVQADataset(ImageBaseDataset):
    """VQA benchmarks scored by rule-based answer matching (no LLM judge)."""

    TYPE = 'VQA'

    DATASET_URL = {
        'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
        'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
        'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
        'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
        'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
        'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
        'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
        'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
        'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
    }

    DATASET_MD5 = {
        'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
        'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
        'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
        'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
        'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
        'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
        'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
        'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
        'GQA_TestDev_Balanced': '99b62f22e224d9b2f32dcbe41359d1c9',
    }

    def build_prompt(self, line):
        """Build the base prompt and append the short-answer instruction."""
        msgs = super().build_prompt(line)
        assert msgs[-1]['type'] == 'text'
        msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
        return msgs

    # It returns a DataFrame
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions with the dataset-appropriate matching rule.

        Fixes: the worker pool is now closed via a context manager (it
        previously leaked), and the ``round(2)`` result is assigned back —
        ``DataFrame.round`` returns a new frame, so the original call was a
        no-op and the dumped CSV was unrounded.
        """
        from .utils.vqa_eval import hit_calculate, process_line

        data = load(eval_file)
        dataset = self.dataset_name
        assert 'answer' in data and 'prediction' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        lines = [data.iloc[i] for i in range(len(data))]

        # Per-dataset matching rule; None -> defer to process_line's own default.
        if listinstr(['TextVQA'], dataset):
            method = 'vqa_score'
        elif listinstr(['ChartQA'], dataset):
            method = 'relaxed_accuracy'
        elif listinstr(['OCRVQA', 'GQA'], dataset):
            method = 'accuracy'
        elif listinstr(['DocVQA', 'InfoVQA'], dataset):
            method = 'anls'
        else:
            method = None
        with mp.Pool(16) as pool:
            if method is None:
                res = pool.map(process_line, lines)
            else:
                res = pool.map(partial(process_line, method=method), lines)

        ret = dict()
        if 'split' in data:
            for sp in set(data['split']):
                sub = [r for ln, r in zip(lines, res) if ln['split'] == sp]
                ret[sp] = np.mean(hit_calculate(sub, dataset)) * 100
            ret['Overall'] = np.mean(hit_calculate(res, dataset)) * 100
        else:
            ret['Overall'] = np.mean(hit_calculate(res, dataset)) * 100
            if 'category' in data:
                for c in sorted(set(data['category'])):
                    sub = [r for ln, r in zip(lines, res) if ln['category'] == c]
                    ret[c] = np.mean(hit_calculate(sub, dataset)) * 100
        # round() returns a new frame: must reassign for rounding to take effect.
        ret = d2df(ret).round(2)

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(ret, result_file)
        return ret
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class VizWiz(ImageBaseDataset):
    """VizWiz VQA benchmark, scored with the rule-based VQA-score matcher."""

    TYPE = 'VQA'
    DATASET_URL = {
        'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'
    }
    DATASET_MD5 = {
        'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'
    }

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Compute (or reuse a cached) overall VQA score and return it as a DataFrame.

        Fixes: the first parameter of this classmethod was misleadingly named
        ``self``; the worker pool is now closed via a context manager (it
        previously leaked); and ``round(2)`` is assigned back — the original
        discarded the returned frame, leaving the dumped CSV unrounded.
        """
        from .utils.vqa_eval import hit_calculate, process_line

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')

        # Reuse a previously computed score file if one exists.
        if not osp.exists(result_file):
            data = load(eval_file)
            assert 'answers' in data and 'prediction' in data
            data['prediction'] = [str(x) for x in data['prediction']]
            data['answer'] = [str(x) for x in data['answers']]

            lines = [data.iloc[i] for i in range(len(data))]
            with mp.Pool(16) as pool:
                res = pool.map(process_line, lines)

            hit = hit_calculate(res, 'VizWiz')
            ret = d2df({'Overall': np.mean(hit) * 100})
            # round() returns a new frame: must reassign for rounding to take effect.
            ret = ret.round(2)

            dump(ret, result_file)

        return pd.read_csv(result_file)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class OCRBench(ImageBaseDataset):
    """OCRBench: substring-match scoring across ten OCR-related categories."""

    TYPE = 'VQA'
    DATASET_URL = {
        'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
    }
    DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}

    # It returns a dictionary
    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score predictions by normalized substring match and dump a score JSON.

        Fix: the score path was built with ``eval_file.replace('.xlsx', ...)``,
        which is a no-op for any other suffix — the JSON would then overwrite
        the predictions file itself. The path is now derived from the actual
        suffix, matching the convention used elsewhere in this module.
        """
        OCRBench_score = {
            'Regular Text Recognition': 0,
            'Irregular Text Recognition': 0,
            'Artistic Text Recognition': 0,
            'Handwriting Recognition': 0,
            'Digit String Recognition': 0,
            'Non-Semantic Text Recognition': 0,
            'Scene Text-centric VQA': 0,
            'Doc-oriented VQA': 0,
            'Key Information Extraction': 0,
            'Handwritten Mathematical Expression Recognition': 0,
        }

        def _norm_hmer(s):
            # LaTeX expressions: remove all whitespace, keep case.
            return s.strip().replace('\n', ' ').replace(' ', '')

        def _norm_text(s):
            # Plain text: case-insensitive, newlines collapsed to spaces.
            return s.lower().strip().replace('\n', ' ')

        data = load(eval_file)
        lines = [data.iloc[i] for i in range(len(data))]
        for line in tqdm(lines):
            predict = str(line['prediction'])
            # NOTE(review): `eval` on TSV content — the benchmark file is
            # trusted here, but `ast.literal_eval` would be the safer choice.
            answers = eval(line['answer'])
            category = line['category']
            if category == 'Handwritten Mathematical Expression Recognition':
                norm = _norm_hmer
            else:
                norm = _norm_text
            predict_norm = norm(predict)
            # A hit if any ground-truth answer appears inside the prediction.
            if any(norm(ans) in predict_norm for ans in answers):
                OCRBench_score[category] += 1

        recognition_keys = [
            'Regular Text Recognition', 'Irregular Text Recognition',
            'Artistic Text Recognition', 'Handwriting Recognition',
            'Digit String Recognition', 'Non-Semantic Text Recognition',
        ]
        final_score_dict = {}
        final_score_dict['Text Recognition'] = sum(OCRBench_score[k] for k in recognition_keys)
        final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
        final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
        final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
        final_score_dict['Handwritten Mathematical Expression Recognition'] = \
            OCRBench_score['Handwritten Mathematical Expression Recognition']
        final_score_dict['Final Score'] = (
            final_score_dict['Text Recognition']
            + final_score_dict['Scene Text-centric VQA']
            + final_score_dict['Doc-oriented VQA']
            + final_score_dict['Key Information Extraction']
            + final_score_dict['Handwritten Mathematical Expression Recognition']
        )
        final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10
        suffix = eval_file.split('.')[-1]
        score_pth = eval_file.replace(f'.{suffix}', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
class MathVista(ImageBaseDataset):
    """MathVista-MINI: free-form math VQA scored via an LLM judge."""

    TYPE = 'VQA'
    DATASET_URL = {
        'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
    }
    DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions with an LLM, caching per-sample results for resume.

        Per-sample judge outputs are cached in a ``.pkl`` beside ``eval_file``;
        already-judged samples are skipped on re-runs. Judged predictions are
        written to ``*_{model}.xlsx`` and the accuracy table to
        ``*_{model}_score.csv``; the accuracy DataFrame is returned.
        Requires ``judge_kwargs['model']`` and a working judge API.
        """
        from .utils.mathvista import MathVista_auxeval, MathVista_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')  # judged predictions
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')  # resume cache
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            # `model` is rebound here from the model *name* to the judge object.
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
                # Drop samples already judged in a previous (interrupted) run.
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVista_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Sanity check: on-disk cache agrees with what was just computed.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MathVista_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
class MathVerse(ImageBaseDataset):
    """MathVerse: math VQA judged in two LLM stages (answer extraction, then scoring)."""

    TYPE = 'VQA'
    DATASET_URL = {
        'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
        'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa
        'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa
        'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa
        'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa
        'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa
    }
    DATASET_MD5 = {
        'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
        'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
        'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
        'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
        'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
        'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
    }

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Run the two-stage MathVerse evaluation and return the accuracy table.

        Stage 1 extracts the model's final answer with an LLM judge; stage 2
        scores the extracted answer against the ground truth. Each stage is
        resumable via its own ``.pkl`` cache, and each writes its intermediate
        DataFrame to an ``.xlsx`` so a finished stage is never re-run.
        Requires ``judge_kwargs['model']`` and a working judge API.
        """
        from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
        tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
        storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        # stage1: extract the answer
        if not osp.exists(storage_extract):
            data = load(eval_file)
            # `model` is rebound here from the model *name* to the judge object.
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file_extract):
                ans = load(tmp_file_extract)
                # Drop samples already extracted in a previous (interrupted) run.
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_extract,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_extract,
                )
                ans = load(tmp_file_extract)
                # Sanity check: on-disk cache agrees with what was just computed.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']

            data['extract'] = [ans[idx]['extract'] for idx in data['index']]
            data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
            dump(data, storage_extract)

        # stage2: score the answer
        if not osp.exists(storage_score):
            data = load(storage_extract)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file_score):
                ans = load(tmp_file_score)
                # Drop samples already scored in a previous (interrupted) run.
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_score,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_score,
                )
                ans = load(tmp_file_score)
                # Sanity check: on-disk cache agrees with what was just computed.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']

            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
            dump(data, storage_score)

        score = MathVerse_acc(storage_score)
        score_pth = storage_score.replace('.xlsx', '.csv')
        dump(score, score_pth)
        return score
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
class MathVision(ImageBaseDataset):
    """MATH-Vision benchmark: free-form math VQA whose answers are graded by an LLM judge."""
    TYPE = 'VQA'
    DATASET_URL = {
        'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
        'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
    }
    DATASET_MD5 = {
        'MathVision': '93f6de14f7916e598aa1b7165589831e',
        'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
    }

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Grade predictions in ``eval_file`` with an LLM judge and return the score table.

        Per-sample judge outputs are cached in a ``.pkl`` tmp file so an interrupted
        run can resume; the merged per-sample table is stored as an ``.xlsx``.
        """
        from .utils.mathv import MATH_V_auxeval, MATH_V_acc

        # Judge model name: explicit kwarg wins; otherwise derive it from LOCAL_LLM.
        # NOTE(review): if 'model' is absent AND LOCAL_LLM is unset, os.environ.get
        # returns None and os.path.basename raises TypeError — confirm callers.
        if 'model' in judge_kwargs:
            model = judge_kwargs['model']
        else:
            model = os.path.basename(os.environ.get('LOCAL_LLM'))
        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')  # merged judged results
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')  # per-sample resume cache
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            # `model` is rebound here from the name string to the judge object.
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            # Resume: drop samples already present in the cached results.
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MATH_V_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Sanity check: returned results must match what was persisted.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MATH_V_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
class OlympiadBench(ImageBaseDataset):
    """OlympiadBench: olympiad-level math/physics problems, scored by a rule-based judger."""
    TYPE = 'VQA_ex_prompt'
    DATASET_URL = {
        'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
        'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
        'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
    }
    DATASET_MD5 = {
        'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
        'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
        'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
    }

    def dump_image(self, line):
        """Decode the record's base64 image(s) into .jpg files and return their paths.

        Multi-image records get ``<index>--<k>.jpg`` names; single images get
        ``<index>.jpg``. Existing, readable files are not re-decoded.
        """
        os.makedirs(self.img_root, exist_ok=True)

        tgt_path_z = []
        if isinstance(line['image'], list):
            for i in range(len(line['image'])):
                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'][i], tgt_path)
                tgt_path_z.append(tgt_path)
        else:
            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
            if not read_ok(tgt_path):
                decode_base64_to_image_file(line['image'], tgt_path)
            tgt_path_z.append(tgt_path)
        return tgt_path_z

    def build_prompt(self, line):
        """Build the competition-style instruction prompt (Chinese or English,
        theorem-proving or open-ended) followed by the record's images."""

        from .utils.olympiadbench import get_answer_type_text, make_input

        # Question flavor flags are derived from the 'source' field.
        self.is_chinese = 'zh' in line['source']
        self.is_math = 'maths' in line['source']
        self.is_theorem_proving = 'TP' in line['source']

        if self.is_chinese:
            subject_content = '数学' if self.is_math else '物理'
            if self.is_theorem_proving:
                prompt = (
                    f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
                    "证明过程中使用的变量和公式请使用LaTeX格式表示。"
                )
            else:
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
                                                        multiple_answer=line['is_multiple_answer'])
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
                else:
                    multiple_answer_text = '\\boxed{答案}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(单位)'
                    unit_text = ',注意答案的单位不要放在\\boxed{}中'
                prompt = (
                    f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
                    f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
                    f'显式给出结果{unit_text}。'
                )
        else:
            subject_content = 'Math' if self.is_math else 'Physics'
            if self.is_theorem_proving:
                prompt = (
                    f'The following is a theorem proving problem from an International {subject_content} competition. '
                    'Please use logical reasoning and common theorems to prove the proposition in the problem '
                    'according to the given requirements. '
                    'Please use LaTeX format to represent the variables and formulas used in the proof.'
                )
            else:
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{multiple answers connected with commas}'
                else:
                    multiple_answer_text = '\\boxed{answer}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(unit)'
                    unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
                                                        multiple_answer=line['is_multiple_answer'])
                prompt = (
                    f'The following is an open-ended problem from an International {subject_content} competition. '
                    f'{answer_type_text}Please calculate the answer according to the given requirements and '
                    'the information provided. Please use LaTeX format to represent the variables and formulas '
                    'used in the solution process and results. Please end your solution with "So the final answer '
                    f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
                )

        # Physics problems may carry extra context; math problems never do here.
        if self.is_math:
            input = make_input(prompt, line['question'])
        else:
            if 'context' in line.keys() and str(line['context']) != 'nan':  # cannot be null
                input = make_input(prompt, line['context'] + '\n' + line['question'])
            else:
                input = make_input(prompt, line['question'])

        ret = [dict(type='text', value=input)]
        tgt_path = self.dump_image(line)

        ret.extend([dict(type='image', value=s) for s in tgt_path])

        return ret

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions with the rule-based ``MathJudger`` and return the accuracy table.

        Two cached artifacts: a per-sample judged .xlsx and an aggregated .csv with
        fine-grained / language-subject / subject / average accuracies.
        """
        from .utils.olympiadbench import MathJudger, extract_answer
        judger = MathJudger()

        suffix = eval_file.split('.')[-1]
        name_str1 = 'judge'
        name_str2 = 'score'
        result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            scorez = []

            for i in tqdm(data.iterrows()):
                line = i[1]
                model_answer = line['prediction']
                is_chinese = 'zh' in line['source']
                model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
                answer_type = line['answer_type']

                # NOTE(review): strips the first/last two characters of the stored
                # answer (presumably a "['...']" wrapper) — confirm TSV format.
                final_answer = line['final_answer'][2:-2]

                if str(answer_type) != 'nan' and 'Tuple' in answer_type:
                    judge_result = judger.judge(model_answer, final_answer)
                else:
                    if str(line['error']) != 'nan':
                        if ',' in line['error']:
                            # One tolerance per sub-answer; empty entries fall back to 1e-8.
                            precisions = line['error'].split(',')
                            precisions = [float(p) if p else 1e-8 for p in precisions]
                            judge_result = judger.judge(model_answer, final_answer, precisions)
                        else:
                            precision = float(line['error'])
                            judge_result = judger.judge(model_answer, final_answer, precision)
                    else:
                        judge_result = judger.judge(model_answer, final_answer)
                scorez.append(judge_result)

            data['score'] = scorez
            dump(data, result_file)

        judge_file = load(result_file)

        if not osp.exists(score_file):
            name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
                         'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
                         'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']

            # Bucket per-sample scores by their source subset.
            sample_list = [[] for _ in range(len(name_list))]
            for i in judge_file.iterrows():
                line = i[1]
                for j in range(len(name_list)):
                    if line['source'] == name_list[j]:
                        sample_list[j].append(line['score'])

            acc_dict = {}
            correct_list = []

            # fine-grained
            for i in range(len(name_list)):
                correct_num = 0
                for j in sample_list[i]:
                    if j:
                        correct_num += 1
                correct_list.append(correct_num)
                acc = 100 * correct_num / len(sample_list[i])
                acc_dict[name_list[i]] = [acc]

            # 4 grained: language x subject
            labela = ['zh', 'en']
            labelb = ['maths', 'physics']

            grain_list = [[x,y] for x in labela for y in labelb]
            for j in grain_list:
                dict_name = j[0] + "_" + j[1]
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if all(k in name_list[i] for k in j):
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                acc = 100 * correct_num / full_num
                acc_dict[dict_name] = [acc]

            # 2 grained: subject only
            grain_list = ['maths', 'physics']
            for j in grain_list:
                dict_name = j
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if j in name_list[i]:
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                acc = 100 * correct_num / full_num
                acc_dict[dict_name] = [acc]

            # AVG over all samples
            correct_num = sum(correct_list)
            acc = 100 * correct_num / len(judge_file)
            acc_dict['AVG'] = [acc]

            acc_pd = pd.DataFrame(acc_dict)
            # NOTE(review): CSV is written with GBK but read back below with
            # pandas defaults — works while contents stay ASCII; confirm intended.
            acc_pd.to_csv(score_file, index=False, encoding='gbk')

        accdz = pd.read_csv(score_file)
        return accdz
class WeMath(ImageBaseDataset):
    """We-Math benchmark; choice answers judged by an LLM or exact matching."""
    TYPE = 'VQA'
    DATASET_URL = {
        'WeMath': 'https://opencompass.openxlab.space/utils/VLMEval/WeMath.tsv'
    }
    DATASET_MD5 = {'WeMath': '056142c89b09d864702450b5b5ea0913'}

    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions (LLM when available, otherwise exact matching) and
        return the combined We-Math score table (accuracy + four-dimension metrics)."""
        from .utils.wemath import wemath_evaluate_models, wemath_accuracy
        from .utils.multiple_choice import mcq_vanilla_eval

        # model = judge_kwargs['model']
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Fall back to exact matching when no working judge API is available.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage) and model is not None:
            data = load(eval_file)
            result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

            # NOTE(review): redundant — eval_file was already loaded just above.
            data = load(eval_file)
            data = data.sort_values(by='index')
            data['prediction'] = [str(x) for x in data['prediction']]
            # If not choice label, then use lower case
            for k in data.keys():
                data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

            meta = self.data
            # Verify eval_file rows are a subset of the dataset (matched on index/question).
            meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
            data_map = {x: y for x, y in zip(data['index'], data['question'])}
            for k in data_map:
                assert k in meta_q_map, (
                    f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
                )
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

            if 'id' in data.columns:
                # Rename the column (translated from Chinese: "change column name")
                data.rename(columns={'id': 'ID'}, inplace=True)
            dump(data, storage)
        # Score the judged file when present; otherwise score raw predictions directly.
        if osp.exists(storage):
            accuracy_scores = wemath_evaluate_models(storage)
            four_dim_scores = wemath_accuracy(storage)
        else:
            accuracy_scores = wemath_evaluate_models(eval_file)
            four_dim_scores = wemath_accuracy(eval_file)
        combine_score = {**accuracy_scores, **four_dim_scores}
        combine_score = pd.DataFrame(combine_score)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(combine_score, score_pth)
        return combine_score
class LogicVista(ImageBaseDataset):
    """LogicVista: multimodal logical-reasoning benchmark, judged by an LLM."""
    TYPE = 'VQA'
    DATASET_URL = {
        'LogicVista': 'https://opencompass.openxlab.space/utils/VLMEval/LogicVista.tsv'
    }
    DATASET_MD5 = {'LogicVista': '41c5d33adf33765c399e0e6ae588c061'}

    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions (LLM-assisted when available) and return accuracy scores.

        Per-sample judge outputs are cached in a ``.pkl`` so interrupted runs can
        resume; the judged table is stored as an ``.xlsx``.
        """
        from .utils.logicvista import LogicVista_auxeval, evaluate_logicvista

        # model = judge_kwargs['model']
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Fall back to exact matching when no working judge API is available.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{name_str}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage) and model is not None:
            data = load(eval_file)
            # Rebind `model` from the name string to the judge object.
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('LogicVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            # Resume: drop samples already present in the cached results.
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    LogicVista_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Sanity check: returned results must match what was persisted.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] and ans[k]['hit'] == v['hit']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            data['hit'] = [ans[idx]['hit'] for idx in data['index']]

            dump(data, storage)
        # BUGFIX: previously `accuracy_scores` was only assigned when `storage`
        # existed, so the exact-matching path (model is None, no cached storage)
        # raised NameError at the return below. Fall back to scoring the raw
        # eval_file, mirroring the WeMath fallback.
        # NOTE(review): confirm evaluate_logicvista handles a raw prediction file.
        if osp.exists(storage):
            accuracy_scores = evaluate_logicvista(storage)
        else:
            accuracy_scores = evaluate_logicvista(eval_file)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(accuracy_scores, score_pth)

        return accuracy_scores
class LLaVABench(ImageBaseDataset):
    """LLaVA-Bench (in-the-wild): answer quality rated by a GPT judge."""
    TYPE = 'VQA'
    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
    DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rate every answer with a GPT judge, then aggregate into a rounded score table."""
        from .utils.llavabench import (
            build_prompt,
            LLaVABench_atomeval,
            LLaVABench_score,
        )

        suffix = '.' + eval_file.split('.')[-1]
        record_file = eval_file.replace(suffix, '_openai_result' + suffix)  # per-sample ratings
        score_file = eval_file.replace(suffix, '_score.csv')                # aggregated scores
        nproc = judge_kwargs.pop('nproc', 4)
        system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'

        if not osp.exists(record_file):
            data = load(eval_file)
            lines = [data.iloc[i] for i in range(len(data))]
            # Judge configured with temperature=0.2 and a fixed system prompt.
            model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
            assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            prompts = [build_prompt(line) for line in lines]
            tups = [(model, prompt) for prompt in prompts]
            scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
            # Each result is a pair: element 0 -> gpt4_score, element 1 -> score.
            data['gpt4_score'] = [x[0] for x in scores]
            data['score'] = [x[1] for x in scores]
            dump(data, record_file)

        data = load(record_file)
        ret = LLaVABench_score(data).round(1)
        dump(ret, score_file)
        return ret
class MMVet(ImageBaseDataset):
    """MM-Vet benchmark: open-ended VQA scored per sample by a GPT judge."""
    TYPE = 'VQA'
    DATASET_URL = {
        'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv',
        'MMVet_Hard': 'http://opencompass.openxlab.space/utils/VLMEval/MMVet_Hard.tsv'
    }
    DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3', 'MMVet_Hard': '63a598819a936a2e77c410a78a21ff16'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Grade predictions with an LLM judge; returns the overall score DataFrame.

        Per-sample judge outputs are cached in a ``.pkl`` so interrupted runs resume.
        """
        from .utils.mmvet import MMVet_auxeval, MMVet_acc

        suffix = eval_file.split('.')[-1]
        # NOTE(review): raises KeyError when 'model' is not supplied — confirm
        # callers always pass it (other datasets here default to exact_matching).
        model = judge_kwargs['model']
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        if not osp.exists(storage):
            data = load(eval_file)
            # max_tokens=3: the judge only needs to emit a short score token.
            model = build_judge(max_tokens=3, **judge_kwargs)
            assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already present in the cached results.
            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MMVet_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Sanity check: returned results must match what was persisted.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score, score_fine = MMVet_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
        dump(score, score_pth)
        dump(score_fine, score_fine_pth)
        return score
class MTVQADataset(ImageBaseDataset):
    """MTVQA test split: multilingual text-centric VQA scored by substring match."""
    TYPE = 'VQA'
    DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
    DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score each prediction by normalized substring match and return
        per-category accuracies (plus 'Average'), each normalized to [0, 100]."""
        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data and 'category' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        if 'split' in data:
            assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '

        def _norm(text):
            # Case-insensitive comparison with dots stripped.
            return text.strip().lower().replace('.', '')

        per_category = defaultdict(list)
        for _, row in data.iterrows():
            hit = 1.0 if _norm(row['answer']) in _norm(row['prediction']) else 0.0
            per_category[row['category']].append(hit)
            per_category['Average'].append(hit)
        # Mean score per category, scaled to [0, 100].
        category_averages = {category: np.mean(scores) * 100 for category, scores in per_category.items()}

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.json')
        dump(category_averages, result_file)

        return category_averages

    # MT-VQA adopts a custom prompt
    def build_prompt(self, line):
        """Append the answer-in-question-language instruction to the base prompt."""
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        instruction = '\nAnswer the question using a word or phrase in the language of the question.'
        for seg in msgs:
            if seg['type'] == 'text':
                seg['value'] += instruction
        return msgs
class TableVQABench(ImageBaseDataset):
    """TableVQA-Bench: four table-understanding splits, each with its own scorer."""
    TYPE = 'VQA'
    DATASET_URL = {
        'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
    }
    DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}

    from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score each split with its dedicated evaluator; returns an accuracy DataFrame."""
        import pandas as pd
        from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq

        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data

        # Strip a leading "Answer: " prefix from model outputs.
        data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
        data_group = dict(tuple(data.groupby('split')))

        # Per-split scorer dispatch; vwtq and vwtq_syn share the WTQ evaluator.
        scorers = {
            'fintabnetqa': evaluate_fintabnet,
            'vtabfact': evaluate_tabfact,
            'vwtq': evaluate_wtq,
            'vwtq_syn': evaluate_wtq,
        }
        eval_result = {'split': [], 'average_scores': []}
        for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
            records = data_group[split].to_dict(orient='records')
            split_eval_meta = scorers[split](records, ['accuracy'])
            eval_result['split'].append(split)
            eval_result['average_scores'].append(split_eval_meta['average_scores'])

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        eval_result = pd.DataFrame(eval_result)
        dump(eval_result, result_file)

        return eval_result

    # TableVQABench adopts a custom prompt
    def build_prompt(self, line):
        """Wrap the question into the split-specific prompt template."""
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        split = line['split']
        if split == 'fintabnetqa':
            template = self.FINTABNETQA_PROMPT
        elif split == 'vtabfact':
            template = self.VTABFACT_PROMPT
        elif split in ('vwtq_syn', 'vwtq'):
            template = self.VWTQ_PROMPT
        else:
            # Unknown split: leave the question untouched (matches prior behavior).
            template = None
        for seg in msgs:
            if seg['type'] == 'text' and template is not None:
                seg['value'] = template.format_map({'question': seg['value']})
        return msgs
class CustomVQADataset(ImageBaseDataset):
    """User-supplied VQA dataset loaded from a local TSV; scoring is left to the user."""
    TYPE = 'VQA'

    def load_data(self, dataset):
        """Load ``<LMUDataRoot>/<dataset>.tsv``; files over 1 GB are localized first."""
        data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        if file_size(data_path, 'GB') <= 1:
            return load(data_path)

        # Large file: work from a localized copy, rebuilt when missing or forced.
        local_path = data_path.replace('.tsv', '_local.tsv')
        if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
            from ..tools import LOCALIZE

            LOCALIZE(data_path, local_path)
        return load(local_path)

    def evaluate(self, eval_file, **judge_kwargs):
        # Custom datasets must provide their own scoring.
        raise NotImplementedError
class CRPE(ImageBaseDataset):
    """CRPE: relation-probing VQA with exist/subject/predicate/object categories."""
    TYPE = 'VQA'
    DATASET_URL = {
        'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
        'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
    }
    DATASET_MD5 = {
        'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
        'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Compute per-category and overall accuracy; dumps a JSON next to eval_file."""
        from .utils.crpe import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        categories = ['exist', 'subject', 'predicate', 'object', 'total']
        hits = {c: 0 for c in categories}
        totals = {c: 0 for c in categories}

        data = load(eval_file)
        rows = [data.iloc[i] for i in range(len(data))]
        for line in tqdm(rows):
            prediction = str(line['prediction'])
            reference = str(line['answer'])
            category = line['category']
            if is_correct(reference, prediction):
                hits[category] += 1
                hits['total'] += 1
            totals[category] += 1
            totals['total'] += 1

        # None marks categories with no samples (avoids division by zero).
        final_score_dict = {
            c: (hits[c] / totals[c]) if totals[c] != 0 else None for c in categories
        }

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        """Resolve relative image paths against the dataset's local image root."""
        ROOT = LMUDataRoot()
        msgs = super().build_prompt(line)
        img_root = osp.join(ROOT, 'images', self.dataset_name)
        for msg in msgs:
            if msg['type'] == 'image':
                msg['value'] = osp.join(img_root, msg['value'])
        return msgs
class QSpatial(ImageBaseDataset):
    """Q-Spatial-Bench: VQA benchmark for quantitative spatial reasoning.

    Predictions and ground truths are converted to centimeters and scored with
    the ratio metrics delta_2 / delta_1.5 (prediction within 2x / 1.5x of the
    ground truth counts as correct).
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'QSpatial_plus': '',
        'QSpatial_scannet': ''
    }

    # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
    # Once you get the permission, you can use the helper code here to download and extract necessary images:
    # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
    qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
    url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"

    def post_build(self, dataset):
        """Fetch the official prompt templates from GitHub and cache them on the instance."""
        import urllib.request

        template_names = [
            "system_prompt.txt",
            "spatial_prompt_single.txt",
            "spatial_prompt_steps.txt",
            "standard_prompt.txt",
            "zero_shot_prompt.txt",
        ]
        contents = {}
        with tempfile.TemporaryDirectory() as temp_dir:
            for name in template_names:
                tgt_path = os.path.join(temp_dir, name)
                # urllib.request instead of os.system("wget ..."): no external
                # binary required and no shell interpolation of the URL.
                urllib.request.urlretrieve(self.url + name, tgt_path)
                with open(tgt_path, encoding='utf-8') as f:
                    contents[name] = f.read()

        self.system_prompt = contents["system_prompt.txt"]
        self._prompt_templates = dict(
            spatial_prompt_single=contents["spatial_prompt_single.txt"],
            spatial_prompt_steps=contents["spatial_prompt_steps.txt"],
            standard_prompt=contents["standard_prompt.txt"],
            zero_shot_prompt=contents["zero_shot_prompt.txt"],
        )

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Render the single-step spatial prompt template for one record."""
        from jinja2.sandbox import SandboxedEnvironment
        env = SandboxedEnvironment()
        template = self._prompt_templates["spatial_prompt_single"]
        text_prompt = env.from_string(template).render(question=line["question"])
        tgt_path = self.dump_image(line)

        if isinstance(tgt_path, list):
            msgs = [dict(type='image', value=p) for p in tgt_path]
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
        return msgs

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        """Load the requested split from HuggingFace and return a dataframe with
        base64-encoded images and a leading `index` column."""
        import io
        import pandas as pd  # noqa: F401 (kept for parity with sibling loaders)
        from datasets import load_dataset

        hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
        df = hf_dataset.to_pandas()

        df.reset_index(drop=True, inplace=True)
        df['index'] = df.index
        df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
        # Put `index` first: downstream tooling expects it as the leading column.
        df = df[['index'] + [col for col in df.columns if col != 'index']]

        if dataset == "QSpatial_scannet":
            # ScanNet images are not shipped with the HF dataset; read them from disk.
            df = df.drop(columns=["image"])
            df["image"] = [Image.open(os.path.join(self.qspatial_root, p)) for p in df["image_path"]]
        else:
            df["image"] = [Image.open(io.BytesIO(d["bytes"])) for d in df["image"]]

        df["image"] = [encode_image_to_base64(image) for image in df["image"]]
        return df

    @classmethod
    def get_multiplier(cls, unit):
        """Return the factor converting a length in `unit` to centimeters (0. if unknown)."""
        unit = unit.lower()
        if unit in ["meters", "meter", "m", "metre", "metres"]:
            return 100
        elif unit in ["centimeters", "centimeter", "cm"]:
            return 1
        elif unit in ["feet", "foot", "ft"]:
            return 30.48
        elif unit in ["inch", "inches", "in"]:
            return 2.54
        elif unit in ["mm"]:
            return 0.1
        print(f"Unknown unit: {unit}")
        return 0.

    @classmethod
    def parse_string(cls, input_str):
        """Parse a "(value, unit)" or "(lo-hi, unit)" string into centimeters.

        Returns 0 when the string does not match; may raise ValueError for a
        malformed number (handled by the caller).
        """
        match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
        if not match:
            print(f"Unable to parse the input string {input_str}")
            return 0
        number_part, unit = match.group(1), match.group(2)
        if '-' in number_part:
            # A range like "3-5" is scored against its midpoint.
            lo, hi = map(float, number_part.split('-'))
            number = (lo + hi) / 2
        else:
            number = float(number_part)
        return number * cls.get_multiplier(unit)

    @classmethod
    def parse_prediction(cls, vlm_response):
        """Extract the last `scalar{...}` / `distance_unit{...}` box from a VLM
        response and convert the value to centimeters.

        Raises IndexError when either box is missing (handled by the caller).
        """
        str_inside_scalar_boxes = re.findall(r'scalar{([^}]*)}', vlm_response)[-1]
        scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
        parsed_scalar = np.array(scalar_list).astype(float).mean()

        parsed_unit = re.findall(r'distance_unit{([^}]*)}', vlm_response)[-1]

        return parsed_scalar * cls.get_multiplier(parsed_unit)

    # It returns a dictionary
    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score an inference result file; dump and return the delta metrics."""
        import ast

        data = load(eval_file)
        if "model" in judge_kwargs:
            from .utils.qspatial import QSpatial_auxeval

            # extract using model
            model = judge_kwargs['model']
            suffix = eval_file.split('.')[-1]
            storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
            tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
            nproc = judge_kwargs.pop('nproc', 4)

            if not osp.exists(storage):
                model = build_judge(max_tokens=128, **judge_kwargs)
                assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
                lines = [data.iloc[i] for i in range(len(data))]
                tups = [(model, line) for line in lines]
                indices = [line['index'] for line in lines]

                ans = {}
                # Resume from the temp file: only re-query records not answered yet.
                if osp.exists(tmp_file):
                    ans = load(tmp_file)
                    tups = [x for x, i in zip(tups, indices) if i not in ans]
                    indices = [i for i in indices if i not in ans]

                if len(indices):
                    new_results = track_progress_rich(
                        QSpatial_auxeval,
                        tups,
                        nproc=nproc,
                        chunksize=nproc,
                        keys=indices,
                        save=tmp_file,
                    )
                    ans = load(tmp_file)
                    for k, v in zip(indices, new_results):
                        assert k in ans
                        assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

                data['res'] = [ans[idx]['res'] for idx in data['index']]
                data['log'] = [ans[idx]['log'] for idx in data['index']]
                dump(data, storage)

            data = load(storage)

            pred_value_in_cms = []
            for res in data["res"]:
                try:
                    pred_value_in_cms.append(cls.parse_string(res))
                except ValueError:
                    pred_value_in_cms.append(0.)
            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
        else:
            # regex parsing
            pred_value_in_cms = []
            n_errors_in_parsing = 0
            for pred in data["prediction"]:
                try:
                    parsed_value = cls.parse_prediction(pred)
                except IndexError:
                    n_errors_in_parsing += 1
                    parsed_value = 1e-8
                pred_value_in_cms.append(parsed_value)

            print(f"Encounter {n_errors_in_parsing} errors in parsing")
            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8

        # Ground truth
        ground_truth_value_in_cms = []
        for answer in data["answer"]:
            # literal_eval instead of eval: the answer column holds a
            # "(value, unit)" tuple literal, and eval on file contents is an
            # injection risk.
            value, unit = ast.literal_eval(answer)
            ground_truth_value_in_cms.append(value * cls.get_multiplier(unit))
        ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8

        # Calculate the score: correct when max(pred/gt, gt/pred) < threshold.
        pred_gt = pred_value_in_cms / ground_truth_value_in_cms
        gt_pred = ground_truth_value_in_cms / pred_value_in_cms
        delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
        delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5

        data["eval_score_delta_2"] = delta_2
        data["eval_score_delta_1_point_5"] = delta_1_point_5

        final_score_dict = {
            "delta_2": delta_2.mean(),
            "delta_1_point_5": delta_1_point_5.mean()
        }
        for question_type in set(data["question_type"]):
            filtered_data = data[data["question_type"] == question_type]
            final_score_dict[f"{question_type}_delta_2"] = filtered_data["eval_score_delta_2"].mean()
            final_score_dict[f"{question_type}_delta_1_point_5"] = filtered_data["eval_score_delta_1_point_5"].mean()

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
|
| 1307 |
+
|
| 1308 |
+
|
| 1309 |
+
class MMNIAH(ImageBaseDataset):
    """MM-NIAH (Multimodal Needle-In-A-Haystack) long-context VQA benchmark."""

    TYPE = 'VQA'
    DATASET_URL = {
        'MM_NIAH_VAL':
        'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
        'MM_NIAH_TEST':
        ['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
    DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
                   'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}

    def prepare_tsv(self, url, file_md5=None):
        """Download the dataset tsv (reassembling the multi-part TEST split),
        localize it when larger than 1 GB, and return the loaded dataframe."""
        import os
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        elif file_name == 'MM_NIAH_TEST.tsv':
            warnings.warn('The dataset tsv is not downloaded')
            # NOTE(review): each part download is given `data_path` as the target —
            # confirm download_file derives the on-disk name from the URL, since
            # the parts are later read back as `part-*` files.
            for i in range(len(url)):
                if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))):
                    print('part_a' + chr(ord('a') + i) + ' is existed')
                    continue
                download_file(url[i], data_path)
            file_prefix = 'part-'
            output_file = data_path
            split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
            with open(output_file, 'wb') as outfile:
                # Read each split part in order and append it to the output file.
                for filename in split_files:
                    with open(osp.join(data_root, filename), 'rb') as infile:
                        outfile.write(infile.read())
            update_flag = True
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Compute per-category and total accuracy; dump and return the score dict."""
        from .utils.mmniah import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        categories = ['find-image', 'count-text', 'find-text',
                      'infer-choose', 'count-image', 'visual-reasoning', 'total']
        MMNIAH_score = {c: 0 for c in categories}
        MMNIAH_num = {c: 0 for c in categories}
        final_score_dict = {}
        data = load(eval_file)
        lines = [data.iloc[i] for i in range(len(data))]
        for line in tqdm(lines):
            predict = line['prediction']
            answers = line['answer']
            category = line['category']
            if category in ['visual-reasoning', 'find-image']:
                # These categories store integer option indices.
                answers = int(answers)
            if is_correct(answers, predict):
                MMNIAH_score[category] += 1
                MMNIAH_score['total'] += 1
            MMNIAH_num[category] += 1
            MMNIAH_num['total'] += 1

        for category in categories:
            if MMNIAH_num[category] != 0:
                final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
            else:
                final_score_dict[category] = None

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        """Rebuild the interleaved text/image message list for one MM-NIAH record."""
        import ast

        msgs = super().build_prompt(line)
        if isinstance(line, int):
            line = self.data.iloc[line]
        # literal_eval instead of eval: these columns hold plain python literals
        # loaded from the tsv, and eval on file contents is an injection risk.
        choices, choices_image = ast.literal_eval(line['multi-choice options'])
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        context = ast.literal_eval(msgs[-1]['value'])
        question = context[0] + '\n' + context[1]
        # tgt_path collects the image paths stored in the leading messages.
        tgt_path = [msgs[i]['value'] for i in range(len(msgs) - 1)]
        if choices:
            for c_idx, c in enumerate(choices):
                question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
            question += "\nAnswer with the option's letter from the given choices directly."
        elif choices_image:
            for c_idx in range(len(choices_image)):
                question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
            question += "\nAnswer with the option's letter from the given choices directly."
        else:
            question += '\nAnswer the question using a single word or phrase.'
        question = '<start>' + question + '<end>'
        question = question.split('<image>')
        if choices_image:
            # NOTE(review): the `-5` offsets look like they assume exactly four
            # trailing image options — confirm against the dataset format.
            for i in range(len(question) - 5):
                question[i] = question[i] + '\n<image>'
            for i in range(len(question) - 5, len(question) - 1):
                question[i] = question[i] + '<image>'
        else:
            for i in range(len(question) - 1):
                question[i] = question[i] + '\n<image>'
        assert len(tgt_path) + 1 == len(question)
        context = []
        for i in range(len(tgt_path)):
            context.append(question[i])
            context.append(tgt_path[i])
        context.append(question[-1])
        context[0] = context[0][7:]      # strip the '<start>' sentinel
        context[-1] = context[-1][:-5]   # strip the '<end>' sentinel
        ROOT = LMUDataRoot()
        image_dir = osp.join(ROOT, 'images', self.dataset_name)
        msgs = []
        for i, piece in enumerate(context):
            if i % 2 == 0:
                msgs.append(dict(type='text', value=piece))
            else:
                msgs.append(dict(type='image', value=osp.join(image_dir, piece)))
        # Drop empty messages without mutating the list during iteration
        # (the original `msgs.remove` inside a `for` over `msgs` skipped the
        # element following each removal).
        msgs = [m for m in msgs if m['value'] != '']
        return msgs
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ..smp import *
|
| 2 |
+
from ..utils import *
|
| 3 |
+
from .image_base import ImageBaseDataset
|
| 4 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ImageYORNDataset(ImageBaseDataset):
    """Yes-or-No image benchmarks (MME, HallusionBench, POPE, AMBER)."""

    TYPE = 'Y/N'

    DATASET_URL = {
        'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
        'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
        'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
        'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
    }

    DATASET_MD5 = {
        'MME': 'b36b43c3f09801f5d368627fb92187c3',
        'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
        'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
        'AMBER': '970d94c0410916166e0a76ba75da7934',
    }

    # It returns a dataframe
    def evaluate(self, eval_file, **judge_kwargs):
        """Extract Yes/No answers from predictions (optionally via an LLM judge
        for ambiguous ones), score them, and return the dataset-specific rating."""
        from .utils.yorn import YOrN_Extraction, YOrN_auxeval
        from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating

        dataset = self.dataset_name
        data = load(eval_file)
        data['prediction'] = [str(x) for x in data['prediction']]
        storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            # Rule-based extraction first; merge in any cached judge results.
            extraction = {idx: YOrN_Extraction(pred) for idx, pred in zip(data['index'], data['prediction'])}
            if osp.exists(tmp_file):
                cached = load(tmp_file)
                for idx in cached:
                    if extraction[idx] == 'Unknown' and cached[idx] != 'Unknown':
                        extraction[idx] = cached[idx]

            data['extracted'] = [extraction[idx] for idx in data['index']]
            unknown = data[data['extracted'] == 'Unknown']

            judge = judge_kwargs.get('model', 'exact_matching')
            if judge == 'exact_matching':
                judge = None
            elif gpt_key_set():
                judge = build_judge(**judge_kwargs)
                if not judge.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    judge = None
            else:
                judge = None
                warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')

            if judge is not None:
                # Let the LLM judge resolve the records rule-based extraction missed.
                pending = [unknown.iloc[i] for i in range(len(unknown))]
                tasks = [(judge, row) for row in pending]
                keys = list(unknown['index'])
                if len(tasks):
                    results = track_progress_rich(
                        YOrN_auxeval, tasks, nproc=nproc, chunksize=nproc, keys=keys, save=tmp_file)
                    for idx, res in zip(keys, results):
                        extraction[idx] = res

            data['extracted'] = [extraction[idx] for idx in data['index']]
            dump(data, storage)

        data = load(storage)
        if listinstr(['AMBER'], dataset):
            # AMBER answers are compared case-insensitively.
            data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
        else:
            data['score'] = (data['answer'] == data['extracted'])
        dump(data, storage)

        if dataset is not None and listinstr(['MME'], dataset):
            score = MME_rating(storage)
        elif dataset is not None and listinstr(['Hallusion'], dataset):
            score = Hallusion_rating(storage)
        elif dataset is not None and listinstr(['POPE'], dataset):
            score = POPE_rating(storage)
        elif dataset is not None and listinstr(['AMBER'], dataset):
            score = AMBER_rating(storage)
        else:
            score = default_rating(storage)

        score_tgt = eval_file.replace('.xlsx', '_score.csv')
        dump(score, score_tgt)
        return score
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import snapshot_download
|
| 2 |
+
from ..smp import *
|
| 3 |
+
from .video_base import VideoBaseDataset
|
| 4 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 5 |
+
from glob import glob
|
| 6 |
+
|
| 7 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def timestamp_to_seconds(timestamp):
    """Convert an "HH:MM:SS(.fff)" timestamp string into total seconds (float)."""
    hours, minutes, seconds = timestamp.split(":")
    # Hours and minutes are whole numbers; only the seconds field may carry a fraction.
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def uniformly_subsample(lst, K):
    """Pick K elements evenly spread over lst.

    Returns lst itself (not a copy) when it already has at most K items.
    """
    total = len(lst)
    if K >= total:
        return lst
    stride = total / K
    return [lst[int(pos * stride)] for pos in range(K)]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def insert_subtitles_into_frames(
    frames,
    frame_timestamps,
    subtitles,
    starting_timestamp_for_subtitles,
    duration,
):
    """Interleave subtitle text entries with video frames, ordered by timestamp.

    Frames are emitted in order; each subtitle is placed after the frames whose
    timestamps do not exceed the subtitle's midpoint, and is kept only when at
    least one frame falls strictly inside its (possibly widened) time window.
    """
    merged = []
    limit = min(len(frames), len(frame_timestamps))
    next_frame = 0  # index of the first frame not yet emitted

    for subtitle in subtitles:
        if "timestamp" in subtitle:
            start, end = subtitle["timestamp"]
            # A non-float end marks an open-ended subtitle: clamp to the video duration.
            if not isinstance(end, float):
                end = duration
            text = subtitle["text"]
        else:
            start = timestamp_to_seconds(subtitle["start"])
            end = timestamp_to_seconds(subtitle["end"])
            text = subtitle["line"]

        start -= starting_timestamp_for_subtitles
        end -= starting_timestamp_for_subtitles
        midpoint = (start + end) / 2

        # Emit every remaining frame at or before the subtitle midpoint.
        while next_frame < limit and frame_timestamps[next_frame] <= midpoint:
            merged.append({"type": "image", "value": frames[next_frame]})
            next_frame += 1

        # Widen very short windows to a full second around the midpoint.
        if end - start < 1:
            start, end = midpoint - 0.5, midpoint + 0.5

        # Keep the subtitle only if some frame lies strictly inside its window.
        if any(start < ts < end for _, ts in zip(frames, frame_timestamps)):
            merged.append({"type": "text", "value": text + "\n"})

    # Flush the frames that come after the last subtitle.
    for frame, _ in zip(frames[next_frame:], frame_timestamps[next_frame:]):
        merged.append({"type": "image", "value": frame})
    return merged
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class LongVideoBench(VideoBaseDataset):
|
| 91 |
+
|
| 92 |
+
MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9'
|
| 93 |
+
SYS = ''
|
| 94 |
+
|
| 95 |
+
TYPE = 'Video-MCQ'
|
| 96 |
+
|
| 97 |
+
def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1):
|
| 98 |
+
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
|
| 99 |
+
self.use_subtitle = use_subtitle
|
| 100 |
+
self.dataset_name = dataset
|
| 101 |
+
|
| 102 |
+
@classmethod
|
| 103 |
+
def supported_datasets(cls):
|
| 104 |
+
return ['LongVideoBench']
|
| 105 |
+
|
| 106 |
+
def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'):
|
| 107 |
+
|
| 108 |
+
def check_integrity(pth):
|
| 109 |
+
data_file = osp.join(pth, f'{dataset_name}.tsv')
|
| 110 |
+
|
| 111 |
+
if not osp.exists(data_file):
|
| 112 |
+
return False
|
| 113 |
+
|
| 114 |
+
if md5(data_file) != self.MD5:
|
| 115 |
+
print("md5 mismatch", md5(data_file), self.MD5)
|
| 116 |
+
return False
|
| 117 |
+
data = load(data_file)
|
| 118 |
+
for video_pth in data['video_path']:
|
| 119 |
+
if not osp.exists(osp.join(pth, video_pth)):
|
| 120 |
+
print(video_pth, "is not found")
|
| 121 |
+
return False
|
| 122 |
+
return True
|
| 123 |
+
|
| 124 |
+
if modelscope_flag_set():
|
| 125 |
+
repo_id = "AI-ModelScope/LongVideoBench"
|
| 126 |
+
|
| 127 |
+
cache_path = get_cache_path(repo_id)
|
| 128 |
+
if cache_path is not None and check_integrity(cache_path):
|
| 129 |
+
dataset_path = cache_path
|
| 130 |
+
else:
|
| 131 |
+
def generate_tsv(pth):
|
| 132 |
+
data_file = osp.join(pth, f'{dataset_name}.tsv')
|
| 133 |
+
if osp.exists(data_file) and md5(data_file) == self.MD5:
|
| 134 |
+
return
|
| 135 |
+
|
| 136 |
+
data_file = pd.read_json(osp.join(pth, 'lvb_val.json'))
|
| 137 |
+
data_file = data_file.assign(index=range(len(data_file)))
|
| 138 |
+
data_file['video'] = data_file['video_id']
|
| 139 |
+
data_file['video_path'] = data_file['video_path'].apply(lambda x: f'./videos/{x}')
|
| 140 |
+
|
| 141 |
+
data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)
|
| 142 |
+
|
| 143 |
+
if modelscope_flag_set():
|
| 144 |
+
from modelscope import dataset_snapshot_download
|
| 145 |
+
dataset_snapshot_download(dataset_id=repo_id)
|
| 146 |
+
else:
|
| 147 |
+
snapshot_download(repo_id=repo_id, repo_type='dataset')
|
| 148 |
+
print("All videos are downloaded for LongVideoBench")
|
| 149 |
+
|
| 150 |
+
if not glob(osp.join(cache_path, "videos")):
|
| 151 |
+
tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True)
|
| 152 |
+
|
| 153 |
+
def untar_video_data(tar_file, cache_dir):
|
| 154 |
+
import tarfile
|
| 155 |
+
with tarfile.open(tar_file, "r") as tar_ref:
|
| 156 |
+
tar_ref.extractall(cache_dir)
|
| 157 |
+
print(f"Extracted all files from {tar_file} to {cache_dir}")
|
| 158 |
+
|
| 159 |
+
def concat_tar_parts(tar_parts, output_tar):
|
| 160 |
+
with open(output_tar, "wb") as out_tar:
|
| 161 |
+
from tqdm import tqdm
|
| 162 |
+
for part in tqdm(sorted(tar_parts)):
|
| 163 |
+
with open(part, "rb") as part_file:
|
| 164 |
+
out_tar.write(part_file.read())
|
| 165 |
+
print(f"Concatenated parts {tar_parts} into {output_tar}")
|
| 166 |
+
|
| 167 |
+
tar_parts_dict = {}
|
| 168 |
+
|
| 169 |
+
# Group tar parts together
|
| 170 |
+
for tar_file in tar_files:
|
| 171 |
+
base_name = tar_file.split(".tar")[0]
|
| 172 |
+
if base_name not in tar_parts_dict:
|
| 173 |
+
tar_parts_dict[base_name] = []
|
| 174 |
+
tar_parts_dict[base_name].append(tar_file)
|
| 175 |
+
|
| 176 |
+
# Concatenate and untar split parts
|
| 177 |
+
for base_name, parts in tar_parts_dict.items():
|
| 178 |
+
print(f"Extracting following tar files: {parts}")
|
| 179 |
+
output_tar = base_name + ".tar"
|
| 180 |
+
if not osp.exists(output_tar):
|
| 181 |
+
print('Start concatenating tar files')
|
| 182 |
+
|
| 183 |
+
concat_tar_parts(parts, output_tar)
|
| 184 |
+
print('Finish concatenating tar files')
|
| 185 |
+
|
| 186 |
+
if not osp.exists(osp.join(cache_path, osp.basename(base_name))):
|
| 187 |
+
untar_video_data(output_tar, cache_path)
|
| 188 |
+
|
| 189 |
+
print('All videos are extracted for LongVideoBench')
|
| 190 |
+
|
| 191 |
+
dataset_path = cache_path
|
| 192 |
+
generate_tsv(dataset_path)
|
| 193 |
+
|
| 194 |
+
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
|
| 195 |
+
|
| 196 |
+
return dict(data_file=data_file, root=dataset_path)
|
| 197 |
+
|
| 198 |
+
def save_video_frames(self, video_path, video_llm=False):
|
| 199 |
+
|
| 200 |
+
vid_path = osp.join(self.data_root, video_path)
|
| 201 |
+
vid = decord.VideoReader(vid_path)
|
| 202 |
+
video_info = {
|
| 203 |
+
'fps': vid.get_avg_fps(),
|
| 204 |
+
'n_frames': len(vid),
|
| 205 |
+
}
|
| 206 |
+
if self.nframe > 0 and self.fps < 0:
|
| 207 |
+
step_size = len(vid) / (self.nframe + 1)
|
| 208 |
+
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
|
| 209 |
+
frame_paths = self.frame_paths(video_path[:-4])
|
| 210 |
+
elif self.fps > 0:
|
| 211 |
+
# not constrained by num_frames, get frames by fps
|
| 212 |
+
total_duration = video_info['n_frames'] / video_info['fps']
|
| 213 |
+
required_frames = int(total_duration * self.fps)
|
| 214 |
+
step_size = video_info['fps'] / self.fps
|
| 215 |
+
indices = [int(i * step_size) for i in range(required_frames)]
|
| 216 |
+
frame_paths = self.frame_paths_fps(video_path[:-4], len(indices))
|
| 217 |
+
|
| 218 |
+
flag = np.all([osp.exists(p) for p in frame_paths])
|
| 219 |
+
|
| 220 |
+
if not flag:
|
| 221 |
+
images = [vid[i].asnumpy() for i in indices]
|
| 222 |
+
images = [Image.fromarray(arr) for arr in images]
|
| 223 |
+
for im, pth in zip(images, frame_paths):
|
| 224 |
+
if not osp.exists(pth) and not video_llm:
|
| 225 |
+
im.save(pth)
|
| 226 |
+
|
| 227 |
+
return frame_paths, indices, video_info
|
| 228 |
+
|
| 229 |
+
# def save_video_into_images(self, line, num_frames=8):
|
| 230 |
+
# frame_paths, indices, video_info = self.save_video_frames(line['video_path'], num_frames)
|
| 231 |
+
# return frame_paths
|
| 232 |
+
|
| 233 |
+
    def build_prompt(self, line, video_llm):
        """Build the multimodal message for one LongVideoBench sample.

        ``line`` may be an integer index into ``self.data`` or a dataframe
        row. When ``video_llm`` is True the raw video file is attached;
        otherwise sampled frames (optionally interleaved with subtitles)
        are used.
        """
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm)
        fps = video_info["fps"]

        message = [dict(type='text', value=self.SYS)]
        if video_llm:
            message.append(dict(type='video', value=osp.join(self.data_root, line['video_path'])))
        else:
            # NOTE(review): subtitles are inserted when ``use_subtitle`` is
            # False, which looks inverted — confirm against the upstream
            # LongVideoBench implementation before changing.
            if not self.use_subtitle:
                with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f:
                    subtitles = json.load(f)

                # Interleave frame images with the subtitle text active at
                # each sampled timestamp (frame index / fps -> seconds).
                frame_message = insert_subtitles_into_frames(
                    frames,
                    [ind_ / fps for ind_ in indices],
                    subtitles,
                    line["starting_timestamp_for_subtitles"],
                    line["duration"]
                )

                message += frame_message
            else:
                for im in frames:
                    message.append(dict(type='image', value=im))

        # Append the lettered candidate options (A., B., ...) to the question.
        line['question'] += '\n' + '\n'.join(
            ["{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(eval(line['candidates']))]
        )
        prompt = line["question"] + "\nAnswer with the option's letter from the given choices directly."
        message.append(dict(type='text', value=prompt))
        return message
|
| 268 |
+
|
| 269 |
+
# It returns a dictionary
|
| 270 |
+
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score an xlsx of predictions and return the per-dimension rating.

        Letter answers are first matched with a regex; samples the regex
        cannot resolve are passed to a judge LLM (when configured and
        working). Unresolvable samples keep a -1 score.
        """
        from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.get('model', 'exact_matching')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None  # regex matching only, no judge model
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # Reload cached answers, dropping entries that previously failed.
            # NOTE(review): ``res`` is built but not referenced again in this
            # method — confirm whether the cache is consumed elsewhere.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                # 'correct_choice' is stored as an int offset; map to a letter.
                ans = data.loc[data['index'] == idx, 'correct_choice'].values[0]
                ans = chr(ord("A") + ans)
                pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])

                if extract_characters_regex(pred) == '':
                    # Regex failed: fall back to the judge model (or FAIL_MSG).
                    extract_pred = extract_option(
                        model,
                        data.loc[data['index'] == idx].to_dict(orient='records')[0],
                        'LongVideoBench'
                    )
                    # NOTE(review): rows are addressed by label ``idx`` here,
                    # which assumes the dataframe index equals the 'index'
                    # column — confirm for filtered/re-indexed inputs.
                    data.loc[idx, 'score'] = int(extract_pred == ans)
                else:
                    data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
from .image_base import ImageBaseDataset
|
| 7 |
+
from ..smp import *
|
| 8 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 9 |
+
from ..utils import track_progress_rich
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def generate_prompt(d):
    """Build the grading prompt sent to the judge model.

    ``d`` supplies the instruction ('question'), the model output
    ('prediction'), and stringified lists of grading components
    ('components') and their weights ('component_weight'), plus
    'num_of_component' (values 1-5 are supported).
    """
    instruction = d['question']
    weight_list = eval(d['component_weight'])
    comp_list = eval(d['components'])
    count = int(d['num_of_component'])
    model_resp = d['prediction']

    if count == 1:
        comp_txt = f"The first component is: '{comp_list[0]}'. "
        weight_txt = f"The first component is worth: {weight_list[0]} scores. "
    elif count == 2:
        comp_txt = (
            f"The first component is: '{comp_list[0]}', "
            f"and the second component is '{comp_list[1]}'. "
        )
        weight_txt = f"The first and second component is each worth {weight_list[0]} and {weight_list[1]} scores. "
    elif count == 3:
        comp_txt = (
            f"The first component is: '{comp_list[0]}', and the second component is '{comp_list[1]}', "
            f"and the third component is '{comp_list[2]}'. "
        )
        weight_txt = (
            "The first, second, and third component is each worth "
            f"{weight_list[0]}, {weight_list[1]}, and {weight_list[2]} scores."
        )
    elif count == 4:
        comp_txt = (
            f"The first component is: '{comp_list[0]}', and the second component is '{comp_list[1]}', "
            f"and the third component is '{comp_list[2]}', and the fourth component is '{comp_list[3]}'. "
        )
        weight_txt = (
            "The first, second, third, and fourth component is each worth "
            f"{weight_list[0]}, {weight_list[1]}, {weight_list[2]}, and {weight_list[3]} scores."
        )
    elif count == 5:
        comp_txt = (
            f"The first component is: '{comp_list[0]}', and the second component is '{comp_list[1]}', "
            f"and the third component is '{comp_list[2]}', and the fourth component is '{comp_list[3]}', "
            f"and the fifth component is '{comp_list[4]}'. "
        )
        weight_txt = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weight_list[0]}, {weight_list[1]}, {weight_list[2]}, {weight_list[3]}, and {weight_list[4]} scores."
        )

    return (
        f"Here is an instruction for a multimodal LLM: '{instruction}'. "
        "You need to grade if the response from the model follows each component of the instruction. "
        f"{comp_txt}"
        f"The response is: '{model_resp}'. "
        "You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{weight_txt}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def process_rawscore(component_type, raw_score):
    """Parse the judge's score sentence into per-component fractions.

    Expects the first sentence of ``raw_score`` to look like::

        score of component 1: x/2, score of component 2: y/8, total score: z/10

    Returns a dict mapping each entry of ``component_type`` to its fractional
    score, plus a 'total_score' entry taken from the last field.

    The original implementation indexed ``first_sentence[i + 1]`` after the
    loop, which raised NameError whenever there were no component fields;
    the total is now read from the last field directly.
    """
    fields = raw_score.split('.')[0].split(',')
    score_dict = {}
    # All but the last comma-separated field are per-component scores;
    # zip truncates safely if the judge emitted fewer fields than expected.
    for comp, field in zip(component_type, fields[:-1]):
        frac = field.split(':')[1][1:].split('/')
        score_dict[comp] = int(frac[0]) / int(frac[1])
    # The final field always carries the total score.
    total_frac = fields[-1].split(':')[1][1:].split('/')
    score_dict['total_score'] = int(total_frac[0]) / int(total_frac[1])
    return score_dict
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def get_score_dict(data, score_raw):
    """Aggregate per-sample judge scores into per-component averages.

    ``data['component_type'][i]`` must be a stringified list such as
    ``"['a', 'b']"``; ``score_raw[i]`` is the judge's raw answer for sample
    ``i``. Samples whose answer cannot be parsed are skipped.
    """
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            # Strip the "['" / "']" wrappers and split the literal list.
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                cat_score_dict.setdefault(key, []).append(val)
        except Exception:
            # Malformed judge output for this sample: skip it rather than
            # aborting the aggregation. Narrowed from a bare `except:` so
            # KeyboardInterrupt/SystemExit still propagate.
            pass
    return {key: sum(val) / len(val) for key, val in cat_score_dict.items()}
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class MIABench(ImageBaseDataset):
    """MIA-Bench instruction-following benchmark, graded by a judge LLM."""

    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Grade the predictions in ``eval_file`` with a judge model.

        Builds one text+image grading prompt per sample (images are pulled
        from the original TSV by index), fans the prompts out to the judge
        via ``track_progress_rich`` with caching in a pkl tmp file, stores
        the raw judge answers in an xlsx, then aggregates them into
        per-component average scores written to a CSV.
        """
        judge_name = judge_kwargs.pop('model', 'gpt-4o')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]
            # Re-load the original dataset to recover the base64 images,
            # keyed by sample index.
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            # Resume from cached judge responses if a tmp file exists.
            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)

            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp
            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)

        return results
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py
ADDED
|
@@ -0,0 +1,455 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import huggingface_hub
|
| 2 |
+
from huggingface_hub import snapshot_download
|
| 3 |
+
from ..smp import *
|
| 4 |
+
from .video_concat_dataset import ConcatVideoDataset
|
| 5 |
+
from .video_base import VideoBaseDataset
|
| 6 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 7 |
+
from ..utils import track_progress_rich
|
| 8 |
+
import torchvision.transforms as T
|
| 9 |
+
from torchvision import transforms
|
| 10 |
+
from torchvision.transforms.functional import InterpolationMode
|
| 11 |
+
from decord import VideoReader, cpu
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import imageio
|
| 14 |
+
import cv2
|
| 15 |
+
import zipfile
|
| 16 |
+
import os
|
| 17 |
+
import glob
|
| 18 |
+
from .utils.mlvu import *
|
| 19 |
+
|
| 20 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class MLVU(ConcatVideoDataset):
    """Aggregate wrapper combining the MCQ and open-ended MLVU subsets."""

    def __init__(self, dataset='MLVU', nframe=0, fps=-1):
        self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded']
        # Task names feeding the multiple-choice (M-Avg) and generation
        # (G-Avg) aggregate rows respectively.
        # NOTE(review): 'order' is an MLVU_MCQ task but is absent from the
        # M-Avg list — confirm whether that exclusion is intended.
        self.type_data_dict = {
            'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'],
            'G-Avg':['sub_scene', 'summary']
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU']

    def evaluate(self, eval_file, **judge_kwargs):
        """Run the concatenated evaluation, then append M-Avg / G-Avg rows.

        Each aggregate row sums 'success' and 'overall' over its member
        tasks; the final table is dumped to ``<eval_file>_acc.csv``.
        """
        result = super().evaluate(eval_file=eval_file, **judge_kwargs)
        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        for key in self.type_data_dict:
            result.loc[key] = 0.0
            for name, item in result.iterrows():
                if name in self.type_data_dict[key]:
                    result.loc[key, 'success'] += item['success']
                    result.loc[key, 'overall'] += item['overall']
            if key == 'G-Avg':
                # Generation score is reported as a plain ratio (no * 100),
                # rounded to 2 decimals.
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'], 2
                )
            else:
                # MCQ accuracy is reported as a percentage with 1 decimal.
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'] * 100, 1
                )
        result = result.reset_index().rename(columns={'index': 'task'})
        dump(result, score_file)
        return result
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class MLVU_MCQ(VideoBaseDataset):
    """Multiple-choice subset of the MLVU long-video benchmark."""

    # Expected md5 of the generated TSV, used to decide whether to rebuild.
    MD5 = 'bb5c37e7cf8d43fc9a25c23d2b4633f5'
    BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
    SYS = BASE_SYS + 'Based on your observations, select the best option that accurately addresses the question.'
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MLVU_MCQ', nframe=0, fps=-1):
        # task name -> (json annotation file, video directory, task kind)
        self.type_data_list = {
            'plotQA': ('1_plotQA.json', './MLVU/video/1_plotQA', 'MCQ'),
            'needle': ('2_needle.json', './MLVU/video/2_needle', 'MCQ'),
            'ego': ('3_ego.json', './MLVU/video/3_ego', 'MCQ'),
            'count': ('4_count.json', './MLVU/video/4_count', 'MCQ'),
            'order': ('5_order.json', './MLVU/video/5_order', 'MCQ'),
            'anomaly_reco': ('6_anomaly_reco.json', './MLVU/video/6_anomaly_reco', 'MCQ'),
            'topic_reasoning': ('7_topic_reasoning.json', './MLVU/video/7_topic_reasoning', 'MCQ'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU_MCQ']

    def prepare_dataset(self, dataset_name='MLVU_MCQ', repo_id='MLVU/MVLU'):
        """Download (if needed) and index the dataset; return root + TSV path."""
        def check_integrity(pth):
            # A cache is valid when the TSV exists, matches MD5, and every
            # referenced video file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/MLVU"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Flatten the per-task JSON annotations into one TSV.
                # NOTE: this closure reads ``dataset_path`` from the
                # enclosing scope; it is only called after that name is
                # bound by the download branch below.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': v[1],
                            'duration': data['duration'],
                            'video': data['video'],
                            'question': data['question'],
                            'answer': data['answer'],
                            'candidates': data['candidates'],
                        })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')

            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Format a row into (lettered-options question, '(X) answer') pair."""
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def save_video_frames(self, line):
        """Sample frames for one row and cache them to disk; return paths."""
        suffix = line['video'].split('.')[-1]
        video = line['video'].replace(f'.{suffix}','')
        vid_path = osp.join(self.data_root, line['prefix'], line['video'])
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        # NOTE(review): if neither nframe (>0) nor fps (>0) is set, the
        # locals below stay unbound and this raises UnboundLocalError.
        if self.nframe > 0 and self.fps < 0:
            # Uniform sampling of nframe frames.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video)
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video, len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        # Thin alias kept for interface compatibility with other datasets.
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the message list (system text, question, video or frames)."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score MCQ predictions (judge model with exact-matching fallback)."""
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None  # string matching only, no judge model
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # Reload cached judge answers, dropping previously failed ones.
            # NOTE(review): ``res`` is not referenced again below — confirm
            # whether the cache is consumed inside check_ans_with_model.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                # Ground truth rendered as "(X) answer text".
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                # Expose options as A/B/C... keys and rewrite 'answer' to the
                # bare letter, as expected by check_ans_with_model.
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MLVU_MCQ'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
class MLVU_OpenEnded(VideoBaseDataset):
|
| 276 |
+
|
| 277 |
+
MD5 = 'cee573a3627c6ac434ded704c60511ba'
|
| 278 |
+
BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
|
| 279 |
+
SYS = BASE_SYS + 'Based on your observations, answer the given questions.'
|
| 280 |
+
TYPE = 'Video-VQA'
|
| 281 |
+
|
| 282 |
+
def __init__(self, dataset='MLVU_OpenEnded', nframe=0, fps=-1):
|
| 283 |
+
self.type_data_list = {
|
| 284 |
+
'sub_scene': ('8_sub_scene.json', './MLVU/video/8_sub_scene', 'VQA'),
|
| 285 |
+
'summary': ('9_summary.json', './MLVU/video/9_summary', 'VQA')
|
| 286 |
+
}
|
| 287 |
+
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
|
| 288 |
+
|
| 289 |
+
@classmethod
|
| 290 |
+
def supported_datasets(cls):
|
| 291 |
+
return ['MLVU_OpenEnded']
|
| 292 |
+
|
| 293 |
+
def prepare_dataset(self, dataset_name='MLVU_OpenEnded', repo_id='MLVU/MVLU'):
|
| 294 |
+
def check_integrity(pth):
|
| 295 |
+
data_file = osp.join(pth, f'{dataset_name}.tsv')
|
| 296 |
+
|
| 297 |
+
if not os.path.exists(data_file):
|
| 298 |
+
return False
|
| 299 |
+
|
| 300 |
+
if md5(data_file) != self.MD5:
|
| 301 |
+
return False
|
| 302 |
+
|
| 303 |
+
data = load(data_file)
|
| 304 |
+
for idx, item in data.iterrows():
|
| 305 |
+
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
|
| 306 |
+
return False
|
| 307 |
+
return True
|
| 308 |
+
|
| 309 |
+
if modelscope_flag_set():
|
| 310 |
+
repo_id = "AI-ModelScope/MLVU"
|
| 311 |
+
|
| 312 |
+
cache_path = get_cache_path(repo_id)
|
| 313 |
+
if cache_path is not None and check_integrity(cache_path):
|
| 314 |
+
dataset_path = cache_path
|
| 315 |
+
else:
|
| 316 |
+
def generate_tsv(pth):
|
| 317 |
+
data_file = osp.join(pth, f'{dataset_name}.tsv')
|
| 318 |
+
if os.path.exists(data_file) and md5(data_file) == self.MD5:
|
| 319 |
+
return
|
| 320 |
+
json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
|
| 321 |
+
self.data_list = []
|
| 322 |
+
for k, v in self.type_data_list.items():
|
| 323 |
+
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
|
| 324 |
+
json_data = json.load(f)
|
| 325 |
+
for data in json_data:
|
| 326 |
+
self.data_list.append({
|
| 327 |
+
'task_type': k,
|
| 328 |
+
'prefix': v[1],
|
| 329 |
+
'duration': data['duration'],
|
| 330 |
+
'video': data['video'],
|
| 331 |
+
'question': data['question'],
|
| 332 |
+
'answer': data['answer'],
|
| 333 |
+
'scoring_points': data['scoring_points'] if 'scoring_points' in data else ''
|
| 334 |
+
})
|
| 335 |
+
|
| 336 |
+
data_df = pd.DataFrame(self.data_list)
|
| 337 |
+
data_df = data_df.assign(index=range(len(data_df)))
|
| 338 |
+
data_df.to_csv(data_file, sep='\t', index=False)
|
| 339 |
+
|
| 340 |
+
if modelscope_flag_set():
|
| 341 |
+
from modelscope import dataset_snapshot_download
|
| 342 |
+
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
|
| 343 |
+
else:
|
| 344 |
+
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
|
| 345 |
+
huggingface_hub.login(hf_token)
|
| 346 |
+
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
|
| 347 |
+
|
| 348 |
+
generate_tsv(dataset_path)
|
| 349 |
+
|
| 350 |
+
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
|
| 351 |
+
return dict(root=dataset_path, data_file=data_file)
|
| 352 |
+
|
| 353 |
+
def qa_template(self, data):
    """Return the (question, answer) pair for one raw data record.

    The question is coerced to ``str``; the answer is passed through
    unchanged.
    """
    return str(data['question']), data['answer']
|
| 357 |
+
|
| 358 |
+
def save_video_frames(self, line):
    """Decode the video referenced by *line* and cache sampled frames to disk.

    Sampling mode is either a fixed frame count (``self.nframe > 0`` with
    ``self.fps < 0``) or a fixed rate (``self.fps > 0``). Frame images that
    already exist on disk are reused. Returns the list of frame image paths.
    """
    # Strip the file extension to obtain the video id used for frame naming.
    suffix = line['video'].split('.')[-1]
    video = line['video'].replace(f'.{suffix}','')
    vid_path = osp.join(self.data_root, line['prefix'], line['video'])
    vid = decord.VideoReader(vid_path)
    video_info = {
        'fps': vid.get_avg_fps(),
        'n_frames': len(vid),
    }
    if self.nframe > 0 and self.fps < 0:
        # Fixed count: sample nframe indices evenly, skipping both ends.
        step_size = len(vid) / (self.nframe + 1)
        indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
        frame_paths = self.frame_paths(video)
    elif self.fps > 0:
        # not constrained by num_frames, get frames by fps
        total_duration = video_info['n_frames'] / video_info['fps']
        required_frames = int(total_duration * self.fps)
        step_size = video_info['fps'] / self.fps
        indices = [int(i * step_size) for i in range(required_frames)]
        frame_paths = self.frame_paths_fps(video, len(indices))
    # NOTE(review): if neither branch above is taken (nframe <= 0 and
    # fps <= 0), `indices` and `frame_paths` are unbound and a NameError
    # follows — presumably the base class guarantees one mode is set; confirm.

    flag = np.all([osp.exists(p) for p in frame_paths])

    if not flag:
        # Decode and save frames only when at least one image is missing.
        images = [vid[i].asnumpy() for i in indices]
        images = [Image.fromarray(arr) for arr in images]
        for im, pth in zip(images, frame_paths):
            if not osp.exists(pth):
                im.save(pth)

    return frame_paths
|
| 389 |
+
|
| 390 |
+
def save_video_into_images(self, line):
    """Materialize the video in *line* as frame images; returns their paths.

    Thin wrapper around :meth:`save_video_frames`.
    """
    frame_paths = self.save_video_frames(line)
    return frame_paths
|
| 393 |
+
|
| 394 |
+
def build_prompt(self, line, video_llm):
    """Build the multimodal message list for one sample.

    *line* may be an integer index into ``self.data`` or an already-selected
    row. For video-native models (``video_llm=True``) the raw video path is
    attached as a single 'video' item; otherwise sampled frame images are
    attached as 'image' items.
    """
    if isinstance(line, int):
        assert line < len(self)
        line = self.data.iloc[line]

    question, answer = self.qa_template(line)
    # System prompt first, then the question text.
    message = [dict(type='text', value=self.SYS, role='system')]
    message.append(dict(type='text', value=question))
    video_path = os.path.join(self.data_root, line['prefix'], line['video'])
    if video_llm:
        message.append(dict(type='video', value=video_path))
    else:
        img_frame_paths = self.save_video_into_images(line)
        for im in img_frame_paths:
            message.append(dict(type='image', value=im))
    return message
|
| 410 |
+
|
| 411 |
+
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
    # NOTE(review): declared @classmethod but the first parameter is named
    # `self`; it receives the class object here. Helper names
    # (MLVU_OpenEnded_generate, MLVU_OpenEnded_extract, get_dimension_rating,
    # system_prompt_sub_scene, system_prompt_summary) are presumably imported
    # earlier in this module — confirm.
    """Grade open-ended predictions in *eval_file* with a GPT judge.

    Forces the judge model to 'gpt-4-0125', resumes from a partial .pkl
    cache when present, writes a per-sample score file, and returns the
    per-dimension rating.
    """
    model = judge_kwargs['model'] if 'model' in judge_kwargs else judge_kwargs.setdefault('model', 'gpt-4-0125')
    if model != 'gpt-4-0125':
        print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125')
        judge_kwargs['model'] = 'gpt-4-0125'

    suffix = eval_file.split('.')[-1]
    score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
    tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
    nproc = judge_kwargs.pop('nproc', 4)

    if not osp.exists(score_file):
        data = load(eval_file)
        # One judge per task type, each with its own system prompt.
        model_dict = {
            'sub_scene': build_judge(system_prompt=system_prompt_sub_scene, **judge_kwargs),
            'summary': build_judge(system_prompt=system_prompt_summary, **judge_kwargs)
        }
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        tups = [(model_dict[line['task_type']], line) for line in lines]
        indices = [line['index'] for line in lines]

        # Resume: drop samples already judged in a previous partial run.
        ans = {}
        if osp.exists(tmp_file):
            ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

        if len(indices):
            _ = track_progress_rich(
                MLVU_OpenEnded_generate,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=tmp_file,
            )
            ans = load(tmp_file)
        data = MLVU_OpenEnded_extract(ans, data)
        dump(data, score_file)

    rating = get_dimension_rating(score_file)
    return rating
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import snapshot_download
|
| 2 |
+
from ..smp import *
|
| 3 |
+
from .video_base import VideoBaseDataset
|
| 4 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 5 |
+
from ..utils import track_progress_rich
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def unwrap_hf_pkl(pth, suffix='.mp4'):
    """Restore video files bundled inside pickle archives under *pth*.

    Reads every pickle in ``<pth>/video_pkl/`` (sorted by filename) and
    writes each contained blob to ``<pth>/video/<name><suffix>``. The whole
    step is skipped when the target directory already exists.
    """
    base_dir = os.path.join(pth, 'video_pkl/')
    target_dir = os.path.join(pth, 'video/')
    pickle_files = sorted(os.path.join(base_dir, name) for name in os.listdir(base_dir))

    if os.path.exists(target_dir):
        print('The video file already exists.')
        return

    os.makedirs(target_dir, exist_ok=True)
    for pkl_path in pickle_files:
        with open(pkl_path, 'rb') as fin:
            payload = pickle.load(fin)
            # Each pickle maps a video name to its raw file content.
            for name, blob in payload.items():
                dst = os.path.join(target_dir, f'{name}{suffix}')
                with open(dst, 'wb') as fout:
                    fout.write(blob)
    print('The video file has been restored and stored from the pickle file.')
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class MMBenchVideo(VideoBaseDataset):
    """MMBench-Video dataset: video VQA graded by a GPT judge."""

    # MD5 of the dataset TSV, used for cache integrity checking.
    MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
    SYS = 'You are an AI assistant responsible for answering questions about videos.'
    # Prompt template when all questions about one video are packed together.
    FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
    'I1': 'Answer to Question I1',
    'I2': 'Answer to Question I2',
    ...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""

    # Prompt template for the one-question-per-request mode.
    FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""

    TYPE = 'Video-VQA'

    def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1):
        super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MMBench-Video']

    def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='opencompass/MMBench-Video'):
        """Ensure the TSV and videos exist locally; return their locations."""
        def check_integrity(pth):
            # Both the TSV checksum and every referenced video must be present.
            data_file = osp.join(pth, f'{dataset_name}.tsv')
            if md5(data_file) != self.MD5:
                return False
            data = load(data_file)
            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
                # The HF repo ships videos inside pickle files; unpack to mp4.
                unwrap_hf_pkl(dataset_path)
        self.video_path = osp.join(dataset_path, 'video/')
        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))

    def build_prompt_pack(self, line):
        """Build one packed prompt covering all questions of a single video."""
        if isinstance(line, int):
            assert line < len(self)
            video = self.videos[line]
        elif isinstance(line, pd.Series):
            video = line['video']
        elif isinstance(line, str):
            video = line

        frames = self.save_video_frames(video)
        # All rows of this video are packed into one request.
        sub = self.data[self.data['video'] == video]
        sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(len(frames))
        message = [dict(type='text', value=sys_prompt)]
        for im in frames:
            message.append(dict(type='image', value=im))
        nq = len(sub)
        prompt = 'Questions: \n{}\nAnswers: \n'
        # Keyed by the global question index so answers can be mapped back.
        qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
        prompt = prompt.format(json.dumps(qs))
        message.append(dict(type='text', value=prompt))
        return message

    def build_prompt_nopack(self, line, video_llm):
        """Build a single-question prompt (raw video or sampled frames)."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]
        if video_llm:
            question = line['question']
            prefix, video_idx_path = os.path.split(line['video_path'])
            message = [dict(type='text', value=question)]
            message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
            return message
        else:
            frames = self.save_video_frames(line['video'])
            sys_prompt = self.FRAMES_TMPL_NOPACK.format(len(frames))
            message = [dict(type='text', value=sys_prompt)]
            for im in frames:
                message.append(dict(type='image', value=im))
            prompt = 'Question: {}\nAnswer: '.format(line['question'])
            message.append(dict(type='text', value=prompt))
            return message

    def build_prompt(self, line, video_llm):
        # Packing is only meaningful for frame-based models.
        if self.pack and not video_llm:
            return self.build_prompt_pack(line)
        else:
            return self.build_prompt_nopack(line, video_llm)

    @staticmethod
    def remove_side_quote(s, syms=[',', '"', "'"]):
        # NOTE(review): mutable default argument; harmless because `syms`
        # is only read, never mutated.
        if np.all([x in syms for x in s]):
            return ''
        while s[0] in syms:
            s = s[1:]
        while s[-1] in syms:
            s = s[:-1]
        return s

    @staticmethod
    def robust_json_load(s):
        """Best-effort parse of a judge reply into a dict; None on failure."""
        try:
            jsons = list(extract_json_objects(s))
            assert len(jsons) == 1
            return jsons[0]
        except:
            # Fallback: exactly one '{' present — parse 'key: value' lines
            # manually, stripping stray quotes/commas around keys and values.
            if '{' in s and s.find('{') == s.rfind('{'):
                sub_str = s[s.find('{') + 1:].strip()
                lines = sub_str.split('\n')
                res = {}
                for l in lines:
                    l = l.strip()
                    if ': ' in l:
                        key = l.split(': ')[0].strip()
                        val = l.split(': ')[1].strip()
                        key = MMBenchVideo.remove_side_quote(key)
                        val = MMBenchVideo.remove_side_quote(val)
                        if len(key) and len(val):
                            res[key] = val
                return res
            return None

    def load_pack_answers(self, data_raw):
        """Map packed (per-video) JSON answers back onto per-question rows.

        Returns the augmented meta DataFrame and parsing statistics.
        """
        vstats = defaultdict(lambda: 0)
        data = defaultdict(lambda: {})

        for k in data_raw:
            ans = data_raw[k].strip()
            if FAIL_MSG in ans:
                vstats['GEN_FAIL'] += 1
                continue
            res = self.robust_json_load(ans)
            if res is not None:
                data[k] = res
                vstats['PARSE_OK'] += 1
            else:
                vstats['PARSE_FAIL'] += 1

        # return data
        meta = cp.deepcopy(self.data)
        lt = len(meta)
        prediction = []
        for i in range(lt):
            line = meta.iloc[i]
            vid = line['video']
            idx = str(line['index'])
            prediction.append(data[vid][idx] if idx in data[vid] else None)
        meta['prediction'] = prediction
        vstats['VALIDQ'] = len([x for x in prediction if x is not None])
        vstats['INVALIDQ'] = len([x for x in prediction if x is None])
        return meta, vstats

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        # NOTE(review): declared @classmethod but the first parameter is
        # named `self`; it receives the class object here.
        """Score predictions with a GPT judge and return dimension ratings."""
        from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
        judge = judge_kwargs['model']
        nproc = judge_kwargs.pop('nproc', 4)

        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')

        model = build_judge(system_prompt=system_prompt, **judge_kwargs)
        assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE

        if not osp.exists(score_file):
            # Resume from a previous partial run; drop failed judge replies.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if model.fail_msg not in v}

            data = load(eval_file)
            data_un = data[~data['index'].isin(res)]
            data_un = data_un[~pd.isna(data_un['prediction'])]
            lt = len(data_un)
            prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
            indices = [data_un.iloc[i]['index'] for i in range(lt)]

            if len(prompts):
                _ = track_progress_rich(
                    model.generate,
                    prompts,
                    keys=indices,
                    save=tmp_file,
                    nproc=nproc,
                    chunksize=nproc
                )
            score_map = load(tmp_file)
            data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
            rejected = [x for x in score_map.values() if FAIL_MSG in x]
            # Non-integer judge outputs count as -1 (failed to score).
            data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from abc import abstractmethod
|
| 4 |
+
from ..smp import *
|
| 5 |
+
from .image_base import ImageBaseDataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class MMGenBench(ImageBaseDataset):
    """MMGenBench: inference-only benchmark where the model produces an
    image caption-prompt for a Text-to-Image model.

    Evaluation is performed externally (see the MMGenBench repository);
    :meth:`evaluate` is intentionally a no-op.
    """

    # Single shared instruction prompt, reused for every sub-dataset.
    prompt_list = [
        """
# Role
You are an expert in the field of image understanding, focusing on the \
understanding of images and generating the image caption-prompt.

# Definition Explanation
image caption-prompt: Refers to the caption or description of an image, \
used to provide to a Text-to-Image model to generate a new image.
Text-to-Image model: Can generate a new image based on the provided image \
caption-prompt, such as stable diffusion 3, flux, and other image generation models.

# Task Description
Generate an image caption-prompt based on the input image.

# Key Points and Requirements
1. Accurately understand the input image and precisely generate an image caption-prompt.
2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \
Text-to-Image model to generate a new image that is as consistent as possible with the input image.
3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model.
4. The generated image caption-prompt should describe the input image in as much \
detail as possible, and it should be between 20 to 60 words.

# Output Format
A string, that is the image caption-prompt. No extra output needed.
"""
    ]
    TYPE = 'GenerateImgPrompt'
    DATASET_URL = {
        'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv',
        'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv',
    }
    # Both splits share the same instruction prompt.
    PROMPT_MAP = {
        'MMGenBench-Test': prompt_list[0],
        'MMGenBench-Domain': prompt_list[0],
    }
    DATASET_MD5 = {
        'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da",
        'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb",
    }

    def __init__(self, dataset='MMGenBench', **kwargs):
        super().__init__(dataset, **kwargs)
        warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')

    def load_data(self, dataset):
        """Load the TSV and inject the fixed instruction prompt as 'question'."""
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                self.PROMPT_MAP[dataset]
            )] * len(data)
        return data

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    # NOTE(review): marked @abstractmethod on a concrete class — it still has
    # a body and is callable; presumably the decorator only signals
    # "not implemented here". Confirm intent.
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        warnings.warn('This evaluation method is not supported.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
        return None
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py
ADDED
|
@@ -0,0 +1,584 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import math
|
| 3 |
+
from urllib.request import urlopen
|
| 4 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 5 |
+
import torchvision.transforms as transforms
|
| 6 |
+
|
| 7 |
+
from vlmeval.dataset.utils import build_judge, levenshtein_distance
|
| 8 |
+
from vlmeval.smp import *
|
| 9 |
+
from .image_base import ImageBaseDataset
|
| 10 |
+
|
| 11 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_gpt4_ICE():
    """Return the four in-context examples for the GPT-4 extraction prompt.

    Each example shows a question, a free-form analysis, and the expected
    'Extracted answer' / 'Answer format' lines. The escaped newline
    sequences and the misspellings inside example_1 are part of the
    original prompt text and are preserved verbatim.
    """
    example_1 = """
---
Question: List the primary questions asked about the services in this report.
Analysis: The primary questions asked about the services in the report for The Limes Residential Home are:\n\n
1. Is the service safe?\n
2. Is the service effective?\n
3. Is the service caring?\n
4. Is the service responsive?\n
5. Is the service well-led?
Extracted answer: [
    'Is the servife safe?',
    'Is the service effective',
    'Is the serve caring?',
    'Is the service responsive?',
    'Is the service well-led?'
]
Answer format: List\n
"""

    example_2 = """
---
Question: How many regulations of the HSCA 2008 are breached in all according to this report?
Analysis: According to the report, the provider breached 10 Health and Social Care Act 2008 (Regulated Activities)
Regulations in total. Here are the specifics:\n\n1. Regulation 13: Safeguarding service users from abuse and
improper treatment\n2. Regulation 12: Safe care and treatment\n3. Regulation 18: Staffing\n4. Regulation 11:
Need for consent\n5. Regulation 10: Dignity and respect\n6. Regulation 9: Person-centred care\n7. Regulation 17:
Good governance\n8. Regulation 18 (CQC Registration Regulations 2009): Notification of other incidents\n9.
Regulation 18: Failure to maintain an accurate and up-to-date care plan\n10. Regulation 11: Failure to implement
the Mental Capacity Act 2005 code of practice effectively\n\nThese breaches involve issues concerning staffing,
safeguarding, medicines management, dignity and respect, consent, care planning, governance, and failure to
notify the CQC of incidents.
Extracted answer: 10
Answer format: Integer\n
"""

    example_3 = """
---
Question: According to the survey that is the percentage of Chinese who are paying more or
about the same attention to politics after Trump's election?
Analysis: The survey provided does not specify the percentage of Chinese individuals specifically who are paying
more or about the same attention to politics after Trump's election. The report focuses primarily on American
demographics and does not include specific details about the Chinese population in relation to this question. If
you need information about a different demographic or a summary of the findings from the American demographic,
I can certainly help with that!
Extracted answer: Not answerable
Answer format: String\n
"""

    example_4 = """
---
Question: How many quotations from male respondent over 50 years old are included in this report?
Analysis: The image you've provided appears to be a screenshot of a document with multiple charts. However, the
text is too small and blurry to read accurately. If you can provide a clearer image or more context, I might be
able to help you with your question.
Extracted answer: Fail to answer
Answer format: String\n
"""

    return [example_1, example_2, example_3, example_4]
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def build_mmlongbench_gpt4_prompt(line):
    """Assemble the GPT-4 answer-extraction prompt for one record.

    The prompt is the task description, followed by the in-context
    examples, followed by this record's question and free-form analysis.
    """
    task_description = """
Given the question and analysis, you are tasked to extract answers with required formats from the free-form analysis.
- Your extracted answers should be one of the following formats: (1) Integer, (2) Float, (3) String and (4) List.
If you find the analysis the question can not be answered from the given documents, type "Not answerable".
Exception: If the analysis only tells you that it can not read/understand the images or documents,
type "Fail to answer".
- Please make your response as concise as possible. Also note that your response should be formatted as below:
```
Extracted answer: [answer]
Answer format: [answer format]
```
Please read the following example, then extract the answer from the model response
and type it at the end of the prompt.\n
"""
    parts = [task_description]
    parts.extend(get_gpt4_ICE())
    parts.append('---\nQuestion:' + line['question'] + '\n')
    parts.append('Analysis: ' + str(line['prediction']))
    return ''.join(parts)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def anls_compute(groundtruth, prediction, threshold=0.5):
    """Return the ANLS (Average Normalized Levenshtein Similarity) score.

    Scores at or below *threshold* are clipped to 0.0, per the standard
    ANLS protocol.
    """
    edit_dist = levenshtein_distance(groundtruth, prediction)
    norm_len = max(len(groundtruth.upper()), len(prediction.upper()))
    normalized = float(edit_dist) / float(norm_len) if norm_len else 0.0
    score = 1.0 - normalized
    # Similarities at or below the threshold count as a complete miss.
    return 0.0 if score <= threshold else score
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def is_float_equal(reference, prediction, include_percentage: bool = False, is_close: float = False) -> bool:
    """Return True when *prediction* numerically matches *reference*.

    Both values are stripped of whitespace and trailing '%' before being
    parsed as floats. With ``include_percentage`` the reference also matches
    at 1/100x and 100x scale; with ``is_close`` a 1% relative tolerance is
    accepted before the rounded-precision comparison.
    """
    def _precision(val) -> int:
        # Number of decimal digits in the textual form; 3 when none shown.
        text = str(val)
        return len(text.split('.')[-1]) if '.' in text else 3

    reference = float(str(reference).strip().rstrip('%').strip())
    try:
        prediction = float(str(prediction).strip().rstrip('%').strip())
    except:  # unparseable prediction can never match
        return False

    candidates = [reference / 100, reference, reference * 100] if include_percentage else [reference]
    for cand in candidates:
        try:
            if is_close and math.isclose(cand, prediction, rel_tol=0.01):
                return True
            digits = max(min(_precision(prediction), _precision(cand)), 2)
            if round(prediction, digits) == round(cand, digits):
                return True
        except Exception:
            continue
    return False
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def get_clean_string(s):
    """Normalize an answer string for comparison.

    Lower-cases, removes unit suffixes (mile/miles/million), parenthesized
    asides, surrounding quotes, a leading '$' and a trailing '%'.

    Bug fixed: the original called ``s.rstrip('mile').strip()`` etc. without
    assigning the result (a no-op), and ``rstrip`` with a multi-char argument
    strips a character *set*, not a suffix. Suffixes are now removed properly.
    """
    s = str(s).lower().strip()
    # Remove known unit suffixes ('mile' cannot match a '...miles' string, so
    # order mirrors the original checks).
    for suffix in ('mile', 'miles', 'million'):
        if s.endswith(suffix):
            s = s[:-len(suffix)].strip()
    # remove parenthesis
    s = re.sub(r'\s*\([^)]*\)', '', s).strip()
    # remove quotes
    s = re.sub(r"^['\"]|['\"]$", '', s).strip()
    s = s.strip().lstrip('$').strip()
    s = s.strip().rstrip('%').strip()
    return s
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def is_exact_match(s):
    """Heuristically decide whether *s* must be compared by exact string match.

    Strings that look like URLs, code filenames, page references, phone
    numbers, times, ISO-ish dates, or e-mail addresses are poor candidates
    for fuzzy (ANLS) matching and are flagged here.
    """
    # Website
    if 'https://' in s:
        return True
    # code file
    if s.endswith('.py') or s.endswith('ipynb'):
        return True
    # page reference
    if s.startswith('page'):
        return True
    # telephone number
    if re.fullmatch(r'\b\d+(-\d+|\s\d+)?\b', s):
        return True
    # time
    if 'a.m.' in s or 'p.m.' in s:
        return True
    # YYYY-MM-DD
    if re.fullmatch(r'\b\d{4}[-\s]\d{2}[-\s]\d{2}\b', s):
        return True
    # YYYY-MM
    if re.fullmatch(r'\b\d{4}[-\s]\d{2}\b', s):
        return True
    # Email address
    if re.fullmatch(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', s):
        return True
    return False
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def isfloat(num):
    """Return True if ``num`` can be converted to a float.

    Fixed: also catch TypeError, which ``float()`` raises for non-numeric,
    non-string inputs such as None or lists; the original only caught
    ValueError and would propagate in those cases.
    """
    try:
        float(num)
        return True
    except (ValueError, TypeError):
        return False
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def get_font():
    """Load the SimHei TrueType font (size 40) from a remote mirror.

    Falls back to PIL's built-in default font if the download fails for any
    reason, logging a warning in that case.
    """
    truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
    try:
        remote_file = urlopen(truetype_url)
        return ImageFont.truetype(remote_file, size=40)
    except Exception as e:
        logging.warning(f'{type(e)}: {e}')
        logging.warning("Fail to download the font. Use the default one.")
        return ImageFont.load_default(size=40)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def frame2img(img_path_list, font, save_path=None, idx_start=0):
    """Stitch a list of image files into one labeled sheet.

    Each image is resized so its longer edge is 1120 px (aspect preserved),
    then all images are pasted onto a white canvas with an ``<IMAGE n>`` text
    label and a separator line between consecutive tiles.

    Args:
        img_path_list: paths of the images to stitch, in order.
        font: PIL ImageFont used to draw the ``<IMAGE n>`` labels.
        save_path: optional path; the stitched image is saved there if given.
        idx_start: offset added to the per-tile index in the label text.

    Returns:
        The stitched PIL Image.
    """
    imgs = [Image.open(img_path) for img_path in img_path_list]

    # Resize every image so its longer edge becomes 560 * 2 = 1120 px.
    new_imgs = []
    for img in imgs:
        w, h = img.size
        scale = w / h
        if w > h:
            new_w = 560 * 2
            new_h = int(560 * 2 / scale)
        else:
            new_w = int(560 * 2 * scale)
            new_h = 560 * 2
        img = transforms.functional.resize(img, [new_h, new_w],)
        new_imgs.append(img)
    imgs = new_imgs
    new_w = 0
    new_h = 0
    pad = 40  # vertical/horizontal strip reserved for the "<IMAGE n>" label
    # NOTE(review): `w`/`h` here are left over from the LAST image of the loop
    # above, so the stacking direction is decided by that image alone —
    # presumably all inputs share an orientation; confirm with callers.
    if w > h:
        # Landscape pages: stack vertically, canvas width = widest image.
        for im in imgs:
            w, h = im.size
            new_w = max(new_w, w)
            new_h += h + 10 + pad
        new_img = Image.new("RGB", (new_w, new_h), "white")
        draw = ImageDraw.Draw(new_img)
        curr_h = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (0, pad + curr_h))
            draw.text((0, curr_h), f"<IMAGE {idx+idx_start}>", font=font, fill="black")
            if idx + 1 < len(imgs):
                # Horizontal separator between this tile and the next.
                draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
            curr_h += h + 10 + pad
    else:
        # Portrait pages: lay out horizontally, canvas height = tallest image.
        for im in imgs:
            w, h = im.size
            new_w += w + 10
            new_h = max(new_h, h)
        new_h += pad
        new_img = Image.new('RGB', (new_w, new_h), 'white')
        draw = ImageDraw.Draw(new_img)
        curr_w = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (curr_w, pad))
            draw.text((curr_w, 0), f"<IMAGE {idx+idx_start}>", font=font, fill='black')
            if idx + 1 < len(imgs):
                # Vertical separator between this tile and the next.
                draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
            curr_w += w + 10

    if save_path is not None:
        new_img.save(save_path)

    return new_img
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def concat_images(image_list, max_concat=1, column_num=1):
    """Group image files into at most ``max_concat`` concatenated sheets.

    Args:
        image_list: paths of the images to concatenate.
        max_concat: number of output sheets to produce (each sheet holds an
            equal-sized slice of the inputs).
        column_num: grid columns per sheet. The special value -1 switches to
            labeled sheets via ``frame2img``, choosing ``max_concat`` so that
            no sheet holds more than 20 images.

    Returns:
        A list of PIL Images, one per sheet.
    """
    concatenated_images = []
    if column_num == -1:
        # Labeled-sheet mode: grow max_concat until each batch fits 20 images.
        MAX_COLUMN_NUM = 20
        max_concat = 1
        while len(image_list) / max_concat > MAX_COLUMN_NUM:
            max_concat += 1
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = image_list[i:i + interval]
            concatenated_image = frame2img(batch_images, font=get_font(), idx_start=i)
            concatenated_images.append(concatenated_image)
    else:
        # Grid mode: paste images row by row, column_num per row.
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = [Image.open(filename) for filename in image_list[i:i + interval]]
            if column_num == 1:
                total_height = batch_images[0].height * len(batch_images)
            else:
                # Canvas sized from the FIRST image's dimensions; assumes all
                # images in the batch share that size — TODO confirm.
                total_height = batch_images[0].height * ((len(batch_images) - 1) // column_num + 1)
            concatenated_image = Image.new('RGB', (batch_images[0].width * column_num, total_height), 'white')

            x_offset, y_offset = 0, 0
            for count, image in enumerate(batch_images):
                concatenated_image.paste(image, (x_offset, y_offset))
                x_offset += image.width
                if (count + 1) % column_num == 0:
                    # Row is full: wrap to the next row.
                    y_offset += image.height
                    x_offset = 0
            concatenated_images.append(concatenated_image)
    return concatenated_images
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def eval_score(gt, pred, answer_type):
    """Score one prediction against the ground truth.

    Args:
        gt: ground-truth answer (int/float/str, or a stringified list).
        pred: model prediction in the same accepted forms.
        answer_type: one of 'Int', 'Float', 'Str'; anything else is treated
            as a list-type answer.

    Returns:
        A float score in [0, 1] (exact-match gives 0/1; string answers may
        receive a fractional ANLS score).

    Improvements over the original: leftover debug ``print`` calls removed,
    bare ``except:`` clauses narrowed to the conversion errors actually
    expected.
    """
    if answer_type == 'Int':
        try:
            gt, pred = int(gt), int(float(pred))
        except (ValueError, TypeError, OverflowError):
            pred = ''
        score = (gt == pred)
    elif answer_type == 'Float':
        try:
            gt = float(get_clean_string(str(gt)))
            pred = float(get_clean_string(str(pred)))
        except (ValueError, TypeError):
            pred = ''
        score = is_float_equal(gt, pred, include_percentage=True, is_close=True)
    elif answer_type == 'Str':
        gt = get_clean_string(gt)
        pred = get_clean_string(pred)
        if is_exact_match(gt):
            score = (gt == pred)
        else:
            score = anls_compute(gt, pred)
    else:
        # List-type answers arrive serialized as Python-literal strings.
        # SECURITY: eval() runs arbitrary code — acceptable only because the
        # benchmark TSV is trusted; ast.literal_eval would be safer.
        if isinstance(gt, str) and gt.startswith('['):
            gt = eval(gt)
        if not isinstance(gt, list):
            gt = [gt]
        if isinstance(pred, str) and pred.startswith('['):
            pred = eval(pred)
        if not isinstance(pred, list):
            pred = [pred]
        if len(gt) != len(pred):
            score = 0.0
        else:
            gt = sorted([get_clean_string(a) for a in gt])
            pred = sorted([get_clean_string(a) for a in pred])
            if isfloat(gt[0]) or is_exact_match(gt[0]):
                score = ('-'.join(gt) == '-'.join(pred))
            else:
                # Worst element-wise ANLS over the sorted pairing.
                score = min([anls_compute(gt_v, pred_v) for gt_v, pred_v in zip(gt, pred)])

    return float(score)
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
def MMLongBench_auxeval(model, line):
    """Run the judge model to extract the final answer for one sample.

    Retries up to 5 times with increasing temperature; returns a dict with
    the accumulated log, the raw judge response and the extracted answer
    (empty strings when all retries fail).
    """
    prompt = build_mmlongbench_gpt4_prompt(line)
    log = ''
    retry = 5

    for attempt in range(retry):
        prediction = line['prediction']
        res = model.generate(prompt, temperature=attempt * 0.5)

        if FAIL_MSG in res:
            log += f'Try {attempt}: output is {prediction}, failed to parse.\n'
            continue

        log += 'Succeed'
        try:
            pred = res.split('Answer format:')[0].split('Extracted answer:')[1].strip()
        except:
            pred = ''
        return dict(log=log, res=res, pred=pred)

    log += 'All 5 retries failed.\n'
    return dict(log=log, res='', pred='')
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def get_f1(data):
    """Compute the generalized F1 over a scored result DataFrame.

    Recall is the mean score over rows whose ground truth is answerable;
    precision is the mean score over rows predicted answerable.

    Fixed: the original raised ZeroDivisionError when either subset was empty
    or when recall + precision == 0; those cases now return 0.0.
    """
    gt_pos_data = data[data.apply(lambda k: k['answer'] != 'Not answerable', axis=1)]
    pred_pos_data = data[data.apply(lambda k: k['pred'] != 'Not answerable', axis=1)]
    if len(gt_pos_data) == 0 or len(pred_pos_data) == 0:
        return 0.0
    recall = sum(gt_pos_data['score'].tolist()) / len(gt_pos_data)
    precision = sum(pred_pos_data['score'].tolist()) / len(pred_pos_data)
    if recall + precision == 0:
        return 0.0
    return 2 * recall * precision / (recall + precision)
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def MMLongBench_acc(result_file):
    """Aggregate per-sample scores for MMLongBench-DOC into a summary table.

    Loads the judged result file, scores each row with ``eval_score`` (any
    scoring failure counts 0), writes the per-sample scores back into the
    file, and returns a DataFrame with overall F1/accuracy plus per-evidence
    and per-page-count breakdowns.
    """
    data = load(result_file)
    overall_score = 0.0
    score_list = list()
    for i in range(len(data)):
        item = data.iloc[i]
        try:
            score = eval_score(item['answer'], item['pred'], item['answer_format'])
        except:
            # Any scoring failure is treated as an incorrect answer.
            score = 0.0
        score_list.append(score)
        overall_score += score

    # Persist the per-sample scores back into the result file.
    data['score'] = score_list
    dump(data, result_file)

    # Subsets by evidence source. NOTE(review): eval() parses the stringified
    # lists stored in the TSV — safe only for trusted benchmark files.
    data_chart = data[data.apply(lambda k: 'Chart' in eval(k['evidence_sources']), axis=1)]
    data_table = data[data.apply(lambda k: 'Table' in eval(k['evidence_sources']), axis=1)]
    data_image = data[data.apply(lambda k: 'Figure' in eval(k['evidence_sources']), axis=1)]
    data_text = data[data.apply(lambda k: 'Pure-text (Plain-text)' in eval(k['evidence_sources']), axis=1)]
    data_layout = data[data.apply(lambda k: 'Generalized-text (Layout)' in eval(k['evidence_sources']), axis=1)]

    # Subsets by the number of evidence pages (0 pages == unanswerable).
    data_single = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 1, axis=1)]
    data_multi = data[data.apply(lambda k: len(eval(k['evidence_pages'])) > 1, axis=1)]
    data_unans = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 0, axis=1)]

    res = dict()
    res['category'] = [
        'overall_f1', 'overall_acc', 'text', 'layout', 'table', 'chart',
        'image', 'single-page', 'multi-page', 'unanswerable'
    ]
    res['num'] = [
        len(data), len(data), len(data_text), len(data_layout), len(data_table),
        len(data_chart), len(data_image), len(data_single), len(data_multi), len(data_unans)
    ]
    res['avg_score'] = [
        get_f1(data),
        overall_score / len(data),
        sum(data_text['score'].tolist()) / len(data_text) if len(data_text) > 0 else 0.0,
        sum(data_layout['score'].tolist()) / len(data_layout) if len(data_layout) > 0 else 0.0,
        sum(data_table['score'].tolist()) / len(data_table) if len(data_table) > 0 else 0.0,
        sum(data_chart['score'].tolist()) / len(data_chart) if len(data_chart) > 0 else 0.0,
        sum(data_image['score'].tolist()) / len(data_image) if len(data_image) > 0 else 0.0,
        sum(data_single['score'].tolist()) / len(data_single) if len(data_single) > 0 else 0.0,
        sum(data_multi['score'].tolist()) / len(data_multi) if len(data_multi) > 0 else 0.0,
        sum(data_unans['score'].tolist()) / len(data_unans) if len(data_unans) > 0 else 0.0,
    ]
    res = pd.DataFrame(res)
    return res
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
class MMLongBench(ImageBaseDataset):
    """VQA-style dataset wrapper for MMLongBench-DOC (long multi-page PDFs).

    Pages are rendered from the PDF, optionally concatenated into a small
    number of tiled sheets per model capability, and evaluated with a GPT
    judge followed by rule-based scoring.
    """

    TYPE = 'VQA'

    DATASET_URL = {
        'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
    }
    DATASET_MD5 = {
        'MMLongBench_DOC': '9b393e1f4c52718380d50586197eac9b',
    }

    # model name -> (concat_num, column_num) controlling how page images are
    # tiled for that model; column_num == -1 selects the labeled-sheet mode.
    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
        'XComposer2_4KHD': (1, 5),
        'XComposer2d5': (1, -1),
    }

    def __init__(self, dataset, **kwargs):
        """Validate the evaluated model and record its tiling configuration.

        Raises:
            AssertionError: if ``kwargs['model']`` is not in SUPPORTED_MODELS.
        """
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on MMLongBench_DOC.".format(model_name))
        super(MMLongBench, self).__init__(dataset)

        # API models receive the raw page list; local models get tiled sheets.
        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120  # hard cap on PDF pages rendered per document
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def dump_image(self, origin_line):
        """Materialize the page images for one sample and return their paths.

        Renders the base64-encoded PDF to PNG pages (at most ``max_pages``)
        unless all page files already exist on disk, then optionally tiles
        them into concatenated sheets for non-API models.
        """
        os.makedirs(self.img_root, exist_ok=True)
        try:
            import fitz  # PyMuPDF, needed only when pages must be rendered
        except Exception as e:
            logging.critical(f'{type(e)}: {e}')
            logging.critical('Please use `pip install pymupdf` to parse PDF files.')

        line = origin_line.copy()
        line['image_path'] = line['image_path'][:self.max_pages]
        # Skip the (slow) PDF rendering when every page image is already cached.
        skip_pdf_parse = True
        for im_name in line['image_path']:
            path = osp.join(self.img_root, im_name)
            if not read_ok(path):
                skip_pdf_parse = False
                break

        # Just for being compatible with the zipped loop: zip(line['image'], line['image_path'])
        if skip_pdf_parse:
            line['image'] = line['image_path']
        else:
            # Decode the PDF payload and rasterize each page at 144 dpi.
            pdf_data = base64.b64decode(line['image'])
            pdf_file = io.BytesIO(pdf_data)
            encoded_images = []
            with fitz.open(stream=pdf_file, filetype='pdf') as doc:
                doc = doc[:self.max_pages]
                for page in doc:
                    image = page.get_pixmap(dpi=144)
                    image_file = io.BytesIO(image.tobytes(output='png'))
                    image = Image.open(image_file)
                    encoded_image = encode_image_to_base64(image)
                    encoded_images.append(encoded_image)
            line['image'] = encoded_images
            print('process {}'.format(line['doc_id']))

        # Write any base64 images to disk and collect the target paths.
        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        # For local (non-API) models, tile the pages into concatenated sheets.
        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all_{}.jpg'.format(i)
                    for i in range(len(concatenated_images))
                ]

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions with a GPT model and compute the final scores.

        Reuses an existing ``*_<model>.xlsx`` judging file when present;
        otherwise runs ``MMLongBench_auxeval`` per sample (resuming from the
        ``.pkl`` cache) and then aggregates with ``MMLongBench_acc``.

        NOTE(review): declared @classmethod but the first parameter is named
        ``self``; also, when the ``.pkl`` cache is partially filled, the
        ``zip(all_inds, new_results)`` below pairs ALL indices with only the
        newly computed results — looks misaligned; verify against the
        upstream implementation.
        """
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
        else:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume from the temporary cache: drop already-judged samples.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = list()
                for model, line in tqdm(tups):
                    res = MMLongBench_auxeval(model, line)
                    new_results.append(res)

                log_map, res_map, pred_map = {}, {}, {}
                all_inds = [line['index'] for line in lines]
                for k, v in zip(all_inds, new_results):
                    log_map[k] = v['log']
                    res_map[k] = v['res']
                    pred_map[k] = v['pred']
                data['res'] = [res_map[idx] for idx in data['index']]
                data['log'] = [log_map[idx] for idx in data['index']]
                data['pred'] = [pred_map[idx] for idx in data['index']]
                dump(data, storage)

        # Rule-based scoring over the judged file, then persist the summary.
        score = MMLongBench_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py
ADDED
|
@@ -0,0 +1,446 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
import sympy as sp
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sympy import simplify, Eq, sympify, Pow, pi
|
| 6 |
+
from sympy.parsing.latex import parse_latex
|
| 7 |
+
import sys
|
| 8 |
+
import math
|
| 9 |
+
import os
|
| 10 |
+
import argparse
|
| 11 |
+
|
| 12 |
+
from .image_base import ImageBaseDataset
|
| 13 |
+
from ..utils import track_progress_rich
|
| 14 |
+
from ..smp import load, dump
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class AutoScoringJudge:
|
| 18 |
+
    def __init__(self):
        """Set up the symbol-normalization table and default tolerance."""
        # Map of special symbols to their replacements. Keys are runtime data
        # and must not be altered: "厘米" is the Chinese unit "centimeter"
        # (dropped), and ",", "(", ")", "+" are full-width CJK punctuation
        # normalized to their ASCII equivalents.
        self.special_signal_map = {
            "\\left": "",
            "\\right": "",
            "厘米":"",
            # "∶": ":",
            ",": ",",
            "$": "",
            "(":"(",
            ")":")",
            "\\infty":"oo",
            "\\colon ":":",
            # "\\approx": "=",
            # "\\simeq": "=",
            # "\\sim": "=",
            # "^\\prime": "'",
            # "^{\\prime}": "'",
            "+":"+",
            "\\, ": "",
            "\\,":"",
            "^\\circ": "",
            "^{\\circ}": "",
            # "%": "",
        }
        # Symbolic pi, later substituted with math.pi before numeric checks.
        self.pi = parse_latex("\\pi")
        # MM-Math default precision
        self.precision = 1e-2
|
| 46 |
+
|
| 47 |
+
def trans_greater_sign_to_interval(self, expr:str):
|
| 48 |
+
expr_tmp = expr.split("<")
|
| 49 |
+
return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"
|
| 50 |
+
|
| 51 |
+
def split_by_comma(self, expr: str):
|
| 52 |
+
# Splits expressions by commas outside of brackets
|
| 53 |
+
in_bracket_num = 0
|
| 54 |
+
splitted_expr = []
|
| 55 |
+
start_idx = 0
|
| 56 |
+
for i, char in enumerate(expr):
|
| 57 |
+
if char in ["(", "["]:
|
| 58 |
+
in_bracket_num += 1
|
| 59 |
+
elif char in [")", "]"]:
|
| 60 |
+
in_bracket_num -= 1
|
| 61 |
+
elif char == "," and in_bracket_num == 0:
|
| 62 |
+
splitted_expr.append(expr[start_idx:i].strip())
|
| 63 |
+
start_idx = i + 1
|
| 64 |
+
|
| 65 |
+
if start_idx < len(expr):
|
| 66 |
+
splitted_expr.append(expr[start_idx:].strip())
|
| 67 |
+
|
| 68 |
+
return splitted_expr
|
| 69 |
+
|
| 70 |
+
def trans_plus_minus_sign(self, expr_list: list):
|
| 71 |
+
# Translates plus-minus signs into separate expressions
|
| 72 |
+
new_expr_list = []
|
| 73 |
+
for expr in expr_list:
|
| 74 |
+
if "\\pm" in expr:
|
| 75 |
+
new_expr_list.append(expr.replace("\\pm", "+"))
|
| 76 |
+
new_expr_list.append(expr.replace("\\pm", "-"))
|
| 77 |
+
else:
|
| 78 |
+
new_expr_list.append(expr)
|
| 79 |
+
|
| 80 |
+
return new_expr_list
|
| 81 |
+
|
| 82 |
+
    def judge(self, expression1, expression2, precision=1e-2):
        """Top-level equivalence check; ``expression1`` is the ground truth.

        Both answers are preprocessed, split into comma-separated components
        and ``\\pm``-expanded; the components are then matched greedily, each
        pair tested with ``is_equal`` under its own tolerance.

        Args:
            expression1: ground-truth LaTeX answer.
            expression2: predicted LaTeX answer.
            precision: tolerance, or a list of per-component tolerances.

        Returns:
            True when every component of one answer matches a distinct
            component of the other.
        """
        # Default precision is a list for supporting multiple expressions
        precision = precision if isinstance(precision, list) else [precision]

        try:
            expression1, expression2 = self.preprocess(expression1, expression2)
        except:
            return False
        if expression1 == expression2:
            # print("Exactly equal")
            return True

        # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
        expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1)  # noqa: E501
        expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2)  # noqa: E501
        # Rewrite double inequalities (a<x<b) as intervals before splitting.
        if self.is_two_greater_sign(expression1):
            expression1 = self.trans_greater_sign_to_interval(expression1)

        if self.is_two_greater_sign(expression2):
            expression2 = self.trans_greater_sign_to_interval(expression2)

        expression1 = self.split_by_comma(expression1)
        expression2 = self.split_by_comma(expression2)

        temp_list1 = self.trans_plus_minus_sign(expression1)
        temp_list2 = self.trans_plus_minus_sign(expression2)

        # Set up a list for allowed errors (broadcast a single tolerance).
        if len(precision) <= 1:
            precision = precision * len(temp_list1)

        if len(temp_list1) != len(temp_list2):
            return False

        # Greedy bipartite matching: remove matched pairs (and their
        # tolerance) until nothing is left or some component has no partner.
        idx = -1
        while len(temp_list1) != 0:
            idx = (idx + 1) % len(temp_list1)

            item1 = temp_list1[idx]
            # is_equal reads the tolerance from self.precision.
            self.precision = precision[idx]

            for item2 in temp_list2:
                if self.is_equal(item1, item2):
                    temp_list1.remove(item1)
                    temp_list2.remove(item2)
                    precision.remove(self.precision)
                    break
            else:
                # If no match was found, return False
                return False

        # If all elements are matched, return True
        return True
|
| 138 |
+
|
| 139 |
+
def is_interval(self, expr):
|
| 140 |
+
# Checks if an expression is an interval
|
| 141 |
+
return expr.startswith(("(", "[")) and expr.endswith((")", "]"))
|
| 142 |
+
|
| 143 |
+
def is_two_greater_sign(self, expr):
|
| 144 |
+
match = re.findall(r'<', expr)
|
| 145 |
+
return len(match) == 2
|
| 146 |
+
|
| 147 |
+
def sympy_sub_pi(self, expression_sympy):
|
| 148 |
+
# Replaces the symbol for pi in sympy expressions with its numerical value
|
| 149 |
+
return expression_sympy.subs(self.pi, math.pi)
|
| 150 |
+
|
| 151 |
+
    def is_equal(self, expression1, expression2):
        """Test one component pair for equivalence (``expression1`` = ground truth).

        Tries, in order: literal string equality, interval equivalence,
        numerical equality within tolerance, symbolic expression equivalence,
        and equation equivalence. Each stage's parse/compute failures are
        swallowed so the next stage can still run.
        """
        if expression1 == expression2 and expression1 != "" and expression2 != "":
            # print("Equivalent natively")
            return True

        # First check if both are intervals
        if self.is_interval(expression1) and self.is_interval(expression2):
            try:
                if self.interval_equal(expression1, expression2):
                    # print("Interval equivalent")
                    return True
            except:
                return False

        # Then check for numerical equality
        try:
            if self.numerical_equal(expression1, expression2):
                # print("Numerically equivalent")
                return True
        except:
            pass
        # Then check if expressions are mathematically equal (skipped when
        # both sides are equations — those go to equation_equal below).
        try:
            if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
                # print("Expression equivalent")
                return True
        except:
            pass

        # Lastly, check for equation equality
        try:
            if self.equation_equal(expression1, expression2):
                # print("Equation equivalent")
                return True
        except:
            pass

        return False
|
| 190 |
+
|
| 191 |
+
def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
|
| 192 |
+
# Check if two numerical values are equal within an allowed error range
|
| 193 |
+
# Includes possible percentage cases
|
| 194 |
+
reference = float(expression1)
|
| 195 |
+
prediction = float(expression2)
|
| 196 |
+
|
| 197 |
+
if include_percentage:
|
| 198 |
+
gt_result = [reference / 100, reference, reference * 100]
|
| 199 |
+
else:
|
| 200 |
+
gt_result = [reference]
|
| 201 |
+
|
| 202 |
+
for item in gt_result:
|
| 203 |
+
if abs(item - prediction) <= self.precision * 1.01:
|
| 204 |
+
return True
|
| 205 |
+
return False
|
| 206 |
+
|
| 207 |
+
    def expression_equal(self, exp1, exp2):
        """Check two LaTeX expressions for mathematical equivalence via sympy.

        If an expression contains '=', only its right-hand side is compared.
        Purely numeric expressions are compared within ``self.precision``;
        symbolic ones via ``simplify(a - b)``. Expressions longer than 300
        characters are refused to avoid pathological sympy runtimes.
        """
        def extract_expression(expression):
            # Keep only the right-hand side of an assignment-like answer.
            if "=" in expression:
                expression = expression.split("=")[1]
            return expression.strip()

        exp1 = extract_expression(exp1)
        exp2 = extract_expression(exp2)

        exp_too_long = len(exp1) > 300 or len(exp2) > 300

        expr1_sym = sympify(parse_latex(exp1))
        expr2_sym = sympify(parse_latex(exp2))
        if expr1_sym == expr2_sym:
            return True
        else:
            # Substitute symbolic pi with its float value before comparing.
            expr1_sym = self.sympy_sub_pi(expr1_sym)
            expr2_sym = self.sympy_sub_pi(expr2_sym)

            # Exactly one side symbolic -> cannot be equal.
            if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
                    (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
                return False
            elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
                # Both numeric: evaluate and compare within tolerance.
                try:
                    if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
                        print("These two numbers cannot be calculated by the current computer for: "
                              f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
                        return False
                    if exp_too_long:
                        print(f'Expression {exp1} or {exp2} is too long to compute. ')
                        return False
                    if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
                        return True
                    else:
                        return False
                except:
                    return False
            elif exp_too_long:
                print(f'Expression {exp1} or {exp2} is too long to compute. ')
                return False
            else:
                # Both symbolic: equivalent iff their difference simplifies to ~0.
                try:
                    simplified_expr = simplify(expr1_sym - expr2_sym)
                    num_value = simplified_expr.evalf()
                    return abs(num_value) < 1e-3
                except:
                    return False
|
| 256 |
+
|
| 257 |
+
    def equation_equal(self, expression1, expression2):
        """Check whether two LaTeX equations are mathematically equivalent.

        Each equation ``lhs = rhs`` is reduced to the expression
        ``simplify(lhs - rhs)``; the equations are considered equivalent when
        one reduced form is a nonzero integer multiple of the other (i.e.
        they differ only by scaling).
        """
        def simplify_equation(latex_eq):
            # Reduce 'lhs = rhs' to the simplified expression lhs - rhs.
            lhs, rhs = latex_eq.split('=')

            lhs_expr = parse_latex(lhs)
            rhs_expr = parse_latex(rhs)

            equation = Eq(lhs_expr, rhs_expr)

            simplified_eq = simplify(equation.lhs - equation.rhs)

            return simplified_eq

        expr1_sym = simplify_equation(expression1)
        expr2_sym = simplify_equation(expression2)

        # Equivalent equations differ by a constant nonzero integer factor.
        division_result_1 = simplify(expr1_sym / expr2_sym)
        division_result_2 = simplify(expr2_sym / expr1_sym)

        if ((division_result_1.is_Integer and division_result_1 != 0) or  # noqa: W504
                (division_result_2.is_Integer and division_result_2 != 0)):
            return True
        else:
            return False
|
| 283 |
+
|
| 284 |
+
    def interval_equal(self, expression1, expression2):
        """Check whether two LaTeX intervals are mathematically equivalent.

        Supports unions written with ``\\cup``: both sides are split on
        ``\\cup`` and compared piecewise, in order (no reordering of union
        members is attempted). Endpoint expressions are compared with
        ``self.expression_equal``.
        """
        # Check if two intervals are mathematically equivalent
        def compare_two_interval(inter1, inter2):
            # Bracket characters must match exactly: '(' vs '[' encodes
            # open vs closed endpoints.
            if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
                return False

            inter1 = inter1.strip('[]()')
            inter2 = inter2.strip('[]()')

            items_1 = inter1.split(',')
            items_2 = inter2.split(',')

            # Compare endpoints pairwise as symbolic expressions.
            for item_1, item_2 in zip(items_1, items_2):
                if not self.expression_equal(item_1, item_2):
                    return False
            return True

        interval1 = expression1
        interval2 = expression2

        # Fast path: textually identical intervals.
        if interval1 == interval2:
            return True
        else:
            inter_list1 = interval1.split("\\cup")
            inter_list2 = interval2.split("\\cup")

            if len(inter_list1) != len(inter_list2):
                return False
            else:
                for inter1, inter2 in zip(inter_list1, inter_list2):
                    if not compare_two_interval(inter1, inter2):
                        return False
                return True
|
| 317 |
+
|
| 318 |
+
    def preprocess(self, expression1, expression2):
        """Normalise two raw answer strings before symbolic comparison.

        Extraction order for each string: the content of every ``\\boxed{...}``
        (brace-balanced), else any ``$...$`` spans on the last line, else the
        raw string. The extracted text is then stripped of unit markers and
        decoration via ``sepcial_symbol_replace``.

        Returns the pair ``(exp1, exp2)`` of cleaned answer strings.
        """
        # Preprocess expressions to extract and replace special symbols
        def extract_boxed_content(latex_str):
            # Find every \boxed{ opener; braces are matched manually so
            # nested groups inside the box are kept intact.
            boxed_matches = re.finditer(r'\\boxed{', latex_str)
            results = ""

            for match in boxed_matches:
                start_index = match.end()
                end_index = start_index
                stack = 1

                # Walk forward until the matching closing brace.
                while stack > 0 and end_index < len(latex_str):
                    if latex_str[end_index] == '{':
                        stack += 1
                    elif latex_str[end_index] == '}':
                        stack -= 1
                    end_index += 1

                if stack == 0:
                    content = latex_str[start_index:end_index - 1]
                    # Multiple boxes accumulate as a comma-separated list.
                    results += content + ","
                else:
                    raise ValueError("Mismatched braces in LaTeX string.")

            if results == "":
                # Fallback 1: $...$ spans on the final line of the answer.
                last_line_ans = latex_str.strip().split("\n")[-1]
                dollar_pattern = r"\$(.*?)\$"
                answers = re.findall(dollar_pattern, last_line_ans)

                if answers:
                    for ans in answers:
                        results += ans + ","
                else:
                    # Fallback 2: use the raw string unchanged.
                    results = latex_str

            return results

        def sepcial_symbol_replace(expression):
            # Strip unit annotations (cm, m, 分米, 米, ...) that would break
            # symbolic parsing. NOTE(review): order matters — e.g. the plain
            # "cm" replace runs before "cm^{2}", so the latter pattern can
            # never match; confirm this chain is intentional.
            expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip()  # noqa: E501

            # Drop a trailing bare "m" unit.
            expression = re.sub(r"(.+)m$", r"\1", expression)

            # "x \in S" answers: keep only the set/interval part.
            if "\\in " in expression:
                expression = expression.split("\\in ")[1]

            # Apply the instance's symbol normalisation table.
            for signal in self.special_signal_map:
                expression = expression.replace(signal, self.special_signal_map[signal])

            # Rewrite degree-argument trig (e.g. \sin30) to radians.
            expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)

            # Trim stray punctuation/decoration from both ends.
            expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")

            # Unwrap \mathrm{...} / \mathbf{...} (optionally with leading ~).
            pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
            expression = re.sub(pattern, r'\1', expression)

            return expression

        exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)

        exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)

        return exp1, exp2
|
| 381 |
+
|
| 382 |
+
def can_compute_power(self, expr):
|
| 383 |
+
# Checks if a power expression can be computed
|
| 384 |
+
if isinstance(expr, Pow):
|
| 385 |
+
base, exp = expr.as_base_exp()
|
| 386 |
+
if base.is_number and exp.is_number:
|
| 387 |
+
MAX_EXP = 1000 # Adjust based on computing environment
|
| 388 |
+
if abs(exp.evalf()) > MAX_EXP:
|
| 389 |
+
return False
|
| 390 |
+
else:
|
| 391 |
+
return True
|
| 392 |
+
else:
|
| 393 |
+
return False
|
| 394 |
+
else:
|
| 395 |
+
return True # Not a power expression, can compute
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
class MMMath(ImageBaseDataset):
    """MM-Math free-form math VQA dataset.

    Answers are judged with ``AutoScoringJudge`` and accuracy is reported
    overall plus broken down by difficulty, year, and two knowledge levels.
    """

    TYPE = 'VQA'

    DATASET_URL = {
        'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
    }
    DATASET_MD5 = {
        'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
    }

    @classmethod
    def evaluate(cls, eval_file, **kwargs):
        """Score predictions in *eval_file* and dump per-category accuracy.

        Writes a ``hit`` column back into *eval_file* and a JSON score file
        next to it; returns the score dict.
        """
        data = load(eval_file)
        judger = AutoScoringJudge()

        # One judging task per (ground-truth, prediction) pair.
        tasks = [
            dict(expression1=gt, expression2=pred)
            for gt, pred in zip(data['answer'], data['prediction'])
        ]
        data['hit'] = track_progress_rich(judger.judge, tasks, nproc=16)
        dump(data, eval_file)

        score = {'overall': np.mean(data['hit'])}

        # (report prefix, dataframe column) pairs for per-category accuracy.
        breakdowns = [
            ('Difficulty', 'difficulty'),
            ('Year', 'year'),
            ('Knowledge-L1', 'knowledge_l1'),
            ('Knowledge-L2', 'knowledge_l2'),
        ]
        for prefix, column in breakdowns:
            for value in set(data[column]):
                subset = data[data[column] == value]
                score[f'{prefix}-{value}'] = np.mean(subset['hit'])

        score_file = eval_file.replace('.xlsx', '_score.json')
        dump(score, score_file)
        return score
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py
ADDED
|
@@ -0,0 +1,666 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import huggingface_hub
|
| 2 |
+
from huggingface_hub import snapshot_download
|
| 3 |
+
from ..smp import *
|
| 4 |
+
from .video_base import VideoBaseDataset
|
| 5 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 6 |
+
from ..utils import track_progress_rich
|
| 7 |
+
import torchvision.transforms as T
|
| 8 |
+
from torchvision import transforms
|
| 9 |
+
from torchvision.transforms.functional import InterpolationMode
|
| 10 |
+
from decord import VideoReader, cpu
|
| 11 |
+
import imageio
|
| 12 |
+
import cv2
|
| 13 |
+
import zipfile
|
| 14 |
+
import os
|
| 15 |
+
import glob
|
| 16 |
+
from .utils.mvbench import *
|
| 17 |
+
|
| 18 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class MVBench(VideoBaseDataset):
    """Video multiple-choice QA over the MVBench benchmark.

    Downloads the OpenGVLab/MVBench release (or its ModelScope mirror),
    builds a TSV index across 20 temporal-reasoning sub-tasks, renders each
    sample either as a processed video file (for video LLMs) or as sampled
    frames, and scores predictions by exact matching or with an LLM judge.
    """

    # MD5 of the generated TSV index, used to validate a cached download.
    MD5 = 'fd21d36522cdedd46d84dc46715ad832'
    # System prompt prepended to every question.
    SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""

    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MVBench', nframe=0, fps=-1):
        # Maps task name -> (annotation json, media path prefix, media type,
        # whether samples carry start/end timestamps). The literal
        # 'your_data_path' segment is rewritten to 'video' when the TSV
        # index is generated.
        self.type_data_list = {
            'Action Sequence': ('action_sequence.json',
                                'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Action Prediction': ('action_prediction.json',
                                  'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Action Antonym': ('action_antonym.json',
                               'your_data_path/ssv2_video/', 'video', False),
            'Fine-grained Action': ('fine_grained_action.json',
                                    'your_data_path/Moments_in_Time_Raw/videos/', 'video', False),
            'Unexpected Action': ('unexpected_action.json',
                                  'your_data_path/FunQA_test/test/', 'video', False),
            'Object Existence': ('object_existence.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'Object Interaction': ('object_interaction.json',
                                   'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Object Shuffle': ('object_shuffle.json',
                               'your_data_path/perception/videos/', 'video', False),
            'Moving Direction': ('moving_direction.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'Action Localization': ('action_localization.json',
                                    'your_data_path/sta/sta_video/', 'video', True),  # has start & end
            'Scene Transition': ('scene_transition.json',
                                 'your_data_path/scene_qa/video/', 'video', False),
            'Action Count': ('action_count.json',
                             'your_data_path/perception/videos/', 'video', False),
            'Moving Count': ('moving_count.json',
                             'your_data_path/clevrer/video_validation/', 'video', False),
            'Moving Attribute': ('moving_attribute.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'State Change': ('state_change.json',
                             'your_data_path/perception/videos/', 'video', False),
            'Fine-grained Pose': ('fine_grained_pose.json',
                                  'your_data_path/nturgbd/', 'video', False),
            'Character Order': ('character_order.json',
                                'your_data_path/perception/videos/', 'video', False),
            'Egocentric Navigation': ('egocentric_navigation.json',
                                      'your_data_path/vlnqa/', 'video', False),
            'Episodic Reasoning': ('episodic_reasoning.json',
                                   'your_data_path/tvqa/frames_fps3_hq/', 'frame', True),  # has start & end, read frame
            'Counterfactual Inference': ('counterfactual_inference.json',
                                         'your_data_path/clevrer/video_validation/', 'video', False),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MVBench']

    def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'):
        """Download/validate MVBench and return its root and TSV index path.

        Returns ``dict(root=..., data_file=...)``. Also initialises frame
        decoding state (``decord_method``, ``nframe``, ``frame_fps``,
        ``transform``) as a side effect.
        """
        def check_integrity(pth):
            # The cache is valid when the TSV exists, matches the pinned
            # MD5, and every referenced media file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = 'modelscope/MVBench'

        cache_path = get_cache_path(repo_id, branch='main')
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def unzip_hf_zip(pth):
                # Extract every zip archive under <pth>/video/ in place.
                pth = os.path.join(pth, 'video/')
                for filename in os.listdir(pth):
                    if filename.endswith('.zip'):
                        # Build the full path to the zip file
                        zip_path = os.path.join(pth, filename)

                        # Extract the ZIP archive
                        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                            zip_ref.extractall(pth)

            def generate_tsv(pth):
                # Flatten every task's JSON annotations into a single TSV
                # index; skipped when a valid TSV already exists.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(pth, 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])):
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1].replace('your_data_path', 'video'),
                                'data_type': v[2],
                                'bound': v[3],
                                'start': data['start'] if 'start' in data.keys() else None,
                                'end': data['end'] if 'end' in data.keys() else None,
                                'video': data['video'],
                                'question': data['question'],
                                'answer': data['answer'],
                                'candidates': data['candidates']
                            })
                        else:
                            # Missing media is fatal: typically the NTURGB-D
                            # archive, which upstream removed for licensing.
                            print(
                                'NTURGB-D zip file is removed according to MVBench, you can view it at '
                                'https://huggingface.co/datasets/OpenGVLab/MVBench for detailed reason.'
                            )
                            raise Exception(
                                f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist"
                            )

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            def move_files(pth):
                # Hoist videos out of the nested 'data0613' staging folder
                # into the canonical video/<subdir>/<subsubdir>/ layout.
                src_folder = os.path.join(pth, 'video/data0613')
                if not os.path.exists(src_folder):
                    return
                for subdir in os.listdir(src_folder):
                    subdir_path = os.path.join(src_folder, subdir)
                    if os.path.isdir(subdir_path):
                        for subsubdir in os.listdir(subdir_path):
                            subsubdir_path = os.path.join(subdir_path, subsubdir)
                            if os.path.isdir(subsubdir_path):
                                for item in os.listdir(subsubdir_path):
                                    item_path = os.path.join(subsubdir_path, item)
                                    target_folder = os.path.join(pth, 'video', subdir, subsubdir)
                                    if not os.path.exists(target_folder):
                                        os.makedirs(target_folder)
                                    target_path = os.path.join(target_folder, item)
                                    try:
                                        shutil.move(item_path, target_path)
                                    except Exception as e:
                                        # Best-effort: report and continue with the rest.
                                        print(f"Error moving {item_path} to {target_path}: {e}")

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master')
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            unzip_hf_zip(dataset_path)
            move_files(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        # Frame readers keyed by each sample's media type.
        self.decord_method = {
            'video': self.read_video,
            'gif': self.read_gif,
            'frame': self.read_frame,
        }

        self.nframe = 8
        self.frame_fps = 3

        # transform
        self.transform = T.Compose([
            Stack(),
            ToTorchFormatTensor()
        ])

        return dict(root=dataset_path, data_file=data_file)

    def get_index(self, bound, fps, max_frame, first_idx=0):
        """Return ``self.num_segments`` frame indices uniformly covering the
        (optionally time-bounded) clip; each index sits at a segment midpoint."""
        if bound:
            start, end = bound[0], bound[1]
        else:
            # Sentinels far outside any real clip so the clamps below fall
            # back to [first_idx, max_frame].
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])
        return frame_indices

    def read_video(self, video_path, bound=None):
        """Decode a video with decord and return sampled frames as a tensor
        (shape determined by the Stack/ToTorchFormatTensor transform)."""
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())

        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        for frame_index in frame_indices:
            img = Image.fromarray(vr[frame_index].asnumpy())
            images_group.append(img)
        torch_imgs = self.transform(images_group)
        return torch_imgs

    def read_gif(self, video_path, bound=None, fps=25):
        """Decode a GIF and return sampled frames; *fps* is assumed, not read
        from the file — TODO confirm 25 matches the dataset's GIFs."""
        gif = imageio.get_reader(video_path)
        max_frame = len(gif) - 1

        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        for index, frame in enumerate(gif):
            if index in frame_indices:
                # GIF frames decode as RGBA; drop the alpha channel.
                img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
                img = Image.fromarray(img)
                images_group.append(img)
        torch_imgs = self.transform(images_group)
        return torch_imgs

    def read_frame(self, video_path, bound=None, fps=3):
        """Load pre-extracted frames (files named ``00001.jpg``, ...) from a
        directory and return the sampled subset as a tensor."""
        max_frame = len(os.listdir(video_path))
        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=1)  # frame_idx starts from 1
        for frame_index in frame_indices:
            img = Image.open(os.path.join(video_path, f'{frame_index:05d}.jpg'))
            images_group.append(img)
        torch_imgs = self.transform(images_group)
        return torch_imgs

    def save_video_frames(self, imgs, video_name, frames):
        """Split the stacked frame tensor back into *frames* images, save any
        that are missing on disk, and return their paths."""

        frame_paths = self.frame_paths(video_name)
        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # imgs stacks all frames along dim 0; recover per-frame chunks.
            block_size = imgs.size(0) // frames
            split_tensors = torch.split(imgs, block_size)
            to_pil = transforms.ToPILImage()
            images = [to_pil(arr) for arr in split_tensors]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def qa_template(self, data):
        """Format one record into a lettered-options question string and its
        '(X) answer' ground-truth string.

        NOTE(review): 'candidates' is eval()'d from its TSV string form —
        acceptable only because the TSV is generated locally.
        """
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def load_into_video_and_process(self, line):
        """Convert a sample's media (gif/webm/frame-dir) into an mp4, trim it
        to [start, end] when the sample is bounded, and return the path.

        Converted/trimmed files are cached next to the source media and
        reused on subsequent calls.
        """
        try:
            from moviepy.editor import VideoFileClip, ImageSequenceClip
        except:
            raise ImportError(
                'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"'
            )
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])

        if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']:
            processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4')
            if not os.path.exists(processed_video_path):
                # using MoviePy to transform GIF, webm into mp4 format
                gif_clip = VideoFileClip(video_path)
                gif_clip.write_videofile(processed_video_path, codec='libx264')
                gif_clip.close()
        elif line['data_type'] in ['frame']:
            input_images = os.path.join(video_path, '*.jpg')
            processed_video_path = f'{video_path}.mp4'
            if not os.path.exists(processed_video_path):
                # using MoviePy to transform images into mp4
                image_files = sorted(glob.glob(input_images))
                image_clip = ImageSequenceClip(image_files, fps=self.frame_fps)
                image_clip.write_videofile(processed_video_path, codec='libx264')
                image_clip.close()
        else:
            processed_video_path = video_path

        if line['bound']:
            # Cut the clip to the annotated [start, end] window.
            base_name, suffix = os.path.splitext(processed_video_path)
            output_video_path = f'{base_name}_processed{suffix}'
            if not os.path.exists(output_video_path):
                video_clip = VideoFileClip(processed_video_path)
                clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration))
                clip.write_videofile(output_video_path)
                clip.close()
        else:
            output_video_path = processed_video_path

        return output_video_path

    def save_video_into_images(self, line):
        """Decode one sample (respecting its time bounds) and persist its
        sampled frames as images; returns the frame paths."""
        bound = None
        if line['bound']:
            bound = (
                line['start'],
                line['end'],
            )
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        decord_method = self.decord_method[line['data_type']]
        self.num_segments = self.nframe
        torch_imgs = decord_method(video_path, bound)
        img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
        return img_frame_paths

    def build_prompt(self, line, video_llm):
        """Build the multimodal message list for one sample: system prompt,
        question, video or frame images, and an answer-forcing suffix."""
        if self.fps > 0:
            raise ValueError('MVBench does not support fps setting, please transfer to MVBench_MP4!')
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        if video_llm:
            new_video_path = self.load_into_video_and_process(line)
            message.append(dict(type='video', value=new_video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        # 'Best option:(' primes the model to answer with a bare letter.
        message.append(dict(type='text', value='Best option:(', role='assistant'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score an xlsx of predictions and return the dimension rating.

        Falls back from an LLM judge to exact matching when no working
        OpenAI key is configured. Unanswered questions score -1.
        """

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # NOTE(review): `res` is loaded/filtered here but never read
            # again below — looks like a leftover from a resumable-eval
            # path; confirm before removing.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data_un['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                # Expose options as A/B/C/... keys and rewrite the answer
                # to its letter, as expected by check_ans_with_model.
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    # NOTE(review): `data.loc[idx, ...]` indexes by label,
                    # assuming the frame's index labels equal the 'index'
                    # column — confirm that invariant holds for these files.
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MVBench'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
class MVBench_MP4(VideoBaseDataset):
|
| 427 |
+
|
| 428 |
+
MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5'
|
| 429 |
+
SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
|
| 430 |
+
the detail and movement of objects, and the action and pose of persons. \
|
| 431 |
+
Based on your observations, select the best option that accurately addresses the question.
|
| 432 |
+
"""
|
| 433 |
+
TYPE = 'Video-MCQ'
|
| 434 |
+
|
| 435 |
+
def __init__(self, dataset='MVBench_MP4', nframe=0, fps=-1):
|
| 436 |
+
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
|
| 437 |
+
|
| 438 |
+
@classmethod
|
| 439 |
+
def supported_datasets(cls):
|
| 440 |
+
return ['MVBench_MP4']
|
| 441 |
+
|
| 442 |
+
    def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'):
        """Locate (or download) the MVBench MP4 data and build its TSV index.

        Returns a dict with `root` (dataset directory) and `data_file` (TSV path).
        Also installs the PIL->tensor transform used later by read_video().
        """
        def check_integrity(pth):
            # Intact only if: the TSV exists, its md5 matches the pinned
            # MP4_MD5, and every video referenced by the TSV is on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MP4_MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            # Prefer the ModelScope mirror when the user opted in via env flag.
            repo_id = 'modelscope/MVBench'

        cache_path = get_cache_path(repo_id, branch='video')
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Flatten test.json (per-task metadata) into a single TSV index.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5:
                    return
                json_data_path = os.path.join(dataset_path, 'test.json')
                json_data = load(json_data_path)
                root_data_dict = json_data['root']
                self.data_list = []
                for k, v in json_data['meta'].items():
                    for item in v:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': root_data_dict[k],
                            'video': item['video'],
                            'question': item['question'],
                            'answer': item['answer'],
                            'candidates': item['candidates']
                        })
                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video')
            else:
                # NOTE(review): login is attempted even when HUGGINGFACE_TOKEN is
                # unset (hf_token may be None) — confirm this is intended.
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video')
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        # transform: stack sampled PIL frames and convert to a torch tensor
        self.transform = T.Compose([
            Stack(),
            ToTorchFormatTensor()
        ])

        return dict(root=dataset_path, data_file=data_file)
|
| 505 |
+
|
| 506 |
+
def qa_template(self, data):
|
| 507 |
+
question = f"Question: {data['question']}\n"
|
| 508 |
+
question += 'Options:\n'
|
| 509 |
+
answer = data['answer']
|
| 510 |
+
answer_idx = -1
|
| 511 |
+
for idx, c in enumerate(eval(data['candidates'])):
|
| 512 |
+
question += f"({chr(ord('A') + idx)}) {c}\n"
|
| 513 |
+
if c == answer:
|
| 514 |
+
answer_idx = idx
|
| 515 |
+
question = question.rstrip()
|
| 516 |
+
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
|
| 517 |
+
return question, answer
|
| 518 |
+
|
| 519 |
+
def get_index_by_frame(self, max_frame):
|
| 520 |
+
seg_size = float(max_frame) / self.num_segments
|
| 521 |
+
frame_indices = np.array([
|
| 522 |
+
int((seg_size / 2) + np.round(seg_size * idx))
|
| 523 |
+
for idx in range(self.num_segments)
|
| 524 |
+
])
|
| 525 |
+
return frame_indices
|
| 526 |
+
|
| 527 |
+
def get_index_by_fps(self, vid, fps):
|
| 528 |
+
total_frames = len(vid)
|
| 529 |
+
video_fps = vid.get_avg_fps()
|
| 530 |
+
total_duration = total_frames / video_fps
|
| 531 |
+
required_frames = int(total_duration * fps)
|
| 532 |
+
step_size = video_fps / fps
|
| 533 |
+
frame_indices = np.array([int(i * step_size) for i in range(required_frames)])
|
| 534 |
+
self.num_segments = len(frame_indices)
|
| 535 |
+
return frame_indices
|
| 536 |
+
|
| 537 |
+
def read_video(self, video_path):
|
| 538 |
+
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
|
| 539 |
+
max_frame = len(vr) - 1
|
| 540 |
+
|
| 541 |
+
images_group = list()
|
| 542 |
+
if self.fps < 0:
|
| 543 |
+
frame_indices = self.get_index_by_frame(max_frame)
|
| 544 |
+
else:
|
| 545 |
+
frame_indices = self.get_index_by_fps(vr, self.fps)
|
| 546 |
+
|
| 547 |
+
for frame_index in frame_indices:
|
| 548 |
+
img = Image.fromarray(vr[frame_index].asnumpy())
|
| 549 |
+
images_group.append(img)
|
| 550 |
+
torch_imgs = self.transform(images_group)
|
| 551 |
+
return torch_imgs
|
| 552 |
+
|
| 553 |
+
def save_video_frames(self, imgs, video_name, frames):
|
| 554 |
+
if self.fps > 0:
|
| 555 |
+
frame_paths = self.frame_paths_fps(video_name, frames)
|
| 556 |
+
else:
|
| 557 |
+
frame_paths = self.frame_paths(video_name)
|
| 558 |
+
flag = np.all([osp.exists(p) for p in frame_paths])
|
| 559 |
+
|
| 560 |
+
if not flag:
|
| 561 |
+
block_size = imgs.size(0) // frames
|
| 562 |
+
split_tensors = torch.split(imgs, block_size)
|
| 563 |
+
to_pil = transforms.ToPILImage()
|
| 564 |
+
images = [to_pil(arr) for arr in split_tensors]
|
| 565 |
+
for im, pth in zip(images, frame_paths):
|
| 566 |
+
if not osp.exists(pth):
|
| 567 |
+
im.save(pth)
|
| 568 |
+
|
| 569 |
+
return frame_paths
|
| 570 |
+
|
| 571 |
+
def save_video_into_images(self, line):
|
| 572 |
+
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
|
| 573 |
+
if self.fps <= 0:
|
| 574 |
+
self.num_segments = self.nframe
|
| 575 |
+
else:
|
| 576 |
+
self.num_segments = 0
|
| 577 |
+
torch_imgs = self.read_video(video_path)
|
| 578 |
+
img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
|
| 579 |
+
return img_frame_paths
|
| 580 |
+
|
| 581 |
+
def build_prompt(self, line, video_llm):
|
| 582 |
+
if isinstance(line, int):
|
| 583 |
+
assert line < len(self)
|
| 584 |
+
line = self.data.iloc[line]
|
| 585 |
+
|
| 586 |
+
question, answer = self.qa_template(line)
|
| 587 |
+
message = [dict(type='text', value=self.SYS, role='system')]
|
| 588 |
+
message.append(dict(type='text', value=question))
|
| 589 |
+
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
|
| 590 |
+
if video_llm:
|
| 591 |
+
message.append(dict(type='video', value=video_path))
|
| 592 |
+
else:
|
| 593 |
+
img_frame_paths = self.save_video_into_images(line)
|
| 594 |
+
for im in img_frame_paths:
|
| 595 |
+
message.append(dict(type='image', value=im))
|
| 596 |
+
message.append(dict(type='text', value='\nOnly give the best option.'))
|
| 597 |
+
message.append(dict(type='text', value='Best option:(', role='assistant'))
|
| 598 |
+
return message
|
| 599 |
+
|
| 600 |
+
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score an MVBench_MP4 prediction file and return the dimension rating.

        Uses an LLM judge when available, otherwise exact matching; writes a
        *_score.xlsx and a *_rating.json next to `eval_file`.
        """
        # NOTE(review): decorated @classmethod but the first parameter is named
        # `self`; it actually receives the class object.

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            # Fall back to exact matching when no working judge is available.
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # NOTE(review): `res` (prior partial results minus API failures) is
            # loaded but never consulted below — the resume cache appears unused.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            # Rows with a non-null prediction are the only ones scored.
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data_un['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                # 'candidates' is a stringified list in the sheet; eval restores it.
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                # Expose options as keys 'A', 'B', ... and re-letter the answer.
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    # NOTE(review): `idx` is used here as a dataframe *label*;
                    # this assumes the frame's index labels equal the 'index'
                    # column values — confirm upstream loading guarantees that.
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MVBench_MP4'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/slidevqa.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import math
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
from vlmeval.dataset.utils.judge_util import build_judge
|
| 6 |
+
from vlmeval.smp import *
|
| 7 |
+
from .image_base import ImageBaseDataset
|
| 8 |
+
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_f1(gt, pred):
    """Bag-of-words F1 between a reference and a predicted answer string.

    Tokens are whitespace-separated; each prediction token counts toward the
    overlap independently.  Returns 0.0 when either side is empty or there is
    no overlap.
    """
    ref_tokens = gt.strip().split()
    hyp_tokens = pred.strip().split()
    if not ref_tokens or not hyp_tokens:
        return 0.0

    overlap = sum(1 for tok in hyp_tokens if tok in ref_tokens)
    recall = overlap / len(ref_tokens)
    precision = overlap / len(hyp_tokens)
    if (recall + precision) > 1e-4:
        return 2 * recall * precision / (recall + precision)
    return 0.0
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def SlideVQA_acc(result_file):
    """Compute ANLS / exact-match / F1 for SlideVQA predictions.

    Reads the scored file, appends per-item metric columns, writes the file
    back in place, and returns a small summary dataframe (one row per metric).
    """
    data = load(result_file)
    anls_list, em_list, f1_list = list(), list(), list()
    for i in range(len(data)):
        # NOTE(review): `item` is an iloc row; assigning into it mutates a
        # local copy only (the normalized values are just used below).
        item = data.iloc[i]
        # NaN answers denote unanswerable questions in the annotation.
        if isinstance(item['answer'], float) and math.isnan(item['answer']):
            item['answer'] = 'Not answerable'

        # Normalize both sides to lowercase, newline-free strings.
        item['answer'] = re.sub('\n', '', item['answer']).lower()
        item['pred'] = str(item['pred']).lower()
        anls_score = anls_compute(item['answer'], item['pred'])
        em_score = (item['answer'].strip() == item['pred'].strip())
        f1_score = get_f1(item['answer'], item['pred'])
        anls_list.append(anls_score)
        em_list.append(em_score)
        f1_list.append(f1_score)
        print('---------------------')
        print(item['answer'], item['pred'], anls_score, em_score, f1_score)

    data['anls'] = anls_list
    data['em'] = em_list
    data['f1'] = f1_list
    dump(data, result_file)

    res = dict()
    res['category'], res['num'] = ['anls', 'EM', 'F1'], [len(data), len(data), len(data)]
    res['avg'] = [sum(anls_list) / len(data), sum(em_list) / len(data), sum(f1_list) / len(data)]
    res = pd.DataFrame(res)
    return res
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class SlideVQA(ImageBaseDataset):
    """SlideVQA: question answering over multi-page slide decks.

    Pages may be concatenated into composite images depending on the model's
    (concat_num, column_num) entry in SUPPORTED_MODELS; scoring uses ANLS,
    exact match, and bag-of-words F1 (see SlideVQA_acc).
    """

    TYPE = 'VQA'

    DATASET_URL = {
        'SLIDEVQA_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA_MINI.tsv',
        'SLIDEVQA': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA.tsv',
    }
    DATASET_MD5 = {
        'SLIDEVQA_MINI': '6d9a8d8814fa5b7669deb2af3a3208eb',
        'SLIDEVQA': '5e822c2f800e94c1e23badfd478326b6',
    }

    # model name -> (concat_num, column_num); column_num == -1 means
    # "concatenate all pages into a single image".
    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'XComposer2d5': (1, -1),
        'XComposer2_4KHD': (1, -1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
    }

    def __init__(self, dataset, **kwargs):
        # `model` is required: page-concatenation settings are model-specific.
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on SlideVQA.".format(model_name))
        super(SlideVQA, self).__init__(dataset)

        # API (GPT4*) models receive raw pages; local models get concatenations.
        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120  # hard cap on slide pages per sample
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def dump_image(self, origin_line):
        """Materialize a sample's page images on disk and return their paths.

        Base64 payloads in the TSV are decoded to files; for non-API models
        the pages are then concatenated per (concat_num, column_num).
        """
        os.makedirs(self.img_root, exist_ok=True)

        line = origin_line.copy()
        if not isinstance(line['image_path'], List):
            line['image_path'] = [line['image_path']]
        # Truncate overly long decks to the page cap.
        line['image_path'] = line['image_path'][:self.max_pages]

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            # Derive concat filenames from the first page's name stem.
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions with an LLM (MMLongBench protocol) and score them."""
        # NOTE(review): decorated @classmethod but the first parameter is
        # named `self`; it actually receives the class object.
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. ')
        else:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume from a previous partial run when a pickle cache exists.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = list()
                for model, line in tqdm(tups):
                    res = MMLongBench_auxeval(model, line)
                    new_results.append(res)

                # NOTE(review): `new_results` covers only the *remaining* tups,
                # but is zipped against `all_inds` (every row).  If a partial
                # cache existed, results would be misaligned — confirm.
                log_map, res_map, pred_map = {}, {}, {}
                all_inds = [line['index'] for line in lines]
                for k, v in zip(all_inds, new_results):
                    log_map[k] = v['log']
                    res_map[k] = v['res']
                    pred_map[k] = v['pred']
                data['res'] = [res_map[idx] for idx in data['index']]
                data['log'] = [log_map[idx] for idx in data['index']]
                data['pred'] = [pred_map[idx] for idx in data['index']]
                dump(data, storage)

        score = SlideVQA_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py
ADDED
|
@@ -0,0 +1,639 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import huggingface_hub
|
| 2 |
+
from huggingface_hub import snapshot_download
|
| 3 |
+
from ..smp import *
|
| 4 |
+
from .video_concat_dataset import ConcatVideoDataset
|
| 5 |
+
from .video_base import VideoBaseDataset
|
| 6 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 7 |
+
from ..utils import track_progress_rich
|
| 8 |
+
import torchvision.transforms as T
|
| 9 |
+
from torchvision import transforms
|
| 10 |
+
from torchvision.transforms.functional import InterpolationMode
|
| 11 |
+
from decord import VideoReader, cpu
|
| 12 |
+
from .utils.tempcompass import *
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TempCompass(ConcatVideoDataset):
    """Umbrella dataset concatenating the three TempCompass sub-task datasets."""

    def __init__(self, dataset='TempCompass', nframe=0, fps=-1):
        # Register the three sub-splits before the concat base class reads them.
        self.DATASET_SETS[dataset] = ['TempCompass_MCQ', 'TempCompass_Captioning', 'TempCompass_YorN']
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass']

    def evaluate(self, eval_file, **judge_kwargs):
        """Aggregate sub-task results into per-dimension / per-task accuracies.

        Writes an *_acc.csv next to `eval_file` and returns the result frame.
        """
        result = super().evaluate(eval_file=eval_file, **judge_kwargs)
        suffix = eval_file.split('.')[-1]
        result = result.reset_index().rename(columns={'index': 'dim.task_type'})
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        avg_dict = {}
        for idx, item in result.iterrows():
            # Row keys look like "<dim>. <task_type>"; accumulate counts along
            # both axes plus a grand 'overall' bucket.
            dim, task_type = item['dim.task_type'].split('. ')
            if dim not in avg_dict:
                avg_dict[dim] = {'success': 0.0, 'overall': 0.0}
            if task_type not in avg_dict:
                avg_dict[task_type] = {'success': 0.0, 'overall': 0.0}
            if 'overall' not in avg_dict:
                avg_dict['overall'] = {'success': 0.0, 'overall': 0.0}
            avg_dict[dim]['success'] += item['success']
            avg_dict[dim]['overall'] += item['overall']
            avg_dict[task_type]['success'] += item['success']
            avg_dict[task_type]['overall'] += item['overall']
            avg_dict['overall']['success'] += item['success']
            avg_dict['overall']['overall'] += item['overall']
            result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 2)
        for key, value in avg_dict.items():
            # use loc to append one aggregate row per dim/task/overall bucket
            result.loc[len(result)] = {
                'dim.task_type': key,
                'success': value['success'],
                'overall': value['overall'],
                'acc': round(value['success'] / value['overall'] * 100, 2)
            }
        dump(result, score_file)
        return result
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class TempCompass_MCQ(VideoBaseDataset):
    """TempCompass multiple-choice split (multi-choice + caption_matching tasks)."""

    MD5 = '7efbb9e6d9dabacd22daf274852691dd'
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='TempCompass_MCQ', nframe=0, fps=-1):
        # task -> (annotation json, video dir, video filename suffix)
        self.type_data_list = {
            'multi-choice': ('multi-choice.json', './videos', '.mp4'),
            'caption_matching': ('caption_matching.json', './videos', '.mp4'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass_MCQ']

    def prepare_dataset(self, dataset_name='TempCompass_MCQ', repo_id='lmms-lab/TempCompass'):
        """Download/locate TempCompass, unzip its videos, and build the TSV index.

        Returns a dict with `root` (dataset directory) and `data_file` (TSV path).
        """
        def check_integrity(pth):
            # Intact only if: TSV exists, md5 matches, every video file exists.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert each task's parquet shard to a json file once.
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            # The question field carries the options after the
                            # first newline; split them apart here.
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'suffix': v[2],
                                'video': data['video_id'],
                                'question': data['question'].split('\n')[0],
                                'answer': data['answer'],
                                'dim': data['dim'],
                                'candidates': data['question'].split('\n')[1:],
                            })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Rebuild the full MCQ prompt (stem + options) and return (question, answer)."""
        question = data['question'] + '\n' + '\n'.join(eval(data['candidates']))
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Extract and cache frames for one video; returns the frame file paths."""
        vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Fixed-count mode: evenly spaced interior frames.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(line['video'])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(line['video'], len(indices))
        # NOTE(review): when neither branch is taken (nframe == 0 and fps <= 0),
        # `indices`/`frame_paths` are unbound and the next line raises NameError.

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        # Thin wrapper kept for interface parity with sibling datasets.
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the message list for one sample (video path or frame images)."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = []
        message.append(dict(type='text', value=question))
        video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nPlease directly give the best option:'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score MCQ predictions (LLM judge or exact matching); returns the rating."""
        # NOTE(review): decorated @classmethod but the first parameter is named
        # `self`; it actually receives the class object.
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                # NOTE(review): `sys_prompt` comes from the star import of
                # .utils.tempcompass — confirm it is defined there.
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume support: skip items already present in the pickle cache.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_mcq,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
class TempCompass_Captioning(VideoBaseDataset):
    """TempCompass free-form captioning split (HF repo ``lmms-lab/TempCompass``).

    Each record pairs a short video with an open-ended question; frames are
    extracted and cached on demand for models that cannot consume raw video.
    """

    # MD5 of the generated TSV index; used to detect a stale or corrupt cache.
    MD5 = '35be9bf2581ea7767f02e9a8f37ae1ab'
    TYPE = 'Video-VQA'

    def __init__(self, dataset='TempCompass_Captioning', nframe=0, fps=-1):
        # task name -> (source json file, video sub-directory, video extension)
        self.type_data_list = {
            'captioning': ('captioning.json', './videos', '.mp4'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        """Return the dataset names this class can serve."""
        return ['TempCompass_Captioning']

    def prepare_dataset(self, dataset_name='TempCompass_Captioning', repo_id='lmms-lab/TempCompass'):
        """Download/convert the dataset if needed; return its local layout.

        Returns:
            dict with ``root`` (dataset directory) and ``data_file`` (path to
            the generated TSV index).
        """
        def check_integrity(pth):
            # Cache is valid only if the TSV exists, its md5 matches,
            # and every referenced video file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert each task's parquet shard to a JSON record file (once).
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                # Extract the video archive once; skipped if 'videos/' already exists.
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                # Flatten every task's JSON records into a single TSV index.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'suffix': v[2],
                                'video': data['video_id'],
                                'question': data['question'],
                                'answer': data['answer'],
                                'dim': data['dim'],
                                'mc_question': data['mc_question'],
                                'mc_answer': data['mc_answer'],
                            })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return the (question, answer) pair for one record."""
        question = data['question']
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Decode and cache frames for one video; return the frame image paths.

        Sampling: ``nframe`` evenly spaced frames when ``nframe > 0`` and
        ``fps < 0``; fixed-rate sampling when ``fps > 0``.
        NOTE(review): if neither condition holds, ``frame_paths`` is never
        bound and this raises NameError — presumably the base class guarantees
        one of the two sampling modes is configured; confirm.
        """
        vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Evenly spaced interior frames (endpoints excluded).
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(line['video'])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(line['video'], len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # Only decode the video when at least one cached frame is missing.
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        """Thin wrapper kept for interface parity with other video datasets."""
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the multi-modal message for one sample.

        Video-native models receive the raw video path; other models receive
        the cached frame images instead.
        """
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = []
        message.append(dict(type='text', value=question))
        video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        return message

    # NOTE(review): decorated @classmethod but the first parameter is named
    # 'self' (it is bound to the class object at call time).
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge captioning predictions and return per-dimension ratings.

        Verdicts are checkpointed per-sample into a .pkl so interrupted
        evaluations can resume; final scores are dumped to an .xlsx file.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        # Fixed decoding parameters for the judge model.
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                # 'model' is rebound from the name string to the judge object.
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                # Resume: keep only samples without a cached verdict.
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_captioning,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            # NOTE(review): assumes the DataFrame index coincides with the
            # 'index' column used as checkpoint keys — confirm upstream.
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
class TempCompass_YorN(VideoBaseDataset):
    """TempCompass yes/no split (HF repo ``lmms-lab/TempCompass``).

    Each record pairs a short video with a binary question; frames are
    extracted and cached on demand for models that cannot consume raw video.
    """

    # MD5 of the generated TSV index; used to detect a stale or corrupt cache.
    MD5 = 'c72c046d7fa0e82c8cd7462f2e844ea8'
    TYPE = 'Video-Y/N'

    def __init__(self, dataset='TempCompass_YorN', nframe=0, fps=-1):
        # task name -> (source json file, video sub-directory, video extension)
        self.type_data_list = {
            'yes_no': ('yes_no.json', './videos', '.mp4'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        """Return the dataset names this class can serve."""
        return ['TempCompass_YorN']

    def prepare_dataset(self, dataset_name='TempCompass_YorN', repo_id='lmms-lab/TempCompass'):
        """Download/convert the dataset if needed; return its local layout.

        Returns:
            dict with ``root`` (dataset directory) and ``data_file`` (path to
            the generated TSV index).
        """
        def check_integrity(pth):
            # Cache is valid only if the TSV exists, its md5 matches,
            # and every referenced video file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert each task's parquet shard to a JSON record file (once).
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                # Extract the video archive once; skipped if 'videos/' already exists.
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                # Flatten every task's JSON records into a single TSV index.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'suffix': v[2],
                                'video': data['video_id'],
                                # Keep only the question text; the original
                                # string carries extra lines after '\n'.
                                'question': data['question'].split('\n')[0],
                                'answer': data['answer'],
                                'dim': data['dim']
                            })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return the (question, answer) pair for one record."""
        question = data['question']
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Decode and cache frames for one video; return the frame image paths.

        Sampling: ``nframe`` evenly spaced frames when ``nframe > 0`` and
        ``fps < 0``; fixed-rate sampling when ``fps > 0``.
        NOTE(review): if neither condition holds, ``frame_paths`` is never
        bound and this raises NameError — presumably the base class guarantees
        one of the two sampling modes is configured; confirm.
        """
        vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Evenly spaced interior frames (endpoints excluded).
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(line['video'])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(line['video'], len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # Only decode the video when at least one cached frame is missing.
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        """Thin wrapper kept for interface parity with other video datasets."""
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the multi-modal message for one sample.

        Video-native models receive the raw video path; other models receive
        the cached frame images. A trailing instruction constrains the model
        to a yes/no answer.
        """
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = []
        message.append(dict(type='text', value=question))
        video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nPlease answer yes or no:'))
        return message

    # NOTE(review): decorated @classmethod but the first parameter is named
    # 'self' (it is bound to the class object at call time).
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge yes/no predictions and return per-dimension ratings.

        Verdicts are checkpointed per-sample into a .pkl so interrupted
        evaluations can resume; final scores are dumped to an .xlsx file.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        # Fixed decoding parameters for the judge model.
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                # 'model' is rebound from the name string to the judge object.
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                # Resume: keep only samples without a cached verdict.
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_YorN,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            # NOTE(review): assumes the DataFrame index coincides with the
            # 'index' column used as checkpoint keys — confirm upstream.
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_base.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import abstractmethod
|
| 2 |
+
from ..smp import *
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class TextBaseDataset:
    """Base class for text-only benchmark datasets.

    Subclasses declare ``DATASET_URL`` / ``DATASET_MD5`` (or override
    ``load_data``) and may override ``post_build`` and ``build_prompt``.
    The loaded table is a pandas DataFrame with an ``index`` column.
    """

    MODALITY = 'TEXT'
    DATASET_URL = {}
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', **kwargs):
        self.dataset_name = dataset

        frame = self.load_data(dataset)

        # Normalise the index column: stringify first, then fall back to
        # plain ints when every value looks like an integer.
        frame['index'] = [str(v) for v in frame['index']]
        if np.all([istype(v, int) for v in frame['index']]):
            frame['index'] = [int(v) for v in frame['index']]

        self.data = frame
        self.post_build(dataset)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        """Ensure the dataset TSV is present locally (and localized if huge)."""
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        fname = url.split('/')[-1]
        data_path = osp.join(data_root, fname)

        freshly_downloaded = False
        cached_ok = osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5)
        if not cached_ok:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            freshly_downloaded = True

        # For TSVs above 1 GB, build (and reuse) a slimmed-down local copy.
        if file_size(data_path, 'GB') > 1:
            localized = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(localized) or os.environ.get('FORCE_LOCAL', None) or freshly_downloaded:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, localized)
            data_path = localized
        return load(data_path)

    def dump_image(self, line):
        """Text-only datasets carry no images; always an empty list."""
        return []

    def display(self, line):
        """Pretty-print one record (int index or record object)."""
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, (pd.Series, dict))
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        return self.prepare_tsv(self.DATASET_URL[dataset], self.DATASET_MD5[dataset])

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        return [dict(type='text', value=line['question'])]

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
|
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_mcq.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .text_base import TextBaseDataset
|
| 2 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
| 3 |
+
from ..smp import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TextMCQDataset(TextBaseDataset):
    """Text-only multiple-choice benchmark (options in columns 'A', 'B', ...)."""

    TYPE = 'MCQ'

    DATASET_URL = {}

    DATASET_MD5 = {}

    def build_prompt(self, line):
        """Format one record as a hint / question / options MCQ text prompt."""

        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']
        # Collect the non-empty option columns named by uppercase letters.
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above. \n'

        msgs = []

        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score MCQ predictions (exact matching or an LLM judge); dump accuracy CSV.

        Falls back to exact matching when the judge API key is missing or the
        API is not reachable. Returns the accuracy report DataFrame.
        """
        from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
        # assert dataset is not None
        # Map test-split names to the canonical dataset name used for reporting.
        dataset_map = {
            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
            'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
        }
        dataset = self.dataset_name
        if dataset in dataset_map:
            dataset = dataset_map[dataset]
        nproc = judge_kwargs.pop('nproc', 4)

        circular = False

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                # Judge unreachable: degrade gracefully to exact matching.
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        meta = self.data
        # Sanity check: every evaluated question must exist in the dataset.
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        if circular:
            data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
        else:
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

        # load split
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        # May have different report acc functions for different datasets
        if 'MMT' in dataset:
            acc = report_acc_MMT(data)
        else:
            acc = report_acc(data)

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(acc, score_file)

        return acc
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class CustomTextMCQDataset(TextMCQDataset):
    """TextMCQDataset variant backed by a user-provided TSV under LMUDataRoot."""

    def load_data(self, dataset):
        """Load ``<LMUDataRoot>/<dataset>.tsv``, localizing it when very large."""
        tsv_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        # For TSVs above 1 GB, build (and reuse) a slimmed-down local copy.
        if file_size(tsv_path, 'GB') > 1:
            localized = tsv_path.replace('.tsv', '_local.tsv')
            needs_refresh = not osp.exists(localized) or os.environ.get('FORCE_LOCAL', None)
            if needs_refresh:
                from ..tools import LOCALIZE
                LOCALIZE(tsv_path, localized)
            tsv_path = localized
        return load(tsv_path)
|