1f committed on
Commit
885ccec
·
verified ·
1 Parent(s): 81aa597

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py +240 -0
  2. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py +172 -0
  3. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py +75 -0
  4. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py +197 -0
  5. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py +904 -0
  6. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mt.py +128 -0
  7. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py +1475 -0
  8. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py +95 -0
  9. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py +328 -0
  10. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py +167 -0
  11. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py +455 -0
  12. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py +256 -0
  13. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py +69 -0
  14. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py +584 -0
  15. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py +446 -0
  16. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py +666 -0
  17. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/slidevqa.py +189 -0
  18. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py +639 -0
  19. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_base.py +88 -0
  20. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_mcq.py +123 -0
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import sympy as sp
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sympy import simplify, Eq, sympify, Pow, pi
7
+ from sympy.parsing.latex import parse_latex
8
+ import sys
9
+ import math
10
+ import os
11
+ import os.path as osp
12
+ import argparse
13
+
14
+ from .image_base import ImageBaseDataset
15
+ from .utils import build_judge
16
+ from ..utils import track_progress_rich
17
+ from ..smp import load, dump, d2df, toliststr
18
+
19
+
20
def preprocess(str1):
    """Extract the outermost ``{...}`` span from a model response and undo escaping.

    The DynaMath prompt asks for a JSON object; everything outside the first
    ``{`` .. last ``}`` pair is dropped before parsing.

    Args:
        str1: raw free-form model prediction.

    Returns:
        The trimmed string with escaped newlines restored and remaining
        backslashes removed, ready for ``json.loads``.
    """
    if 0 <= str1.find("{") < str1.rfind("}"):
        str1 = str1[str1.find("{"): str1.rfind("}") + 1]
    # BUGFIX: restore escaped newlines BEFORE stripping backslashes.  The
    # original did the "\\" removal first, which made the "\\n" replacement
    # dead code (no backslash could survive to that point).
    str2 = str1.replace("\\n", "\n")
    str2 = str2.replace("\\", "")
    return str2
26
+
27
+
28
def transfer(str1):
    """Convert a numeric answer string to a float, supporting a trailing pi.

    Examples: ``"2\u03c0"`` -> 2*pi, ``"\u03c0"`` -> pi, ``"1.5"`` -> 1.5.

    Args:
        str1: numeric string, optionally followed by the unicode pi symbol.

    Returns:
        The parsed float value.

    Raises:
        ValueError: if the coefficient is not a valid number.
    """
    if "\u03c0" in str1:
        coeff = str1.split("\u03c0")[0].strip()
        # Robustness fix: a bare "pi" (empty coefficient) means 1 * pi;
        # the original crashed on float("").
        return (float(coeff) if coeff else 1.0) * np.pi
    return float(str1)
35
+
36
+
37
def parse_answer(answer, answer_type="multiple choice"):
    """Normalize a raw short answer according to its expected type.

    Args:
        answer: raw short-answer string.
        answer_type: one of "float", "multiple choice", or anything else
            (treated as free-form).

    Returns:
        A ``(succeed, parsed)`` pair; ``parsed`` is ``None`` when parsing fails.
    """
    if answer_type == "float":
        if answer.isdigit():
            return True, float(answer)
        # Keep only the leading whitespace-delimited token, then try a
        # numeric conversion (which also understands a trailing pi symbol).
        head = answer.split(' ')[0]
        try:
            return True, transfer(head)
        except Exception:
            return False, None

    if answer_type == "multiple choice":
        upper = answer.upper()
        if len(answer) == 1:
            return True, upper
        # Accept only when exactly one option letter occurs in the text.
        hits = [ch for ch in 'ABCDE' if ch in upper]
        if len(hits) == 1:
            return True, hits[0]
        return False, None

    # Free-form answers are accepted verbatim.
    return True, answer
61
+
62
+
63
def DynaMath_auxeval(model, line):
    """Judge a single DynaMath prediction record.

    Args:
        model: judge LLM exposing ``generate(prompt) -> str``; only queried
            when the prediction cannot be parsed locally.
        line: record with keys ``prediction``, ``answer_type`` and ``answer``.

    Returns:
        ``dict(parse=bool, extracted=short_answer_or_None, correct=bool)``.
    """
    pred = line['prediction']
    pred = preprocess(pred)

    succeed, short_answer = None, None
    try:
        # Fast path: the model followed the requested JSON response format.
        dj = json.loads(pred, strict=False)
        short_answer = dj.get("short answer")
        assert short_answer is not None
        # BUGFIX: the key was misspelled 'anwser_type', which raised KeyError
        # on every record and forced everything through the fallback below.
        succeed, short_answer = parse_answer(short_answer, answer_type=line['answer_type'])
        assert succeed
    except Exception:
        # Failed to parse the JSON, use an auxiliary LLM to get the short answer
        if line['answer_type'] == 'multiple choice':
            # (typo fix: 'corresponing' -> 'corresponding' in the instruction)
            inst = "Output the corresponding choice option, such as 'A', 'B', 'C', 'D', in a single line."
        elif line['answer_type'] == 'float':
            inst = "Output a three-digit floating-point number in a single line."
        else:
            inst = (
                "Output a short answer in a single line. Any float numbers in the answer "
                "should be formatted as three-digit floating-point numbers."
            )

        prompt = f"Free-form answer: {pred}\nInstruction: {inst}"
        # Try to parse the raw prediction first to save a judge call.
        response = pred
        succeed, short_answer = parse_answer(response, line['answer_type'])
        if not succeed:
            response = model.generate(prompt)
            succeed, short_answer = parse_answer(response, line['answer_type'])

    if line['answer_type'] == 'float':
        if succeed:
            # Tolerance of 1e-3 matches the requested three-digit format.
            diff = float(short_answer) - float(line['answer'])
            correct = abs(diff) <= 0.001
            return dict(parse=True, extracted=short_answer, correct=correct)
        return dict(parse=False, extracted=None, correct=False)
    elif line['answer_type'] == 'multiple choice':
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer']))
        # Heuristic: give credit when the GT letter appears among the first
        # characters of the raw prediction (e.g. "A. because ...").
        if line['answer'] in pred[:3].upper():
            return dict(parse=False, extracted=None, correct=True)
        return dict(parse=False, extracted=None, correct=False)
    else:
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower()))
        # BUGFIX: the original evaluated short_answer.lower() here although
        # short_answer is None whenever parsing failed (AttributeError).
        return dict(parse=False, extracted=None, correct=False)
115
+
116
+
117
class Dynamath(ImageBaseDataset):
    """DynaMath VQA dataset: prompts for a JSON answer and scores with an LLM judge."""

    TYPE = 'VQA'
    # Remote TSV location and its expected checksum.
    DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'}
    DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'}
    # Instruction template appended to every question; {INST} is filled per answer type.
    GUIDE = """
## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \
to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \
detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST}

Example of expected JSON response format:

"""
    # Few-shot example of the expected JSON response, rendered once at class load.
    EXAMPLE = {
        "solution": "[Detailed step-by-step explanation]",
        "short answer": "[Concise Answer]"
    }
    TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4)

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Build the multi-modal message (images + instruction text) for one record.

        *line* may be an integer row index or a record; the instruction is
        specialized on the record's ``answer_type``.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        # meta_only datasets carry image paths instead of base64 payloads.
        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        prompt = f"## Question\n {line['question']}"
        if line['answer_type'] == 'multiple choice':
            # NOTE(review): 'corresponing' typo in this runtime prompt string;
            # left untouched here since prompt text affects model behavior.
            inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'."
        elif line['answer_type'] == 'float':
            inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key."
        else:
            inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers."

        prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions in *eval_file* with an LLM judge.

        Produces an ``_<judge>.xlsx`` file with per-record results, a ``.pkl``
        resume cache, and a ``_score.csv`` with Average and Worst-Case accuracy
        broken down by subject and knowledge level.  Returns the score frame.
        """
        judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        # Derived artifact paths share the eval_file stem plus the judge name.
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 6)  # noqa: F841

        # Resume from the cache, discarding failed (None) entries.
        res = load(tmp_file) if os.path.exists(tmp_file) else {}
        res = {k: v for k, v in res.items() if v is not None}

        model.system_prompt = """\
You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction.
"""
        if not osp.exists(storage):
            data = load(eval_file)
            lt = len(data)
            # Only judge records not already present in the resume cache.
            payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res]
            keys = [idx for idx in data['index'] if idx not in res]

            if len(keys):
                results = track_progress_rich(DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys)
                for k, r in zip(keys, results):
                    res[k] = r

            data['parse'] = [res[idx]['parse'] for idx in data['index']]
            data['extracted'] = [res[idx]['extracted'] for idx in data['index']]
            data['correct'] = [res[idx]['correct'] for idx in data['index']]
            dump(data, storage)

        data = load(storage)
        # Calculate Average Accuracy
        score_avg = {}
        score_avg['Overall'] = np.mean(data['correct'])

        subs = set(data['subject'])
        for sub in subs:
            data_sub = data[data['subject'] == sub]
            score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data['knowledge_level'])
        for lvl in lvls:
            data_lvl = data[data['knowledge_level'] == lvl]
            score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        # Calculate the Worst Case Accuracy
        # A question counts as correct only if ALL of its variants are correct;
        # varid == 1 rows represent the canonical variant of each question.
        score_worst = {}
        data_worst = data[data['varid'] == 1]
        # NOTE(review): this dict is keyed by data_worst['index'] but updated
        # below via item['qid'] — confirm that 'index' and 'qid' coincide for
        # varid == 1 rows; otherwise the update raises KeyError.
        qid2corr = {idx: True for idx in data_worst['index']}
        lt = len(data)
        for i in range(lt):
            item = data.iloc[i]
            # bool * bool acts as a logical AND accumulated over variants.
            qid2corr[item['qid']] *= item['correct']
        # NOTE(review): assigning into a boolean-indexed slice may trigger
        # pandas' SettingWithCopyWarning — verify intended.
        data_worst['correct'] = [qid2corr[idx] for idx in data_worst['qid']]
        score_worst['Overall'] = np.mean(data_worst['correct'])

        subs = set(data_worst['subject'])
        for sub in subs:
            data_sub = data_worst[data_worst['subject'] == sub]
            score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data_worst['knowledge_level'])
        for lvl in lvls:
            data_lvl = data_worst[data_worst['knowledge_level'] == lvl]
            score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        # Assemble the two settings into a single two-row score frame.
        d1 = {'Setting': 'Average'}
        d1.update(score_avg)
        d2 = {'Setting': 'Worst Case'}
        d2.update(score_worst)
        score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True)

        dump(score, score_file)
        return score
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from abc import abstractmethod
3
+ from ..smp import *
4
+
5
+
6
def img_root_map(dataset):
    """Return the image sub-directory name associated with *dataset*.

    Several dataset families share one image folder; anything unrecognized
    maps to its own name.
    """
    # Substring-triggered roots, checked in priority order.
    for needle, root in (('MM_NIAH', 'MMNIAH'), ('CRPE', 'CRPE'), ('OCRVQA', 'OCRVQA')):
        if needle in dataset:
            return root
    if dataset == 'COCO_VAL':
        return 'COCO'
    for needle in ('MMMU', 'QSpatial'):
        if needle in dataset:
            return needle

    # All MMBench variants share one of two image folders.
    mmbench_root_map = {
        'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
        'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
        'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
        'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
        'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
        'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
    }
    return mmbench_root_map.get(dataset, dataset)
31
+
32
+
33
class ImageBaseDataset:
    """Base class for image evaluation datasets backed by a remote TSV.

    Subclasses set DATASET_URL / DATASET_MD5 and typically override
    ``build_prompt`` and ``evaluate``.
    """

    MODALITY = 'IMAGE'
    # name -> TSV url / expected md5; filled in by subclasses.
    DATASET_URL = {}
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', skip_noimg=True):
        """Download/load the dataset TSV and normalize its image columns.

        Args:
            dataset: dataset name (key into DATASET_URL).
            skip_noimg: drop rows whose 'image' cell is NaN.
        """
        ROOT = LMUDataRoot()
        # You can override this variable to save image files to a different directory
        self.dataset_name = dataset
        self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))

        data = self.load_data(dataset)
        self.skip_noimg = skip_noimg
        if skip_noimg and 'image' in data:
            data = data[~pd.isna(data['image'])]

        # Indices are normalized to strings for map lookups below.
        data['index'] = [str(x) for x in data['index']]

        # meta_only means no inline image payloads, only image_path references.
        self.meta_only = True

        # The image field can store the base64 encoded image or another question index (for saving space)
        if 'image' in data:
            data['image'] = [str(x) for x in data['image']]
            image_map = {x: y for x, y in zip(data['index'], data['image'])}
            for k in image_map:
                # Short cells (<= 64 chars) are treated as references to
                # another row's index rather than base64 data.
                if len(image_map[k]) <= 64:
                    idx = image_map[k]
                    assert idx in image_map and len(image_map[idx]) > 64
                    image_map[k] = image_map[idx]

            images = [toliststr(image_map[k]) for k in data['index']]
            # Single-image rows hold a plain string; multi-image rows a list.
            data['image'] = [x[0] if len(x) == 1 else x for x in images]
            self.meta_only = False

        if 'image_path' in data:
            paths = [toliststr(x) for x in data['image_path']]
            data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]

        # Restore integer indices when every index is numeric.
        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record *idx* as a plain dict."""
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        """Download the TSV at *url* (if missing or checksum-stale) and load it.

        Files larger than 1 GB are converted once to a localized copy
        (images extracted to disk) and that copy is loaded instead.
        """
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            # Re-localize when forced via env var or after a fresh download.
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def dump_image(self, line):
        """Materialize the record's image(s) under ``self.img_root``.

        Decodes base64 payloads to files when needed; returns a list of
        file paths (or the raw image_path value for meta-only rows).
        """
        os.makedirs(self.img_root, exist_ok=True)

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    # Only decode when the file is absent or unreadable.
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        return tgt_path

    def display(self, line):
        """Render one record (by index or as a record) for inspection."""
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, pd.Series) or isinstance(line, dict)
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        url = self.DATASET_URL[dataset]
        file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
        return self.prepare_tsv(url, file_md5)

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        return msgs

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .image_base import ImageBaseDataset
2
+ from ..smp import *
3
+
4
+
5
class COCO_Caption_Scorer():
    """Thin wrapper over pycocoevalcap metrics (BLEU-4, ROUGE-L, CIDEr)."""

    def __init__(self, ref, gt):
        # Imported lazily so the module loads without pycocoevalcap installed.
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider

        self.ref = ref
        self.gt = gt
        print('setting up scorers...')
        # Each entry pairs a scorer with its metric name(s).
        self.scorers = [
            (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
            (Rouge(), 'ROUGE_L'),
            (Cider(), 'CIDEr'),
        ]

    def compute_scores(self):
        """Run every configured scorer and return a dict of percentage scores."""
        total_scores = {}
        for scorer, method in self.scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(self.gt, self.ref)
            if isinstance(method, list):
                # BLEU yields one corpus score per n-gram order.
                for metric_name, corpus_score in zip(method, score):
                    print('%s: %0.3f' % (metric_name, corpus_score * 100))
                total_scores['Bleu'] = [s * 100 for s in score]
            else:
                print('%s: %0.3f' % (method, score * 100))
                total_scores[method] = score * 100

        print('*****DONE*****')
        for key, value in total_scores.items():
            print('{}:{}'.format(key, value))
        return total_scores
37
+
38
+
39
class ImageCaptionDataset(ImageBaseDataset):
    """COCO-style image captioning dataset scored with BLEU / ROUGE-L / CIDEr."""

    TYPE = 'Caption'

    DATASET_URL = {
        'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
    }

    DATASET_MD5 = {
        'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
    }

    def load_data(self, dataset):
        """Load the TSV and inject a default captioning question when absent."""
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                'Please describe this image in general. Directly provide the description, '
                'do not include prefix like "This image depicts". '
            )] * len(data)
        return data

    # It returns a dictionary of scores
    @classmethod
    def evaluate(cls, eval_file, **kwargs):
        """Score the predictions in *eval_file* and dump a ``_score.json``.

        Args:
            eval_file: xlsx file with 'prediction' and 'answer' columns; the
                answer cell holds a stringified list of reference captions.

        Returns:
            dict of metric name -> percentage score.
        """
        # FIX: a @classmethod's first parameter was previously named 'self';
        # renamed to the conventional 'cls' (not part of the caller interface).
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        ref, gt = {}, {}
        for i, line in enumerate(lines):
            ref[str(i)] = [str(line['prediction'])]
            # NOTE(review): eval() executes arbitrary expressions from the
            # answer column — prefer ast.literal_eval if the TSV may be
            # untrusted.
            gt[str(i)] = eval(line['answer'])

        scorer = COCO_Caption_Scorer(ref, gt)
        coco_caption_score_dict = scorer.compute_scores()
        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(coco_caption_score_dict, score_pth)
        return coco_caption_score_dict
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import os
4
+ import re
5
+ import tempfile
6
+ from functools import partial
7
+ import pandas as pd
8
+
9
+ from .image_base import ImageBaseDataset
10
+ from ..smp import *
11
+
12
+ # should be the same as FAIL_MSG defined in vlmeval/inference.py
13
+ FAIL_MSG = 'Failed to obtain answer via API.'
14
+
15
+
16
+ class CCOCRDataset(ImageBaseDataset):
17
+ TYPE = 'VQA'
18
+ DATASET_URL_MODELSCOPE = {
19
+ "CCOCR_DocParsing_DocPhotoChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_photo_chn_75.tsv",
20
+ "CCOCR_DocParsing_DocPhotoEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_photo_eng_75.tsv",
21
+ "CCOCR_DocParsing_DocScanChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_scan_chn_75.tsv",
22
+ "CCOCR_DocParsing_DocScanEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_scan_eng_75.tsv",
23
+ "CCOCR_DocParsing_TablePhotoChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_photo_chn_75.tsv",
24
+ "CCOCR_DocParsing_TablePhotoEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_photo_eng_75.tsv",
25
+ "CCOCR_DocParsing_TableScanChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_scan_chn_75.tsv",
26
+ "CCOCR_DocParsing_TableScanEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_scan_eng_75.tsv",
27
+ "CCOCR_DocParsing_MolecularHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/molecular/molecular_handwriting_100.tsv",
28
+ "CCOCR_DocParsing_FormulaHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/formula/formula_handwriting_100.tsv",
29
+ "CCOCR_Kie_Sroie2019Word": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/sroie2019_word_347.tsv",
30
+ "CCOCR_Kie_Cord": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/CORD_100.tsv",
31
+ "CCOCR_Kie_EphoieScut": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/EPHOIE_SCUT_311.tsv",
32
+ "CCOCR_Kie_Poie": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/POIE_250.tsv",
33
+ "CCOCR_Kie_ColdSibr": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/open_category/COLD_SIBR_400.tsv",
34
+ "CCOCR_Kie_ColdCell": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/open_category/COLD_CELL_600.tsv",
35
+ "CCOCR_MultiLanOcr_Arabic": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Arabic/Arabic_150.tsv",
36
+ "CCOCR_MultiLanOcr_French": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/French/French_150.tsv",
37
+ "CCOCR_MultiLanOcr_German": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/German/German_150.tsv",
38
+ "CCOCR_MultiLanOcr_Italian": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Italian/Italian_150.tsv",
39
+ "CCOCR_MultiLanOcr_Japanese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Japanese/Japanese_150.tsv",
40
+ "CCOCR_MultiLanOcr_Korean": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Korean/Korean_150.tsv",
41
+ "CCOCR_MultiLanOcr_Portuguese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Portuguese/Portuguese_150.tsv",
42
+ "CCOCR_MultiLanOcr_Russian": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Russian/Russian_150.tsv",
43
+ "CCOCR_MultiLanOcr_Spanish": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Spanish/Spanish_150.tsv",
44
+ "CCOCR_MultiLanOcr_Vietnamese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Vietnamese/Vietnamese_150.tsv",
45
+ "CCOCR_MultiSceneOcr_Cord": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/CORD_100.tsv",
46
+ "CCOCR_MultiSceneOcr_Funsd": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/FUNSD_50.tsv",
47
+ "CCOCR_MultiSceneOcr_Iam": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/IAM_50.tsv",
48
+ "CCOCR_MultiSceneOcr_ZhDoc": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/zh_doc_100.tsv",
49
+ "CCOCR_MultiSceneOcr_ZhHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/zh_handwriting_50.tsv",
50
+ "CCOCR_MultiSceneOcr_Hieragent": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/Hieragent_100.tsv",
51
+ "CCOCR_MultiSceneOcr_Ic15": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/IC15_500.tsv",
52
+ "CCOCR_MultiSceneOcr_Inversetext": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/InverseText_500.tsv",
53
+ "CCOCR_MultiSceneOcr_Totaltext": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/TotalText_300.tsv",
54
+ "CCOCR_MultiSceneOcr_ZhScene": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/zh_scene_450.tsv",
55
+ "CCOCR_MultiSceneOcr_UgcLaion": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/ugc_laion_400.tsv",
56
+ "CCOCR_MultiSceneOcr_ZhDense": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/zh_dense_50.tsv",
57
+ "CCOCR_MultiSceneOcr_ZhVertical": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/zh_vertical_100.tsv"
58
+ }
59
+
60
+ DATASET_URL_HUGGINGFACE = {
61
+ "CCOCR_DocParsing_DocPhotoChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_photo_chn_75.tsv",
62
+ "CCOCR_DocParsing_DocPhotoEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_photo_eng_75.tsv",
63
+ "CCOCR_DocParsing_DocScanChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_scan_chn_75.tsv",
64
+ "CCOCR_DocParsing_DocScanEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_scan_eng_75.tsv",
65
+ "CCOCR_DocParsing_TablePhotoChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_photo_chn_75.tsv",
66
+ "CCOCR_DocParsing_TablePhotoEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_photo_eng_75.tsv",
67
+ "CCOCR_DocParsing_TableScanChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_scan_chn_75.tsv",
68
+ "CCOCR_DocParsing_TableScanEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_scan_eng_75.tsv",
69
+ "CCOCR_DocParsing_MolecularHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/molecular/molecular_handwriting_100.tsv",
70
+ "CCOCR_DocParsing_FormulaHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/formula/formula_handwriting_100.tsv",
71
+ "CCOCR_Kie_Sroie2019Word": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/sroie2019_word_347.tsv",
72
+ "CCOCR_Kie_Cord": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/CORD_100.tsv",
73
+ "CCOCR_Kie_EphoieScut": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/EPHOIE_SCUT_311.tsv",
74
+ "CCOCR_Kie_Poie": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/POIE_250.tsv",
75
+ "CCOCR_Kie_ColdSibr": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/open_category/COLD_SIBR_400.tsv",
76
+ "CCOCR_Kie_ColdCell": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/open_category/COLD_CELL_600.tsv",
77
+ "CCOCR_MultiLanOcr_Arabic": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Arabic/Arabic_150.tsv",
78
+ "CCOCR_MultiLanOcr_French": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/French/French_150.tsv",
79
+ "CCOCR_MultiLanOcr_German": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/German/German_150.tsv",
80
+ "CCOCR_MultiLanOcr_Italian": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Italian/Italian_150.tsv",
81
+ "CCOCR_MultiLanOcr_Japanese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Japanese/Japanese_150.tsv",
82
+ "CCOCR_MultiLanOcr_Korean": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Korean/Korean_150.tsv",
83
+ "CCOCR_MultiLanOcr_Portuguese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Portuguese/Portuguese_150.tsv",
84
+ "CCOCR_MultiLanOcr_Russian": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Russian/Russian_150.tsv",
85
+ "CCOCR_MultiLanOcr_Spanish": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Spanish/Spanish_150.tsv",
86
+ "CCOCR_MultiLanOcr_Vietnamese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Vietnamese/Vietnamese_150.tsv",
87
+ "CCOCR_MultiSceneOcr_Cord": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/CORD_100.tsv",
88
+ "CCOCR_MultiSceneOcr_Funsd": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/FUNSD_50.tsv",
89
+ "CCOCR_MultiSceneOcr_Iam": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/IAM_50.tsv",
90
+ "CCOCR_MultiSceneOcr_ZhDoc": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/zh_doc_100.tsv",
91
+ "CCOCR_MultiSceneOcr_ZhHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/zh_handwriting_50.tsv",
92
+ "CCOCR_MultiSceneOcr_Hieragent": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/Hieragent_100.tsv",
93
+ "CCOCR_MultiSceneOcr_Ic15": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/IC15_500.tsv",
94
+ "CCOCR_MultiSceneOcr_Inversetext": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/InverseText_500.tsv",
95
+ "CCOCR_MultiSceneOcr_Totaltext": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/TotalText_300.tsv",
96
+ "CCOCR_MultiSceneOcr_ZhScene": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/zh_scene_450.tsv",
97
+ "CCOCR_MultiSceneOcr_UgcLaion": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/ugc_laion_400.tsv",
98
+ "CCOCR_MultiSceneOcr_ZhDense": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/zh_dense_50.tsv",
99
+ "CCOCR_MultiSceneOcr_ZhVertical": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/zh_vertical_100.tsv"
100
+ }
101
+
102
    # define data path
    # NOTE(review): despite the _MODELSCOPE suffix, the table assigned here is
    # whichever mirror dict is selected above — confirm against the full file.
    DATASET_URL = DATASET_URL_MODELSCOPE
    # MD5 checksums used to validate each downloaded TSV split, keyed by the
    # same names as DATASET_URL.
    DATASET_MD5 = {
        "CCOCR_DocParsing_DocPhotoChn": "9039dcbb31830d413261a95cfa29d97f",
        "CCOCR_DocParsing_DocPhotoEng": "2ca0824881e1d7317626f2a19d902989",
        "CCOCR_DocParsing_DocScanChn": "9e265c8aa760ebdf5c3bf9e892d55492",
        "CCOCR_DocParsing_DocScanEng": "77d04637be3def86dbc2ce37ba64a704",
        "CCOCR_DocParsing_TablePhotoChn": "c4dc85252ddad2b43a03a67b1d1ae983",
        "CCOCR_DocParsing_TablePhotoEng": "02ab75d6169da0cd2ece9ce0ae14a479",
        "CCOCR_DocParsing_TableScanChn": "f1f79959fdd01127df7377c9d46722f2",
        "CCOCR_DocParsing_TableScanEng": "794903c7acf52bfe956eefba2166d14b",
        "CCOCR_DocParsing_MolecularHandwriting": "30b7f7679b713ce000a939eca7b4078f",
        "CCOCR_DocParsing_FormulaHandwriting": "e03047776ce5e79a61ae1c057e2a348e",
        "CCOCR_Kie_Sroie2019Word": "3287d99a8e86a99b74171fa5a70f9acb",
        "CCOCR_Kie_Cord": "ab297cadcbc7158884a301c366f3330a",
        "CCOCR_Kie_EphoieScut": "bb8fa3ba7ea91cbf17be0904956ad3f3",
        "CCOCR_Kie_Poie": "882b64317989ecbfed6518051cdffb14",
        "CCOCR_Kie_ColdSibr": "109d5dad8b7081fb6a2f088e963196d4",
        "CCOCR_Kie_ColdCell": "7b44c45b4d7d768d1dbdc08872fe7d3a",
        "CCOCR_MultiLanOcr_Arabic": "e9a3f2bb9298d0b882ebc7a98980c3f3",
        "CCOCR_MultiLanOcr_French": "729407ed2036c22e602eff645eddd40c",
        "CCOCR_MultiLanOcr_German": "96fc2edae747f0ec95b0a6f9bf723022",
        "CCOCR_MultiLanOcr_Italian": "29a508fa5d5a5e767497dd69e2430ebb",
        "CCOCR_MultiLanOcr_Japanese": "bbcca96ccf25fff63597c2ab4f3ebb1f",
        "CCOCR_MultiLanOcr_Korean": "0f55dbd24eba5edc189c91e124411641",
        "CCOCR_MultiLanOcr_Portuguese": "a6fcf8831775a61aa631c0cf1c422ae7",
        "CCOCR_MultiLanOcr_Russian": "19d2f84062a1699d3e9333912bd6b303",
        "CCOCR_MultiLanOcr_Spanish": "f5a0cfa9f2ae4115c91c7b362034e591",
        "CCOCR_MultiLanOcr_Vietnamese": "bf1cd4e83d91767f4906f81550cec8b9",
        "CCOCR_MultiSceneOcr_Cord": "92943f0ccb4c5a196c574222e76759a0",
        "CCOCR_MultiSceneOcr_Funsd": "229cc38d193edd00f4383610e98ee873",
        "CCOCR_MultiSceneOcr_Iam": "d897a6d6c3880c65e752ec11b211204c",
        "CCOCR_MultiSceneOcr_ZhDoc": "303682cc16c8bb51b2b896f8ceb8bd38",
        "CCOCR_MultiSceneOcr_ZhHandwriting": "faa298d366bc05e5cfb39e334afb8eff",
        "CCOCR_MultiSceneOcr_Hieragent": "6f132cdd0473d7cc145c3e3a08957dd6",
        "CCOCR_MultiSceneOcr_Ic15": "3d94869f312a41d53d0578a06a2fb1f2",
        "CCOCR_MultiSceneOcr_Inversetext": "e141d424a0c4cf9579064428a270f13d",
        "CCOCR_MultiSceneOcr_Totaltext": "ca1daf81d49eeb57ef844b72a23c2e62",
        "CCOCR_MultiSceneOcr_ZhScene": "9295152a66e6f117db8bfbb20a9013e6",
        "CCOCR_MultiSceneOcr_UgcLaion": "8e9ea1fbf9d56532157e807eabf39b21",
        "CCOCR_MultiSceneOcr_ZhDense": "de8f48ee0c8a2cf8ed7f2b3a81e6322d",
        "CCOCR_MultiSceneOcr_ZhVertical": "4892b4aec6e7fd11e39aaea23712709b"
    }
145
+
146
+ # It returns a DataFrame
147
+ def evaluate(self, eval_file, **judge_kwargs):
148
+ """
149
+ """
150
+ df = load(eval_file)
151
+ dict_list = df.to_dict(orient='records')
152
+
153
+ required_colume_list = ['answer', 'prediction', "category", "image_name", "l2-category", "split"]
154
+ for required_colume in required_colume_list:
155
+ assert required_colume in df, "required_colume: {} NOT found".format(required_colume)
156
+
157
+ gt_info, ptd_info = {}, {}
158
+ for data_info in dict_list:
159
+ image_name = data_info['image_name']
160
+ gt_info[image_name] = data_info['answer']
161
+
162
+ # warning the FAIL samples
163
+ if data_info['prediction'] != FAIL_MSG:
164
+ ptd_info[image_name] = data_info['prediction']
165
+
166
+ # assert eval_file is a single dataset
167
+ group_name = set([str(x) for x in df['category']]).pop()
168
+ op_name = set([str(x) for x in df['l2-category']]).pop()
169
+ data_name = set([str(x) for x in df['split']]).pop()
170
+
171
+ data_info = {"op": op_name, "group": group_name, "dataset": data_name, "num": len(gt_info)}
172
+ try:
173
+ from .utils.ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map
174
+ except ImportError as err:
175
+ import warnings
176
+ warnings.warn('The dependency of CCOCR evaluator is not properly installed')
177
+ warnings.warn(f'{type(err)}: {err}')
178
+ eval_func = ccocr_evaluator_map.get(group_name, None)
179
+ if eval_func is None:
180
+ raise ValueError("error: evaluator not defined for: {}".format(group_name))
181
+ meta_info, eval_info = eval_func(ptd_info, gt_info, **data_info)
182
+
183
+ output_info = {"meta": meta_info, "evaluation": eval_info, "config": data_info}
184
+ result_file = os.path.splitext(os.path.abspath(eval_file))[0] + "_eval.json"
185
+ dump(output_info, result_file)
186
+
187
+ # update global status for summary
188
+ # warning: the evaluate function should NOT run in parallel
189
+ all_status_info = {}
190
+ global_status_path = os.path.join(os.path.dirname(eval_file), "status.json")
191
+ if os.path.exists(global_status_path):
192
+ with open(global_status_path, "r") as f:
193
+ all_status_info = json.load(f)
194
+ all_status_info[data_name] = output_info
195
+ with open(global_status_path, "w") as f:
196
+ json.dump(all_status_info, f, ensure_ascii=False, indent=4)
197
+ return eval_info.get("summary")
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py ADDED
@@ -0,0 +1,904 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ from .image_base import ImageBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+ from ..smp import *
6
+ import pandas as pd
7
+
8
# TSV download URLs for the multilingual MMMB benchmark (Parrot dataset),
# one file per language (ar / cn / en / pt / ru / tr).
MMMB_URLS = {
    'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv',
    'MMMB_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv',
    'MMMB_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv',
    'MMMB_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv',
    'MMMB_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv',
    'MMMB_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv',
}

# TSV download URLs for the multilingual (MTL) MMBench dev splits.
MTL_MMBench_URLS = {
    'MMBench_dev_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv',
    'MMBench_dev_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv',
    'MMBench_dev_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv',
    'MMBench_dev_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv',
    'MMBench_dev_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv',
    'MMBench_dev_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv',
}

# MD5 checksums for the tables above, used to validate downloads.
MMMB_MD5 = {
    'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead', 'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
    'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01', 'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
    'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984', 'MMMB_tr': '0733739d43090327975294292bc5cd67'
}

MTL_MMBench_MD5 = {
    'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4', 'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
    'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2', 'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
    'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11', 'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5'
}
37
+
38
+
39
class ImageMCQDataset(ImageBaseDataset):
    """Generic multiple-choice (MCQ) image QA dataset.

    Bundles download URLs / MD5 checksums for a family of MCQ benchmarks,
    builds letter-option prompts, and scores prediction files with either
    exact matching or an LLM judge.
    """

    TYPE = 'MCQ'

    # Download URL per supported benchmark split (TSV files).
    DATASET_URL = {
        # MMBench v1.0
        'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN.tsv',
        'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN.tsv',
        'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN.tsv',
        'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN.tsv',
        'MMBench': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench.tsv',  # Internal
        'MMBench_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN.tsv',  # Internal
        # MMBench v1.1
        'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN_V11.tsv',
        'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN_V11.tsv',
        'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN_V11.tsv',
        'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN_V11.tsv',
        'MMBench_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_V11.tsv',  # Internal
        'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN_V11.tsv',  # Internal
        # SEEDBench Series
        'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench_IMG.tsv',
        'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
        'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench2_Plus.tsv',
        # ScienceQA Series
        'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv',
        'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv',
        # MMT-Bench
        'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL_MI.tsv',
        'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL.tsv',
        'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL_MI.tsv',
        'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL.tsv',
        # AesBench
        'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
        'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
        # Q-Bench1
        'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
        'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
        # A-Bench
        'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
        'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
        # R-Bench
        'R-Bench-Dis': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-dis.tsv',
        'R-Bench-Ref': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-ref.tsv',
        # Other Benchmarks
        'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
        'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
        'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
        'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
        'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
        'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
        'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
        'TaskMeAnything_v1_imageqa_random': (
            'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
            'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
        ),
        'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv',
        'WorldMedQA-V': 'https://opencompass.openxlab.space/utils/VLMEval/WorldMedQA-V.tsv',
        'VisOnlyQA-VLMEvalKit': (
            'https://huggingface.co/datasets/ryokamoi/VisOnlyQA_Eval_Real/'
            'resolve/main/visonlyqa_vlmevalkit.tsv'
        ),
        '3DSRBench': (
            'https://huggingface.co/datasets/ccvl/3DSRBench/'
            'resolve/main/3dsrbench_v1_vlmevalkit_circular.tsv'
        ),
    }

    # MD5 checksums used to validate the downloaded TSVs above.
    DATASET_MD5 = {
        # MMBench v1.0
        'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
        'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
        'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
        'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
        'MMBench': '4115aea3383f3dd0083be6a633e0f820',  # Internal Only
        'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee',  # Internal Only
        # MMBench v1.1
        'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
        'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
        'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
        'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
        'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c',  # Internal Only
        'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25',  # Internal Only
        # SEEDBench
        'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
        'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
        'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
        # ScienceQA
        'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
        'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
        # MMT-Bench
        'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
        'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
        'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
        'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
        # AesBench
        'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
        'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
        # Q-Bench1
        'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
        'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
        # A-Bench
        'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
        'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
        # R-Bench
        'R-Bench-Dis': 'd6e961dbfc43350688af2560226830b4',
        'R-Bench-Ref': '270c1cb555acb523f3fdb178ed57021d',
        # Other Benchmarks
        'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
        'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
        'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
        'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
        'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7',
        'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
        'BLINK': '3b6649b6a662184ea046908e5506260e',
        'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
        'WorldMedQA-V': '441e63875e30c87f5750528b57b41285',
        "VisOnlyQA-VLMEvalKit": 'cf460a31d2acb8d3a7cecd0e69298bfa',
        '3DSRBench': '13a99f33164dc1b9faf0e8b8b01fd6f2',
    }

    # Merge in the multilingual MMMB / MTL-MMBench tables defined at module level.
    DATASET_URL.update(MMMB_URLS)
    DATASET_URL.update(MTL_MMBench_URLS)
    DATASET_MD5.update(MMMB_MD5)
    DATASET_MD5.update(MTL_MMBench_MD5)

    def build_prompt(self, line):
        """Build the MCQ prompt for one sample.

        Args:
            line: either a row of ``self.data`` or an integer index into it.

        Returns:
            A message list of ``dict(type=..., value=...)`` entries: one or
            more image entries followed by a single text entry.
        """
        # Allow callers to pass a row index instead of a row.
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']
        # Collect the present option columns A, B, C, ... (NaN means absent).
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above. \n'

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score a prediction file and return an accuracy DataFrame.

        Uses circular evaluation for MMBench / CCBench, vanilla evaluation
        otherwise; answers are matched exactly or extracted via an LLM judge
        (``judge_kwargs['model']``) when one is configured and reachable.
        """
        from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
        # assert dataset is not None
        # TEST splits share the answer set of their internal counterparts.
        dataset_map = {
            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
            'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
        }
        dataset = self.dataset_name
        if dataset in dataset_map:
            dataset = dataset_map[dataset]
        nproc = judge_kwargs.pop('nproc', 4)

        # MMBench / CCBench use circular evaluation (option order permuted).
        circular = False
        if listinstr(['mmbench', 'ccbench'], dataset.lower()):
            data = load(eval_file)
            data['index'] = [int(x) for x in data['index']]
            dump(data, eval_file)
            circular = True

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Fall back to exact matching when no working judge is available.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated index must exist in the dataset meta.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        if circular:
            data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
        else:
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

        # load split
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        # May have different report acc functions for different datasets
        if 'MMT' in dataset:
            acc = report_acc_MMT(data)
        else:
            acc = report_acc(data)

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(acc, score_file)

        if dataset == 'AesBench_VAL':
            warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
                          please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
                          larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
        if dataset == 'VisOnlyQA-VLMEvalKit':
            warnings.warn('Note that the results on VisOnlyQA-VLMEvalKit are different from the results on \
                          the original VisOnlyQA. VisOnlyQA-VLMEvalKit does not include the \
                          chemistry__shape_multi split and uses a different evaluation prompt. Please \
                          explicitly specify the version of the dataset when you report results.')

        return acc
284
+
285
+
286
class MMMUDataset(ImageMCQDataset):
    """MMMU benchmark: MCQ prompts with inline ``<image n>`` placeholders.

    Questions reference their images from inside the text via ``<image 1>``,
    ``<image 2>`` ... markers; ``split_MMMU`` re-interleaves the image
    messages at the marked positions.
    """

    DATASET_URL = {
        'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
        'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
    }

    DATASET_MD5 = {
        'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64',
        'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
    }

    @staticmethod
    def split_MMMU(msgs):
        """Split an [images..., text] message list on ``<image n>`` markers.

        Returns ``msgs`` unchanged when the text contains no marker;
        otherwise returns text and image segments interleaved in marker
        order (markers carry 1-based indices into the image list).
        """
        text, images = None, []
        for s in msgs:
            if s['type'] == 'image':
                images.append(s['value'])
            elif s['type'] == 'text':
                assert text is None  # expect exactly one text message
                text = s['value']
        text_segs = text.split('<image ')
        if len(text_segs) == 1:
            return msgs

        segs = [dict(type='text', value=text_segs[0])]
        for i, seg in enumerate(text_segs):
            if i == 0:
                continue
            # BUGFIX: the original parsed only a single digit (seg[0]) and
            # asserted seg[1] == '>', which breaks on markers like
            # '<image 10>'. Parse every digit up to the closing '>' instead.
            close = seg.find('>')
            assert close > 0 and seg[:close].isdigit(), f'malformed image tag: <image {seg}'
            image_idx = int(seg[:close]) - 1
            segs.append(dict(type='image', value=images[image_idx]))
            segs.append(dict(type='text', value=seg[close + 1:]))
        return segs

    def build_prompt(self, line):
        """Build the base MCQ prompt, then interleave images at their markers."""
        msgs = super().build_prompt(line)
        msgs = self.split_MMMU(msgs)
        return msgs
325
+
326
+
327
class MUIRDataset(ImageMCQDataset):
    """MUIRBench: multi-image MCQ benchmark with inline ``<image>`` placeholders."""

    DATASET_URL = {
        'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
    }

    DATASET_MD5 = {
        'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
    }

    @staticmethod
    def split_MUIR(msgs):
        """Interleave the image messages into the text at each ``<image>`` tag.

        Expects at most one text message; the k-th ``<image>`` placeholder is
        replaced by the k-th image message. Empty text pieces are dropped and
        surplus placeholders (beyond the number of images) are ignored.
        """
        prompt_text = None
        image_paths = []
        for item in msgs:
            if item['type'] == 'image':
                image_paths.append(item['value'])
            elif item['type'] == 'text':
                assert prompt_text is None  # at most one text entry expected
                prompt_text = item['value']

        pieces = prompt_text.split('<image>')

        interleaved = []
        for idx, piece in enumerate(pieces):
            # Every piece after the first was preceded by a placeholder;
            # emit the matching image first (if one remains).
            if idx > 0 and idx - 1 < len(image_paths):
                interleaved.append(dict(type='image', value=image_paths[idx - 1]))
            if piece:
                interleaved.append(dict(type='text', value=piece))

        return interleaved

    def build_prompt(self, line):
        """Build an interleaved image/text MCQ prompt for one sample."""
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = toliststr(line['image_path']) if self.meta_only else self.dump_image(line)

        question = line['question']
        choices = {
            letter: line[letter]
            for letter in string.ascii_uppercase
            if letter in line and not pd.isna(line[letter])
        }
        options_prompt = '\n'.join(f'{letter}. {content}' for letter, content in choices.items())

        prompt = f'{question}\n'
        if choices:
            prompt += options_prompt
            prompt += "\nAnswer with the option's letter from the given choices directly."

        if isinstance(tgt_path, list):
            msgs = [dict(type='image', value=p) for p in tgt_path]
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return self.split_MUIR(msgs)
403
+
404
+
405
class GMAIMMBenchDataset(ImageMCQDataset):
    """GMAI-MMBench medical MCQ benchmark.

    The TEST split is sharded into 11 TSV parts that are downloaded,
    MD5-checked, localized and concatenated on load; results are additionally
    reported per clinical grouping column.
    """

    DATASET_URL = {
        'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv',
        'GMAI_mm_bench_TEST_part_1': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_1.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_2': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_2.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_3': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_3.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_4': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_4.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_5': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_5.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_6': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_6.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_7': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_7.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_8': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_8.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_9': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_9.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_10': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_10.tsv',  # noqa: E501
        'GMAI_mm_bench_TEST_part_11': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_11.tsv',  # noqa: E501
    }

    DATASET_MD5 = {
        'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324',
        'GMAI_mm_bench_TEST_part_1': '900d735231230a63f4ed45665c078ef4',
        'GMAI_mm_bench_TEST_part_2': '1b27ab621386945d7e4a765ad2d22b0e',
        'GMAI_mm_bench_TEST_part_3': '44bdc2b6267dd505d529b8cad06f0fb2',
        'GMAI_mm_bench_TEST_part_4': '5a04a04fcac9f1466709f242fdb80acb',
        'GMAI_mm_bench_TEST_part_5': 'c70baf8909eda9af0ddeab275c721336',
        'GMAI_mm_bench_TEST_part_6': '825abc39596b644dead9350d0cfa3b96',
        'GMAI_mm_bench_TEST_part_7': 'defb8aed2fb77365a76b6b9abd6a2701',
        'GMAI_mm_bench_TEST_part_8': 'ff490d60b85f2bb0abb67a435b298c65',
        'GMAI_mm_bench_TEST_part_9': 'ff67c86f40da93b09139ac1d1ba5dc6b',
        'GMAI_mm_bench_TEST_part_10': '3dae94627b9ac0fe00180d4780fbf6dc',
        'GMAI_mm_bench_TEST_part_11': 'd08dc813f0eb6bbab63cae2a9d113c4b',
    }

    @classmethod
    def supported_datasets(cls):
        # The sharded TEST parts are an implementation detail; only the two
        # logical dataset names are exposed.
        return ['GMAI-MMBench_VAL', 'GMAI-MMBench_TEST']

    def load_data(self, dataset):
        """Load the VAL table, or download/merge the 11 TEST shards."""
        if dataset == 'GMAI-MMBench_VAL':
            data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
            # Localize (extract embedded images) when the TSV is very large.
            if file_size(data_path, 'GB') > 1:
                local_path = data_path.replace('.tsv', '_local.tsv')
                if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
                    from ..tools import LOCALIZE
                    LOCALIZE(data_path, local_path)
                data_path = local_path
            return load(data_path)
        elif dataset == 'GMAI-MMBench_TEST':
            dfs = []
            for part_num in range(1, 12):
                part_name = f'GMAI_mm_bench_TEST_part_{part_num}'
                url = self.DATASET_URL[part_name]
                file_md5 = self.DATASET_MD5.get(part_name)
                tsv_path = osp.join(LMUDataRoot(), f'{part_name}.tsv')
                # (Re-)download when missing or when the checksum mismatches.
                if not osp.exists(tsv_path) or (file_md5 and md5(tsv_path) != file_md5):
                    download_file(url, filename=tsv_path)
                local_path = tsv_path.replace('.tsv', '_local.tsv')
                if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
                    from ..tools import LOCALIZE
                    LOCALIZE(tsv_path, local_path)
                tsv_path = local_path
                # load this shard
                df = load(tsv_path)
                dfs.append(df)
            # merge all shards into one table
            data = pd.concat(dfs, ignore_index=True)
            return data
        else:
            raise ValueError(f"未知的数据集:{dataset}")

    def report_acc_by_groups(self, df, group_column):
        """Report per-split accuracy broken down by ``group_column``.

        NOTE(review): mutates ``df`` in place (adds a 'split' column) when the
        input has none.
        """
        res = defaultdict(list)

        # Check for the 'split' column
        if 'split' in df:
            splits = list(set(df['split']))
            res['split'] = splits
        else:
            df['split'] = ['none'] * len(df)
            res['split'] = ['none']

        res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]

        if group_column not in df:
            raise ValueError(f"Column '{group_column}' not found in dataframe.")  # noqa: E713

        abilities = list(set(df[group_column]))
        # NaN group labels are reported under the literal name 'None'.
        abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
        abilities.sort()

        for ab in abilities:
            ab_name = ab
            sub_df = df[df[group_column] == ab]
            res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]

        return pd.DataFrame(res)

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions (vanilla MCQ eval) and dump per-group accuracy files."""
        from .utils.multiple_choice import report_acc, mcq_vanilla_eval
        nproc = judge_kwargs.pop('nproc', 4)

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Fall back to exact matching when no working judge is available.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated index must exist in the dataset meta.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

        # load split
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        acc = report_acc(data)

        # Also dump one accuracy CSV per clinical grouping dimension.
        for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
            acc_grouped = self.report_acc_by_groups(data, group_col)
            score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
            dump(acc_grouped, score_file_grouped)

        return acc
554
+
555
+
556
class MMERealWorld(ImageMCQDataset):
    """MME-RealWorld multiple-choice benchmark (EN / CN / Lite variants).

    Data is distributed as base64-embedded TSVs; the full variants are
    reconstructed locally from per-split JSON files downloaded via
    huggingface_hub, while the Lite variant ships as a single remote TSV.
    """

    TYPE = 'MMERealWorld'

    # Expected MD5 of each locally generated/downloaded TSV, used to decide
    # whether the cached copy can be trusted.
    DATASET_MD5 = {
        'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
        'MME-RealWorld-Lite': '4c17057d7d3b6c4a0d4397c3dae0881c',
        'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
    }
    # Per-variant answer instruction appended after the choice list.
    SYS = {
        'MME-RealWorld': (
            'Select the best answer to the above multiple-choice question based on the image. '
            'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
            'The best answer is:'
        ),
        'MME-RealWorld-Lite': (
            'Select the best answer to the above multiple-choice question based on the image. '
            'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
            'The best answer is:'
        ),
        'MME-RealWorld-CN': (
            '根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n'
            '最佳答案为:'
        ),
    }

    @classmethod
    def supported_datasets(cls):
        """Names accepted by load_data / the CLI."""
        return ['MME-RealWorld', 'MME-RealWorld-CN', 'MME-RealWorld-Lite',]

    def load_data(
        self, dataset="MME-RealWorld", repo_id="yifanzhang114/MME-RealWorld-Base64"
    ):
        """Download (if needed), build, and load the TSV for `dataset`."""

        def check_integrity(pth):
            # True iff the cached TSV exists and matches the expected MD5.
            data_file = osp.join(pth, f"{dataset}.tsv")

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.DATASET_MD5[dataset]:
                return False
            return True

        def generate_tsv(pth):
            # Build `<pth>/<dataset>.tsv` from the per-split JSON files under
            # `<pth>/<dataset>/`; no-op if the TSV already exists.
            tsv_file = os.path.join(pth, f"{dataset}.tsv")

            if os.path.exists(tsv_file):
                print(f"{tsv_file} already exists.")
                return

            json_dir = os.path.join(pth, dataset)
            json_files = [f for f in os.listdir(json_dir) if f.endswith(".json")]

            data_list = []
            for json_file in json_files:
                with open(os.path.join(json_dir, json_file), "r") as f:
                    data = json.load(f)
                for item in tqdm(data):
                    choice_prompt = (
                        "The choices are listed below:\n"
                        if dataset in ["MME-RealWorld", "MME-RealWorld-Lite"]
                        else "选项如下所示:\n"
                    )
                    # [4:] drops a 4-char option-label prefix from each choice
                    # string (presumably "(A) " etc. — TODO confirm format).
                    data_list.append(
                        {
                            "index": item["index"],
                            "image": item["image"],
                            "question": item["question"],
                            "multi-choice options": choice_prompt
                            + "\n".join(item["multi-choice options"]),
                            "A": item["multi-choice options"][0][4:],
                            "B": item["multi-choice options"][1][4:],
                            "C": item["multi-choice options"][2][4:],
                            "D": item["multi-choice options"][3][4:],
                            "E": item["multi-choice options"][4][4:],
                            "answer": item["answer"],
                            "category": item["category"],
                            "l2-category": item["l2-category"],
                        }
                    )
            df = pd.DataFrame(data_list)
            df.to_csv(tsv_file, sep="\t", index=False)
            print(f"TSV file saved to {tsv_file}")

        # Check if dataset is cached and has integrity
        if dataset == "MME-RealWorld-Lite":
            # Lite ships as one remote TSV; fetch it and expand the packed
            # option list into per-letter columns.
            url = 'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv' # noqa: E501
            file_md5 = (
                self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
            )
            datas = self.prepare_tsv(url, file_md5)
            choice_prompt = "The choices are listed below:\n"
            for index, item in datas.iterrows():
                # NOTE(review): eval() of TSV content — assumes the dataset
                # TSV comes from the trusted upstream source.
                options = eval(item["multi-choice options"])
                datas.loc[index, "multi-choice options"] = choice_prompt + "\n".join(
                    options
                )
                datas.loc[index, "A"] = options[0][4:]
                datas.loc[index, "B"] = options[1][4:]
                datas.loc[index, "C"] = options[2][4:]
                datas.loc[index, "D"] = options[3][4:]
                datas.loc[index, "E"] = options[4][4:]
            return datas

        update_flag = False
        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
            print(f"Using cached dataset from {cache_path}")
        else:
            from huggingface_hub import snapshot_download

            # Download or find the dataset path
            dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
            generate_tsv(dataset_path)
            update_flag = True

        data_path = os.path.join(dataset_path, f"{dataset}.tsv")
        # TSVs over 1 GB are converted once into a "_local" copy (images
        # localized) to speed up subsequent loads; forced on fresh downloads.
        if file_size(data_path, "GB") > 1:
            local_path = data_path.replace(".tsv", "_local.tsv")
            if (
                not osp.exists(local_path)
                or os.environ.get("FORCE_LOCAL", None)
                or update_flag
            ):
                from vlmeval.tools import LOCALIZE

                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def post_build(self, dataset):
        # Force the bench type after the generic build steps.
        self.TYPE = 'MMERealWorld'

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Compose image message(s) + question + choices + variant-specific
        answer instruction (self.SYS) for one record."""
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']

        choice_prompt = line['multi-choice options'] + '\n'
        question += ' ' + choice_prompt + self.SYS[self.dataset_name]

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        return msgs

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions by regex letter extraction; unparseable answers
        count as 0. Dumps a per-sample score xlsx and a rating json.

        NOTE(review): decorated @classmethod but the first parameter is
        named `self`, so it actually receives the class object.
        """
        from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
        FAIL_MSG = 'Failed to obtain answer via API.'
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):

            # NOTE(review): `res` is loaded/filtered but never used below —
            # looks like dead resume code carried over from another evaluator.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            cnt_rejected = 0
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]

                extract_pred = extract_characters_regex(pred)
                if extract_pred == '':
                    # No recognizable option letter in the prediction.
                    cnt_rejected += 1
                    data.loc[data['index'] == idx, 'score'] = 0
                else:
                    data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {cnt_rejected} questions. '
                f'Those questions will be counted as 0 score in ALL rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
755
+
756
+
757
class HRBenchDataset(ImageMCQDataset):
    """HR-Bench: multiple-choice benchmark on high-resolution (4K / 8K) images.

    Evaluation runs the vanilla MCQ pipeline, optionally backed by a GPT
    judge for answer extraction, and reports accuracy via
    `report_acc_hrbench`.
    """

    DATASET_URL = {
        'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
        'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
    }

    DATASET_MD5 = {
        'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
        'HRBench8K': '274c9c7f89329b804a4723178a00219c',
    }

    def evaluate(self, eval_file, **judge_kwargs):
        """Score the predictions in `eval_file` and return an accuracy DataFrame.

        judge_kwargs may carry `model` (one of 'exact_matching',
        'chatgpt-0125', 'gpt-4-0125') and `nproc`. Without a usable OpenAI
        key the judge degrades to exact matching with a warning. A cached
        `*_acc.csv` is returned as-is when present.
        """
        assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
        from .utils.multiple_choice import mcq_vanilla_eval
        from .utils.hrbench import report_acc_hrbench
        nproc = judge_kwargs.pop('nproc', 4)

        suffix = eval_file.split('.')[-1]
        # BUG FIX: the default was misspelled 'extract_matching', which made
        # the assert below fail whenever the caller did not pass `model`
        # explicitly. The intended default is 'exact_matching'.
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        # After this branch, `model` is either a judge object or None
        # (None => exact matching inside mcq_vanilla_eval).
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # Lower-case all column names except single-letter option columns (A-Z).
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated index must exist in the source dataset.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        # Reuse a previously computed accuracy file if present.
        if osp.exists(score_file):
            acc = load(score_file)
            return acc
        data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        acc = report_acc_hrbench(data)
        dump(acc, score_file)

        return acc
825
+
826
+
827
class CustomMCQDataset(ImageMCQDataset):
    """User-supplied MCQ dataset read from a TSV under the LMU data root."""

    def load_data(self, dataset):
        """Load `<LMUDataRoot>/<dataset>.tsv`; for files over 1 GB, build and
        use a localized `_local.tsv` copy (rebuilt when FORCE_LOCAL is set)."""
        tsv_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        if file_size(tsv_path, 'GB') > 1:
            localized = tsv_path.replace('.tsv', '_local.tsv')
            needs_localize = os.environ.get('FORCE_LOCAL', None) or not osp.exists(localized)
            if needs_localize:
                from ..tools import LOCALIZE
                LOCALIZE(tsv_path, localized)
            tsv_path = localized
        return load(tsv_path)
839
+
840
+
841
class NaturalBenchDataset(ImageMCQDataset):
    """NaturalBench: adversarial natural-image VQA, evaluated in groups of
    four predictions (2 questions x 2 images) per sample."""

    DATASET_URL = {
        'NaturalBenchDataset': (
            'https://huggingface.co/datasets/BaiqiL/'
            'NaturalBench/resolve/main/NaturalBenchDataset.tsv'
        ),
    }
    DATASET_MD5 = {
        'NaturalBenchDataset': 'dbe25b044bc35696426381e9ba4fe930',
    }

    def build_prompt(self, line):
        """Build image message(s) plus the question with a type-specific
        answer-format suffix (yes/no or option letter)."""
        SUFFIX_FOR_VQA = {
            "yes_no": "Please answer Yes or No.",
            "multiple_choice": "Please output the letter corresponding to the correct option."
        }
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        prompt = f'{line["question"]} {SUFFIX_FOR_VQA[line["type"]]}'
        if isinstance(tgt_path, list):
            msgs = [dict(type='image', value=p) for p in tgt_path]
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Group every four consecutive predictions into one NaturalBench
        sample, extract answers, and return the aggregate score dict."""
        from .utils.naturalbench import extract_answer, get_scores

        data = load(eval_file).sort_values(by='index')
        preds = [str(x) for x in data['prediction']]
        golds = [str(x) for x in data['answer']]
        idx_strs = [str(x) for x in data['index']]
        qtypes = [str(x) for x in self.data['type']]
        # 1900 samples x 4 (question, image) combinations each.
        assert len(preds) == len(golds) == len(idx_strs) == len(qtypes) == (1900 * 4)

        results = {}
        for g in range(len(preds) // 4):
            base = g * 4
            results[g] = {
                "q0_i0": extract_answer(preds[base], qtypes[base]),
                "q0_i1": extract_answer(preds[base + 1], qtypes[base + 1]),
                "q1_i0": extract_answer(preds[base + 2], qtypes[base + 2]),
                "q1_i1": extract_answer(preds[base + 3], qtypes[base + 3])
            }

        scores = get_scores(results)
        print(scores)
        score_df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])
        dump(score_df, 'NaturalBench_acc.csv')

        return scores
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_mt.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .image_base import ImageBaseDataset
2
+ from .utils.judge_util import build_judge
3
+ from ..smp import *
4
+ from ..utils import track_progress_rich
5
+
6
+
7
class ImageMTDataset(ImageBaseDataset):
    """Base class for multi-turn image-dialogue datasets.

    Records carry parallel lists of questions and answers; questions may
    embed '<ImageHere>' placeholders that are replaced by the record's
    images in order.
    """

    TYPE = 'MT'

    def build_prompt(self, line):
        """Build the multi-turn dialogue for one record.

        Returns a list of dicts alternating role='user' / role='assistant';
        each 'content' is a list of {'type': 'text'|'image', 'value': ...}
        segments. Images are consumed left-to-right across all turns.
        Raises AssertionError if question/answer counts differ or an answer
        contains an image placeholder.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        questions = toliststr(line['question'])
        if 'answer' in line:
            answers = toliststr(line['answer'])
        else:
            answers = [''] * len(questions)
        assert len(questions) == len(answers)

        dlgs, pics_number = [], 0
        for q, a in zip(questions, answers):
            if '<ImageHere>' in q:
                content = []
                tag_number = q.count('<ImageHere>')
                images = tgt_path[pics_number: pics_number + tag_number]
                pics_number += tag_number
                q_split = q.split('<ImageHere>')
                # BUG FIX: this inner loop previously reused the outer loop
                # variable `i`, shadowing it; use a dedicated index instead.
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
                    if qsp != '':
                        content.append(dict(type='text', value=qsp))
                    content.append(dict(type='image', value=im))
                if q_split[-1] != '':
                    content.append(dict(type='text', value=q_split[-1]))
            else:
                content = [dict(type='text', value=q)]
            dlgs.append(dict(role='user', content=content))
            assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
            content = [dict(type='text', value=a)]
            dlgs.append(dict(role='assistant', content=content))
        return dlgs
50
+
51
+
52
class MMDUDataset(ImageMTDataset):
    """MMDU multi-turn dialogue benchmark, scored per dimension by a GPT judge."""

    DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
    DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
    # Judge-scored dimensions; each rating is an integer on a 0-10 scale.
    DIMS = [
        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
    ]

    def calculat_metric(self, ans):
        """Aggregate per-sample judge ratings into 'all' / 'valid' averages.

        `ans` maps sample index -> {'res': DataFrame}, one rating row per
        turn. Row 'all' divides the score sum by every rating slot; row
        'valid' only by slots whose rating parsed as an int. Scores are
        clipped to [0, 10] then rescaled by x10 (i.e. to a 0-100 range).
        """
        # BUG FIX: the accumulator was named `all` (shadowing the builtin)
        # and the inner loop reused the outer loop variable `k`.
        score_sum = defaultdict(lambda: 0)
        tot = defaultdict(lambda: 0)
        valid = defaultdict(lambda: 0)
        for key in ans:
            res = ans[key]['res']
            assert isinstance(res, pd.DataFrame)
            lt = len(res)
            for i in range(lt):
                line = res.iloc[i]
                for dim in self.DIMS:
                    tot[dim] += 1
                    if dim in line and line[dim] is not None:
                        try:
                            score = int(line[dim])
                            score = np.clip(score, 0, 10)
                            score_sum[dim] += score
                            valid[dim] += 1
                        except Exception as e:
                            # Non-numeric judge output: counted in `tot` only.
                            print(f'Failed to parse the score: {str(e)}')
        sp1 = {'set': 'all'}
        sp1.update({k: score_sum[k] / tot[k] * 10 for k in self.DIMS})
        sp2 = {'set': 'valid'}
        sp2.update({k: score_sum[k] / valid[k] * 10 for k in self.DIMS})

        return pd.DataFrame([sp1, sp2])

    def evaluate(self, eval_file, **judge_kwargs):
        """Judge every dialogue with `mmdu_score` (resumable via a pkl
        cache), aggregate with calculat_metric, and dump the score CSV."""
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']

        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)

        data = load(eval_file)
        model = judge_kwargs.pop('model', 'gpt-4o')
        judge_model = build_judge(model=model, **judge_kwargs)

        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        tups = [(judge_model, line) for line in lines]
        indices = [line['index'] for line in lines]

        ans = {}
        if osp.exists(tmp_file):
            ans = load(tmp_file)

        # Skip samples already judged in a previous (resumed) run.
        tups = [x for x, i in zip(tups, indices) if i not in ans]
        indices = [i for i in indices if i not in ans]

        from .utils.mmdu import mmdu_score

        if len(indices):
            new_results = track_progress_rich(
                mmdu_score,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=tmp_file,)
            ans = load(tmp_file)
            for k, v in zip(indices, new_results):
                assert k in ans

        metric = self.calculat_metric(ans)
        dump(metric, score_file)
        return metric
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py ADDED
@@ -0,0 +1,1475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+ from functools import partial
5
+
6
+ import pandas as pd
7
+
8
+ from .image_base import ImageBaseDataset
9
+ from .utils import build_judge, DEBUG_MESSAGE
10
+ from ..smp import *
11
+ from ..utils import track_progress_rich
12
+
13
+
14
class ImageVQADataset(ImageBaseDataset):
    """Generic open-ended VQA datasets (OCRVQA / TextVQA / DocVQA / InfoVQA /
    ChartQA / GQA), scored with heuristic per-dataset matching metrics."""

    TYPE = 'VQA'

    DATASET_URL = {
        'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
        'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
        'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
        'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
        'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
        'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
        'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
        'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
        'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
    }

    DATASET_MD5 = {
        'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
        'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
        'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
        'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
        'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
        'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
        'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
        'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
        'GQA_TestDev_Balanced': '99b62f22e224d9b2f32dcbe41359d1c9',
    }

    def build_prompt(self, line):
        """Append the short-answer instruction to the base image+question prompt."""
        msgs = super().build_prompt(line)
        assert msgs[-1]['type'] == 'text'
        msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
        return msgs

    # It returns a DataFrame
    def evaluate(self, eval_file, **judge_kwargs):
        """Score `eval_file`; returns a one-row DataFrame with 'Overall' plus
        optional per-split / per-category accuracies, all in percent."""
        from .utils.vqa_eval import hit_calculate, process_line

        data = load(eval_file)
        dataset = self.dataset_name
        assert 'answer' in data and 'prediction' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        # BUG FIX: run the pool as a context manager so worker processes are
        # always cleaned up (the pool was previously never closed).
        with mp.Pool(16) as pool:
            # Pick the matching metric appropriate for each benchmark family.
            if listinstr(['TextVQA'], dataset):
                res = pool.map(partial(process_line, method='vqa_score'), lines)
            elif listinstr(['ChartQA'], dataset):
                res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
            elif listinstr(['OCRVQA', 'GQA'], dataset):
                res = pool.map(partial(process_line, method='accuracy'), lines)
            elif listinstr(['DocVQA', 'InfoVQA'], dataset):
                res = pool.map(partial(process_line, method='anls'), lines)
            else:  # default using vqa_score to calculate score
                res = pool.map(process_line, lines)
        hit = hit_calculate(res, dataset)
        ret = dict()
        if 'split' in data:
            splits = set(data['split'])
            for sp in splits:
                sub = [r for l, r in zip(lines, res) if l['split'] == sp]
                hit = hit_calculate(sub, dataset)
                ret[sp] = np.mean(hit) * 100
            sub = [r for l, r in zip(lines, res)]
            hit = hit_calculate(sub, dataset)
            ret['Overall'] = np.mean(hit) * 100
        else:
            ret['Overall'] = np.mean(hit) * 100
            if 'category' in data:
                cates = list(set(data['category']))
                cates.sort()
                for c in cates:
                    sub = [r for l, r in zip(lines, res) if l['category'] == c]
                    hit = hit_calculate(sub, dataset)
                    ret[c] = np.mean(hit) * 100
        ret = d2df(ret)
        # BUG FIX: DataFrame.round is not in-place; the rounded result was
        # previously discarded.
        ret = ret.round(2)

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(ret, result_file)
        return ret
98
+
99
+
100
class VizWiz(ImageBaseDataset):
    """VizWiz VQA: questions asked by blind users about their own photos."""
    TYPE = 'VQA'
    DATASET_URL = {
        'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'
    }
    DATASET_MD5 = {
        'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'
    }

    # NOTE(review): decorated @classmethod, so the first parameter (named
    # `self`) actually receives the class object.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions with the VQA metric and return the accuracy
        DataFrame (read back from the cached `*_acc.csv`)."""
        from .utils.vqa_eval import hit_calculate, process_line

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            assert 'answers' in data and 'prediction' in data
            data['prediction'] = [str(x) for x in data['prediction']]
            data['answer'] = [str(x) for x in data['answers']]

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            # BUG FIX: run the pool as a context manager so worker processes
            # are always cleaned up (the pool was previously never closed).
            with mp.Pool(16) as pool:
                res = pool.map(process_line, lines)

            hit = hit_calculate(res, 'VizWiz')
            ret = dict()

            ret['Overall'] = np.mean(hit) * 100
            ret = d2df(ret)
            # BUG FIX: DataFrame.round is not in-place; the rounded result
            # was previously discarded.
            ret = ret.round(2)

            dump(ret, result_file)

        retz = pd.read_csv(result_file)
        return retz
138
+
139
+
140
class OCRBench(ImageBaseDataset):
    """OCRBench: 10-category OCR benchmark scored by substring matching."""
    TYPE = 'VQA'
    DATASET_URL = {
        'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
    }
    DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Count one point per question whose ground-truth answer occurs as a
        substring of the prediction, then aggregate into the reported buckets.

        NOTE(review): decorated @classmethod, so the first parameter (named
        `self`) actually receives the class object.
        """
        # Per-category hit counters.
        OCRBench_score = {
            'Regular Text Recognition': 0,
            'Irregular Text Recognition': 0,
            'Artistic Text Recognition': 0,
            'Handwriting Recognition': 0,
            'Digit String Recognition': 0,
            'Non-Semantic Text Recognition': 0,
            'Scene Text-centric VQA': 0,
            'Doc-oriented VQA': 0,
            'Key Information Extraction': 0,
            'Handwritten Mathematical Expression Recognition': 0,
        }

        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = str(line['prediction'])
            # NOTE(review): eval() on the stored answer list — assumes the
            # TSV comes from the trusted dataset source, not user input.
            answers = eval(line['answer'])
            category = line['category']
            if category == 'Handwritten Mathematical Expression Recognition':
                # For HMER, compare with all whitespace removed.
                for j in range(len(answers)):
                    answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
                    predict = predict.strip().replace('\n', ' ').replace(' ', '')
                    if answer in predict:
                        OCRBench_score[category] += 1
                        break
            else:
                # Otherwise compare case-insensitively with newlines flattened.
                for j in range(len(answers)):
                    answer = answers[j].lower().strip().replace('\n', ' ')
                    predict = predict.lower().strip().replace('\n', ' ')
                    if answer in predict:
                        OCRBench_score[category] += 1
                        break

        # Aggregate the ten raw categories into the reported buckets.
        final_score_dict = {}
        final_score_dict['Text Recognition'] = \
            (OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
             + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
             + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
        final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
        final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
        final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
        final_score_dict['Handwritten Mathematical Expression Recognition'] = \
            (OCRBench_score['Handwritten Mathematical Expression Recognition'])
        final_score_dict['Final Score'] = \
            (final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
             + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
             + final_score_dict['Handwritten Mathematical Expression Recognition'])
        # /10 rescales the raw hit count — presumably 1000 total questions,
        # giving a 0-100 normalized score; TODO confirm against the dataset.
        final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
204
+
205
+
206
class MathVista(ImageBaseDataset):
    """MathVista-MINI: mathematical reasoning in visual contexts, judged by
    an LLM with a resumable per-sample cache."""
    TYPE = 'VQA'
    DATASET_URL = {
        'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
    }
    DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge each prediction with `MathVista_auxeval`, caching per-sample
        results in a pkl for resumption, then compute accuracy.

        Requires a working OPENAI-compatible judge (judge_kwargs['model']).
        NOTE(review): decorated @classmethod, so `self` receives the class.
        """
        from .utils.mathvista import MathVista_auxeval, MathVista_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')  # judged predictions
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')  # resumable per-sample cache
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already judged in a previous run.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVista_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Consistency check: the saved cache must agree with what the
                # workers just returned.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MathVista_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
261
+
262
+
263
class MathVerse(ImageBaseDataset):
    """MathVerse-MINI: math problems rendered with varying text/vision mixes.

    Evaluation is LLM-judged in two cached stages:
      1. *extract* — pull the candidate answer out of the raw prediction;
      2. *score*  — grade the extracted answer against the ground truth.
    Each stage checkpoints per-sample results to a ``.pkl`` file so an
    interrupted run resumes without re-querying the judge.
    """

    TYPE = 'VQA'
    # Download URL per MathVerse-MINI variant (TSV format).
    DATASET_URL = {
        'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
        'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa
        'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa
        'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa
        'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa
        'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa
    }
    # MD5 checksums for validating the downloaded TSV files.
    DATASET_MD5 = {
        'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
        'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
        'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
        'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
        'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
        'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
    }

    # It returns a DataFrame
    # NOTE(review): decorated @classmethod but the first parameter is named
    # `self` (receives the class object) — confirm before renaming.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Two-stage judged evaluation of `eval_file`; returns the score DataFrame."""
        from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        # Per-stage result (.xlsx) and resume-checkpoint (.pkl) paths.
        storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
        tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
        storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        # stage1: extract the answer
        if not osp.exists(storage_extract):
            data = load(eval_file)
            # Rebind `model` from the name string to an actual judge client.
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already present in a previous checkpoint.
            ans = {}
            if osp.exists(tmp_file_extract):
                ans = load(tmp_file_extract)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_extract,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_extract,
                )
                ans = load(tmp_file_extract)
                # Sanity check: checkpoint must agree with freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']

            data['extract'] = [ans[idx]['extract'] for idx in data['index']]
            data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
            dump(data, storage_extract)

        # stage2: score the answer
        if not osp.exists(storage_score):
            data = load(storage_extract)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume logic mirrors stage 1.
            ans = {}
            if osp.exists(tmp_file_score):
                ans = load(tmp_file_score)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_score,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_score,
                )
                ans = load(tmp_file_score)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']

            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
            dump(data, storage_score)

        # Aggregate the per-sample scores into the final accuracy table.
        score = MathVerse_acc(storage_score)
        score_pth = storage_score.replace('.xlsx', '.csv')
        dump(score, score_pth)
        return score
366
+
367
+
368
class MathVision(ImageBaseDataset):
    """MATH-Vision: competition-style math problems with figures.

    Predictions are graded by an LLM judge (``MATH_V_auxeval``); per-sample
    results are checkpointed to a ``.pkl`` so interrupted runs resume.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
        'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
    }
    DATASET_MD5 = {
        'MathVision': '93f6de14f7916e598aa1b7165589831e',
        'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
    }

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge-grade `eval_file` and return the accuracy DataFrame.

        The judge model name comes from ``judge_kwargs['model']`` or, as a
        fallback, from the ``LOCAL_LLM`` environment variable.
        """
        from .utils.mathv import MATH_V_auxeval, MATH_V_acc

        if 'model' in judge_kwargs:
            model = judge_kwargs['model']
        else:
            # BUGFIX: os.path.basename(None) raised an opaque TypeError when
            # LOCAL_LLM was unset; fail with an actionable message instead.
            local_llm = os.environ.get('LOCAL_LLM')
            if local_llm is None:
                raise ValueError(
                    'MATH-Vision evaluation needs judge_kwargs["model"] or the '
                    'LOCAL_LLM environment variable to identify the judge model')
            model = os.path.basename(local_llm)
        suffix = eval_file.split('.')[-1]
        # Judge results (.xlsx) and resume checkpoint (.pkl) live next to eval_file.
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already graded in a previous partial run.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MATH_V_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Checkpoint must agree with the freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MATH_V_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
430
+
431
+
432
class OlympiadBench(ImageBaseDataset):
    """OlympiadBench: Olympiad-level math/physics problems (EN + CN sources).

    Grading is rule-based via ``MathJudger`` (no LLM judge). The score CSV
    reports per-source accuracy, language x subject grains, per-subject
    grains, and the overall average.
    """

    TYPE = 'VQA_ex_prompt'
    DATASET_URL = {
        'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
        'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
        'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
    }
    DATASET_MD5 = {
        'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
        'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
        'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
    }

    def dump_image(self, line):
        """Decode the base64 image(s) of one record to disk; return path list."""
        os.makedirs(self.img_root, exist_ok=True)

        tgt_path_z = []
        if isinstance(line['image'], list):
            # Multi-image sample: one file per image with a 1-based suffix.
            for i in range(len(line['image'])):
                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'][i], tgt_path)
                tgt_path_z.append(tgt_path)
        else:
            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
            if not read_ok(tgt_path):
                decode_base64_to_image_file(line['image'], tgt_path)
            tgt_path_z.append(tgt_path)
        return tgt_path_z

    def build_prompt(self, line):
        """Build the language/subject-specific instruction, then append images."""
        from .utils.olympiadbench import get_answer_type_text, make_input

        # Problem facets are encoded in the `source` field (e.g. 'OE_MM_maths_zh_CEE').
        self.is_chinese = 'zh' in line['source']
        self.is_math = 'maths' in line['source']
        self.is_theorem_proving = 'TP' in line['source']

        if self.is_chinese:
            subject_content = '数学' if self.is_math else '物理'
            if self.is_theorem_proving:
                prompt = (
                    f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
                    "证明过程中使用的变量和公式请使用LaTeX格式表示。"
                )
            else:
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
                                                        multiple_answer=line['is_multiple_answer'])
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
                else:
                    multiple_answer_text = '\\boxed{答案}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(单位)'
                    unit_text = ',注意答案的单位不要放在\\boxed{}中'
                prompt = (
                    f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
                    f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
                    f'显式给出结果{unit_text}。'
                )
        else:
            subject_content = 'Math' if self.is_math else 'Physics'
            if self.is_theorem_proving:
                prompt = (
                    f'The following is a theorem proving problem from an International {subject_content} competition. '
                    'Please use logical reasoning and common theorems to prove the proposition in the problem '
                    'according to the given requirements. '
                    'Please use LaTeX format to represent the variables and formulas used in the proof.'
                )
            else:
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{multiple answers connected with commas}'
                else:
                    multiple_answer_text = '\\boxed{answer}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(unit)'
                    unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
                                                        multiple_answer=line['is_multiple_answer'])
                prompt = (
                    f'The following is an open-ended problem from an International {subject_content} competition. '
                    f'{answer_type_text}Please calculate the answer according to the given requirements and '
                    'the information provided. Please use LaTeX format to represent the variables and formulas '
                    'used in the solution process and results. Please end your solution with "So the final answer '
                    f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
                )

        # Physics problems may carry extra context; math problems never do.
        if self.is_math:
            input = make_input(prompt, line['question'])
        else:
            if 'context' in line.keys() and str(line['context']) != 'nan':  # cannot be null
                input = make_input(prompt, line['context'] + '\n' + line['question'])
            else:
                input = make_input(prompt, line['question'])

        ret = [dict(type='text', value=input)]
        tgt_path = self.dump_image(line)

        ret.extend([dict(type='image', value=s) for s in tgt_path])

        return ret

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rule-judge predictions and return the accuracy table (DataFrame).

        Accuracies are computed at three granularities (per source, per
        language x subject, per subject) plus an overall average.
        """
        from .utils.olympiadbench import MathJudger, extract_answer
        judger = MathJudger()

        suffix = eval_file.split('.')[-1]
        name_str1 = 'judge'
        name_str2 = 'score'
        result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            scorez = []

            for i in tqdm(data.iterrows()):
                line = i[1]
                model_answer = line['prediction']
                is_chinese = 'zh' in line['source']
                model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
                answer_type = line['answer_type']

                # Strip the surrounding '[' ... ']' / quote characters of the GT.
                final_answer = line['final_answer'][2:-2]

                if str(answer_type) != 'nan' and 'Tuple' in answer_type:
                    judge_result = judger.judge(model_answer, final_answer)
                else:
                    # `error` carries the allowed numeric tolerance(s), if any.
                    if str(line['error']) != 'nan':
                        if ',' in line['error']:
                            precisions = line['error'].split(',')
                            precisions = [float(p) if p else 1e-8 for p in precisions]
                            judge_result = judger.judge(model_answer, final_answer, precisions)
                        else:
                            precision = float(line['error'])
                            judge_result = judger.judge(model_answer, final_answer, precision)
                    else:
                        judge_result = judger.judge(model_answer, final_answer)
                scorez.append(judge_result)

            data['score'] = scorez
            dump(data, result_file)

        judge_file = load(result_file)

        if not osp.exists(score_file):
            name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
                         'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
                         'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']

            sample_list = [[] for _ in range(len(name_list))]
            for i in judge_file.iterrows():
                line = i[1]
                for j in range(len(name_list)):
                    if line['source'] == name_list[j]:
                        sample_list[j].append(line['score'])

            acc_dict = {}
            correct_list = []

            # fine-grained (per source)
            for i in range(len(name_list)):
                correct_num = 0
                for j in sample_list[i]:
                    if j:
                        correct_num += 1
                correct_list.append(correct_num)
                # BUGFIX: guard against ZeroDivisionError when a source
                # category has no samples in the evaluated subset.
                acc = 100 * correct_num / len(sample_list[i]) if sample_list[i] else 0.0
                acc_dict[name_list[i]] = [acc]

            # 4 grained (language x subject)
            labela = ['zh', 'en']
            labelb = ['maths', 'physics']

            grain_list = [[x,y] for x in labela for y in labelb]
            for j in grain_list:
                dict_name = j[0] + "_" + j[1]
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if all(k in name_list[i] for k in j):
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                # BUGFIX: empty grain bucket must not divide by zero.
                acc = 100 * correct_num / full_num if full_num else 0.0
                acc_dict[dict_name] = [acc]

            # 2 grained (subject only)
            grain_list = ['maths', 'physics']
            for j in grain_list:
                dict_name = j
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if j in name_list[i]:
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                acc = 100 * correct_num / full_num if full_num else 0.0
                acc_dict[dict_name] = [acc]

            # AVG over all judged samples
            correct_num = sum(correct_list)
            acc = 100 * correct_num / len(judge_file) if len(judge_file) else 0.0
            acc_dict['AVG'] = [acc]

            acc_pd = pd.DataFrame(acc_dict)
            # gbk encoding keeps the CSV readable on zh-CN Windows tooling.
            acc_pd.to_csv(score_file, index=False, encoding='gbk')

        accdz = pd.read_csv(score_file)
        return accdz
644
+
645
+
646
class WeMath(ImageBaseDataset):
    """We-Math benchmark: multi-step visual math reasoning (MCQ-style).

    Choice extraction uses exact matching or an optional GPT judge; scores
    combine overall accuracy with We-Math's four-dimensional metrics.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'WeMath': 'https://opencompass.openxlab.space/utils/VLMEval/WeMath.tsv'
    }
    DATASET_MD5 = {'WeMath': '056142c89b09d864702450b5b5ea0913'}

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate `eval_file`; returns a DataFrame of combined scores."""
        from .utils.wemath import wemath_evaluate_models, wemath_accuracy
        from .utils.multiple_choice import mcq_vanilla_eval

        # Fall back to exact matching when no judge model is requested.
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
        name_str = name_str_map[model] if model in name_str_map else model

        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage) and model is not None:
            result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

            # BUGFIX: the original loaded eval_file twice back to back;
            # a single load is sufficient.
            data = load(eval_file)
            data = data.sort_values(by='index')
            data['prediction'] = [str(x) for x in data['prediction']]
            # If not choice label, then use lower case
            for k in data.keys():
                data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

            # The eval file must reference questions from this dataset.
            meta = self.data
            meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
            data_map = {x: y for x, y in zip(data['index'], data['question'])}
            for k in data_map:
                assert k in meta_q_map, (
                    f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
                )
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

            if 'id' in data.columns:
                # Rename 'id' -> 'ID' as expected by the WeMath scorers.
                data.rename(columns={'id': 'ID'}, inplace=True)
            dump(data, storage)

        # Score either the judged file (if produced) or the raw predictions.
        if osp.exists(storage):
            accuracy_scores = wemath_evaluate_models(storage)
            four_dim_scores = wemath_accuracy(storage)
        else:
            accuracy_scores = wemath_evaluate_models(eval_file)
            four_dim_scores = wemath_accuracy(eval_file)
        combine_score = {**accuracy_scores, **four_dim_scores}
        combine_score = pd.DataFrame(combine_score)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(combine_score, score_pth)
        return combine_score
714
+
715
+
716
class LogicVista(ImageBaseDataset):
    """LogicVista: visual logical-reasoning benchmark, judged by an LLM
    (falls back to exact matching when no judge/API is available)."""

    TYPE = 'VQA'
    DATASET_URL = {
        'LogicVista': 'https://opencompass.openxlab.space/utils/VLMEval/LogicVista.tsv'
    }
    DATASET_MD5 = {'LogicVista': '41c5d33adf33765c399e0e6ae588c061'}

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate `eval_file` with an LLM judge; returns the score DataFrame."""
        from .utils.logicvista import LogicVista_auxeval, evaluate_logicvista

        # Fall back to exact matching when no judge model is requested.
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
        name_str = name_str_map[model] if model in name_str_map else model

        # Degrade gracefully to exact matching when the API is unusable.
        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        suffix = eval_file.split('.')[-1]
        # Judge results (.xlsx) and resume checkpoint (.pkl).
        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{name_str}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage) and model is not None:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('LogicVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already judged in a previous checkpoint.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    LogicVista_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Checkpoint must agree with the freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] and ans[k]['hit'] == v['hit']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            data['hit'] = [ans[idx]['hit'] for idx in data['index']]

            dump(data, storage)
        # NOTE(review): when model is None (exact matching), `storage` is never
        # written, so this branch is skipped and nothing is returned — confirm
        # whether exact-matching scoring is handled upstream.
        if osp.exists(storage):
            accuracy_scores = evaluate_logicvista(storage)
            score_pth = storage.replace('.xlsx', '_score.csv')
            dump(accuracy_scores, score_pth)

            return accuracy_scores
789
+
790
class LLaVABench(ImageBaseDataset):
    """LLaVA-Bench (in-the-wild): open-ended VQA scored pairwise by a GPT judge."""

    TYPE = 'VQA'
    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
    DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.llavabench import (
            build_prompt,
            LLaVABench_atomeval,
            LLaVABench_score,
        )

        ext = '.' + eval_file.split('.')[-1]
        record_file = eval_file.replace(ext, '_openai_result' + ext)
        score_file = eval_file.replace(ext, '_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)
        system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'

        # Query the judge only once; subsequent calls reuse the cached record file.
        if not osp.exists(record_file):
            data = load(eval_file)
            judge = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
            assert judge.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            rows = [data.iloc[idx] for idx in range(len(data))]
            jobs = [(judge, build_prompt(row)) for row in rows]
            pair_scores = track_progress_rich(LLaVABench_atomeval, jobs, nproc=nproc, chunksize=nproc)
            # Each result is (reference GPT-4 score, candidate score).
            data['gpt4_score'] = [pair[0] for pair in pair_scores]
            data['score'] = [pair[1] for pair in pair_scores]
            dump(data, record_file)

        ret = LLaVABench_score(load(record_file)).round(1)
        dump(ret, score_file)
        return ret
827
+
828
+
829
class MMVet(ImageBaseDataset):
    """MM-Vet: integrated-capability open-ended VQA, graded by a GPT judge."""

    TYPE = 'VQA'
    DATASET_URL = {
        'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv',
        'MMVet_Hard': 'http://opencompass.openxlab.space/utils/VLMEval/MMVet_Hard.tsv'
    }
    DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3', 'MMVet_Hard': '63a598819a936a2e77c410a78a21ff16'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge-grade `eval_file`; returns the overall score DataFrame."""
        from .utils.mmvet import MMVet_auxeval, MMVet_acc

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']
        # Judge results (.xlsx) and resume checkpoint (.pkl).
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        if not osp.exists(storage):
            data = load(eval_file)
            # max_tokens=3: the judge only needs to emit a short numeric score.
            model = build_judge(max_tokens=3, **judge_kwargs)
            assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: drop samples already graded in a previous checkpoint.
            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MMVet_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                # Checkpoint must agree with the freshly returned results.
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        # Overall score plus the fine-grained per-capability breakdown.
        score, score_fine = MMVet_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
        dump(score, score_pth)
        dump(score_fine, score_fine_pth)
        return score
884
+
885
+
886
class MTVQADataset(ImageBaseDataset):
    """MTVQA: multilingual text-centric VQA scored by GT-substring matching."""

    TYPE = 'VQA'
    DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
    DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data and 'category' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        if 'split' in data:
            assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '

        # One hit list per category; a hit means the normalized GT answer
        # appears as a substring of the normalized prediction.
        category_scores = defaultdict(list)
        for row_id in range(len(data)):
            row = data.iloc[row_id]
            gt = row['answer'].strip().lower().replace('.', '')
            pred = row['prediction'].strip().lower().replace('.', '')
            hit = 1.0 if gt in pred else 0.0
            category_scores[row['category']].append(hit)
            category_scores['Average'].append(hit)
        # Per-category mean, normalized to [0, 100].
        category_averages = {cate: np.mean(hits) * 100 for cate, hits in category_scores.items()}

        ext = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{ext}', '_acc.json')
        dump(category_averages, result_file)

        return category_averages

    # MT-VQA adopts a custom prompt
    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        for item in msgs:
            if item['type'] == 'text':
                item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
        return msgs
926
+
927
+
928
class TableVQABench(ImageBaseDataset):
    """TableVQA-Bench: table-understanding VQA over four sub-splits, each
    scored by its own accuracy evaluator."""

    TYPE = 'VQA'
    DATASET_URL = {
        'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
    }
    DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}

    from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        import pandas as pd
        from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq

        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data

        # Strip a leading 'Answer: ' prefix that some models emit.
        data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
        split_groups = dict(tuple(data.groupby('split')))
        eval_result = {'split': [], 'average_scores': []}
        for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
            records = split_groups[split].to_dict(orient='records')
            if split == 'fintabnetqa':
                split_meta = evaluate_fintabnet(records, ['accuracy'])
            elif split == 'vtabfact':
                split_meta = evaluate_tabfact(records, ['accuracy'])
            else:  # 'vwtq' and 'vwtq_syn' share the WTQ evaluator
                split_meta = evaluate_wtq(records, ['accuracy'])
            eval_result['split'].append(split)
            eval_result['average_scores'].append(split_meta['average_scores'])

        ext = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{ext}', '_acc.csv')
        eval_result = pd.DataFrame(eval_result)
        dump(eval_result, result_file)

        return eval_result

    # TableVQABench adopts a custom prompt
    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        # Each split wraps the question in its own instruction template.
        template_map = {
            'fintabnetqa': self.FINTABNETQA_PROMPT,
            'vtabfact': self.VTABFACT_PROMPT,
            'vwtq': self.VWTQ_PROMPT,
            'vwtq_syn': self.VWTQ_PROMPT,
        }
        template = template_map.get(line['split'])
        for item in msgs:
            if item['type'] == 'text' and template is not None:
                item['value'] = template.format_map({'question': item['value']})
        return msgs
980
+
981
+
982
class CustomVQADataset(ImageBaseDataset):
    """User-supplied VQA dataset loaded from ``<LMUDataRoot>/<dataset>.tsv``.

    Subclasses (or callers) must supply their own ``evaluate``.
    """

    TYPE = 'VQA'

    def load_data(self, dataset):
        data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        if file_size(data_path, 'GB') <= 1:
            return load(data_path)

        # Oversized TSVs (>1 GB) are localized once to *_local.tsv for faster
        # reads; set FORCE_LOCAL to refresh the localized copy.
        local_path = data_path.replace('.tsv', '_local.tsv')
        if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
            from ..tools import LOCALIZE

            LOCALIZE(data_path, local_path)
        return load(local_path)

    def evaluate(self, eval_file, **judge_kwargs):
        raise NotImplementedError
999
+
1000
+
1001
class CRPE(ImageBaseDataset):
    """CRPE: relation-probing evaluation over four categories
    (exist / subject / predicate / object), scored by exact-match accuracy."""

    TYPE = 'VQA'
    DATASET_URL = {
        'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
        'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
    }
    DATASET_MD5 = {
        'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
        'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score `eval_file`; returns per-category accuracy (None if a
        category has no samples)."""
        from .utils.crpe import is_correct
        score = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        num = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        final_score_dict = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = str(line['prediction'])
            answers = str(line['answer'])
            category = line['category']
            # Tally both the per-category and the overall counters.
            if is_correct(answers, predict):
                score[category] += 1
                score['total'] += 1
            num[category] += 1
            num['total'] += 1

        for category in ['exist', 'subject', 'predicate', 'object', 'total']:
            if num[category] != 0:
                final_score_dict[category] = score[category] / num[category]
            else:
                final_score_dict[category] = None

        # BUGFIX: the old code replaced the literal '.xlsx' suffix, so for any
        # other eval_file extension score_pth == eval_file and dump() silently
        # overwrote the evaluation file itself. Derive from the real suffix.
        suffix = eval_file.split('.')[-1]
        score_pth = eval_file.replace(f'.{suffix}', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        """Resolve relative image paths against the dataset's image root."""
        ROOT = LMUDataRoot()
        msgs = super().build_prompt(line)
        for msg in msgs:
            if msg['type'] == 'image':
                msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value'])
        return msgs
1070
+
1071
+
1072
class QSpatial(ImageBaseDataset):
    """Q-Spatial-Bench: VQA benchmark probing quantitative spatial reasoning
    (metric distance estimation) of VLMs.

    Predictions and ground truth are normalized to centimeters; a prediction
    is scored correct when max(pred/gt, gt/pred) is below a delta threshold.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'QSpatial_plus': '',
        'QSpatial_scannet': ''
    }

    # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
    # Once you get the permission, you can use the helper code here to download and extract necessary images:
    # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
    qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
    url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"

    def post_build(self, dataset):
        """Download the official prompt templates from GitHub and cache them on
        the instance (``self.system_prompt`` / ``self._prompt_templates``)."""
        import urllib.request

        template_names = [
            "system_prompt.txt",
            "spatial_prompt_single.txt",
            "spatial_prompt_steps.txt",
            "standard_prompt.txt",
            "zero_shot_prompt.txt",
        ]
        with tempfile.TemporaryDirectory() as temp_dir:
            # Fix: fetch with urllib instead of `os.system(f"wget ...")`, which
            # failed silently on systems without wget and interpolated the URL
            # into a shell command.
            for name in template_names:
                urllib.request.urlretrieve(self.url + name, os.path.join(temp_dir, name))

            def _read(name):
                # Fix: the original `open(...).read()` never closed the handles.
                with open(os.path.join(temp_dir, name)) as f:
                    return f.read()

            self.system_prompt = _read("system_prompt.txt")
            self._prompt_templates = dict(
                spatial_prompt_single=_read("spatial_prompt_single.txt"),
                spatial_prompt_steps=_read("spatial_prompt_steps.txt"),
                standard_prompt=_read("standard_prompt.txt"),
                zero_shot_prompt=_read("zero_shot_prompt.txt"),
            )

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Render the spatial prompt template with the record's question and
        prepend the system prompt; images come first in the message list."""
        from jinja2.sandbox import SandboxedEnvironment
        text_prompt_template = self._prompt_templates["spatial_prompt_single"]
        env = SandboxedEnvironment()
        text_prompt = env.from_string(text_prompt_template).render(question=line["question"])
        tgt_path = self.dump_image(line)

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]

        msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
        return msgs

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        """Load the HF split and normalize it: add an `index` column, pack
        (answer_value, answer_unit) into `answer`, and base64-encode images."""
        import io
        import pandas as pd
        from datasets import load_dataset

        hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
        df = hf_dataset.to_pandas()

        df.reset_index(drop=True, inplace=True)
        df['index'] = df.index
        df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
        df = df[['index'] + [col for col in df.columns if col != 'index']]

        if dataset == "QSpatial_scannet":
            # ScanNet images are not distributed with the HF dataset; load them
            # from the locally prepared `qspatial_root` instead.
            df = df.drop(columns=["image"])
            df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]]
        else:
            df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]]

        df["image"] = [encode_image_to_base64(image) for image in df["image"]]
        return df

    @classmethod
    def get_multiplier(cls, unit):
        """Return the factor that converts `unit` to centimeters (0. and a
        warning print for unknown units, which zeroes the prediction)."""
        unit = unit.lower()
        if unit in ["meters", "meter", "m", "metre", "metres"]:
            multiplier = 100
        elif unit in ["centimeters", "centimeter", "cm"]:
            multiplier = 1
        elif unit in ["feet", "foot", "ft"]:
            multiplier = 30.48
        elif unit in ["inch", "inches", "in"]:
            multiplier = 2.54
        elif unit in ["mm"]:
            multiplier = 0.1
        else:
            print(f"Unknown unit: {unit}")
            multiplier = 0.

        return multiplier

    @classmethod
    def parse_string(cls, input_str):
        """Parse a judge answer of the form '(<number-or-range>, <unit>)' into
        centimeters; a range 'a-b' is averaged. Returns 0 when unparseable."""
        match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
        if match:
            number_part = match.group(1)
            text = match.group(2)

            if '-' in number_part:
                # A range such as "3-5": use its midpoint.
                start, end = map(float, number_part.split('-'))
                number = (start + end) / 2
            else:
                number = float(number_part)

            return number * cls.get_multiplier(text)
        else:
            print(f"Unable to parse the input string {input_str}")
            return 0

    @classmethod
    def parse_prediction(cls, vlm_response):
        """Extract the last `scalar{...}` / `distance_unit{...}` boxes from a
        raw VLM response and return the value in centimeters.

        Raises IndexError when no box is found (handled by the caller).
        """
        # Value: average all numbers inside the last scalar{...} box.
        pattern = r'scalar{([^}]*)}'
        str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
        scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
        parsed_scalar = np.array(scalar_list).astype(float).mean()

        # Unit: last distance_unit{...} box.
        pattern = r'distance_unit{([^}]*)}'
        str_inside_unit_boxes = re.findall(pattern, vlm_response)
        parsed_unit = str_inside_unit_boxes[-1]

        pred_value_in_cms = parsed_scalar * cls.get_multiplier(parsed_unit)
        return pred_value_in_cms

    # It returns a dictionary
    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score predictions in `eval_file` and dump a `_score.json`.

        With `model` in judge_kwargs, answers are first extracted by an LLM
        judge (cached in an intermediate xlsx/pkl); otherwise they are parsed
        by regex. Returns the dict of delta-2 / delta-1.5 accuracies overall
        and per question type.
        """
        from ast import literal_eval

        data = load(eval_file)
        if "model" in judge_kwargs:
            from .utils.qspatial import QSpatial_auxeval

            # extract using model
            model = judge_kwargs['model']
            suffix = eval_file.split('.')[-1]
            storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
            tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
            nproc = judge_kwargs.pop('nproc', 4)

            if not osp.exists(storage):
                model = build_judge(max_tokens=128, **judge_kwargs)

                assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
                lt = len(data)
                lines = [data.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = [line['index'] for line in lines]

                # Resume from the cached pkl if a previous run was interrupted.
                ans = {}
                if osp.exists(tmp_file):
                    ans = load(tmp_file)
                    tups = [x for x, i in zip(tups, indices) if i not in ans]
                    indices = [i for i in indices if i not in ans]

                if len(indices):
                    new_results = track_progress_rich(
                        QSpatial_auxeval,
                        tups,
                        nproc=nproc,
                        chunksize=nproc,
                        keys=indices,
                        save=tmp_file,
                    )
                    ans = load(tmp_file)
                    for k, v in zip(indices, new_results):
                        assert k in ans
                        assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

                data['res'] = [ans[idx]['res'] for idx in data['index']]
                data['log'] = [ans[idx]['log'] for idx in data['index']]
                dump(data, storage)

            data = load(storage)

            pred_value_in_cms = []
            for res in data["res"]:
                try:
                    pred_value_in_cms.append(cls.parse_string(res))
                except ValueError:
                    pred_value_in_cms.append(0.)

            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
        else:
            # regex parsing
            pred_value_in_cms = []
            n_errors_in_parsing = 0
            for pred in data["prediction"]:
                try:
                    parsed_value = cls.parse_prediction(pred)
                except IndexError:
                    n_errors_in_parsing += 1
                    parsed_value = 1e-8

                pred_value_in_cms.append(parsed_value)

            print(f"Encounter {n_errors_in_parsing} errors in parsing")
            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8

        # Ground truth: `answer` is the string repr of a (value, unit) tuple.
        # Fix: parse with ast.literal_eval instead of eval (no code execution).
        ground_truth_value_in_cms = []
        for answer in data["answer"]:
            value, unit = literal_eval(answer)
            ground_truth_value_in_cms.append(value * cls.get_multiplier(unit))
        ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8

        # Score: correct iff max(pred/gt, gt/pred) is under the delta threshold.
        pred_gt = pred_value_in_cms / ground_truth_value_in_cms
        gt_pred = ground_truth_value_in_cms / pred_value_in_cms
        delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
        delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5

        data["eval_score_delta_2"] = delta_2
        data["eval_score_delta_1_point_5"] = delta_1_point_5

        final_score_dict = {
            "delta_2": delta_2.mean(),
            "delta_1_point_5": delta_1_point_5.mean()
        }
        for question_type in set(data["question_type"]):
            filtered_data = data[data["question_type"] == question_type]
            delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean()
            delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean()
            final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type})
            final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type})

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
1307
+
1308
+
1309
class MMNIAH(ImageBaseDataset):
    """MM-NIAH (multimodal needle-in-a-haystack) long-context VQA benchmark.

    The TEST split TSV is shipped as five binary parts (part-aa .. part-ae)
    that must be concatenated before loading.
    """

    TYPE = 'VQA'
    DATASET_URL = {
        'MM_NIAH_VAL':
        'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
        'MM_NIAH_TEST':
        ['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
         'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
    DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
                   'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}

    def prepare_tsv(self, url, file_md5=None):
        """Download (and for TEST, reassemble from parts) the dataset TSV,
        localizing it when it exceeds 1 GB. Returns the loaded dataframe."""
        import os
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        elif file_name == 'MM_NIAH_TEST.tsv':
            warnings.warn('The dataset tsv is not downloaded')
            for i in range(len(url)):
                part_path = osp.join(data_root, 'part-a' + chr(ord('a') + i))
                if osp.exists(part_path):
                    print('part_a' + chr(ord('a') + i) + ' is existed')
                    continue
                # Fix: download each split part to its own `part-aX` file. The
                # original passed `data_path` (the final TSV path), so parts
                # never landed where the existence check and the concatenation
                # below expect them.
                download_file(url[i], part_path)
            file_prefix = 'part-'
            output_file = data_path
            split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
            with open(output_file, 'wb') as outfile:
                # Read each split part in order and append it to the output file.
                for filename in split_files:
                    with open(osp.join(data_root, filename), 'rb') as infile:
                        outfile.write(infile.read())
            update_flag = True
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            # Large TSVs embed base64 images; LOCALIZE rewrites them to disk.
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score predictions per category and overall; dumps `_score.json`
        and returns the accuracy dict (None for categories with no samples)."""
        from .utils.mmniah import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        categories = ['count-text', 'find-image', 'find-text',
                      'infer-choose', 'count-image', 'visual-reasoning', 'total']
        MMNIAH_score = {c: 0 for c in categories}
        MMNIAH_num = {c: 0 for c in categories}
        final_score_dict = {c: 0 for c in categories}
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = line['prediction']
            answers = line['answer']
            category = line['category']
            if category in ['visual-reasoning', 'find-image']:
                # These categories store the answer as an integer option index.
                answers = int(answers)
            if is_correct(answers, predict):
                MMNIAH_score[category] += 1
                MMNIAH_score['total'] += 1
            MMNIAH_num[category] += 1
            MMNIAH_num['total'] += 1

        for category in ['find-image', 'count-text', 'find-text',
                         'infer-choose', 'count-image', 'visual-reasoning', 'total']:
            if MMNIAH_num[category] != 0:
                final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
            else:
                final_score_dict[category] = None

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        """Interleave the record's question text and images around `<image>`
        placeholders, appending MCQ options or a short-answer instruction."""
        msgs = super().build_prompt(line)
        if isinstance(line, int):
            line = self.data.iloc[line]
        totalchoice = line['multi-choice options']
        # NOTE(review): dataset-authored field evaluated with eval(); fine for
        # trusted TSVs, but ast.literal_eval would be safer — confirm format.
        totalchoice = eval(totalchoice)
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        context = msgs[-1]['value']
        context = eval(context)
        question = context[0] + '\n' + context[1]
        # tgt_path: the list of all image paths in this record.
        tgt_path = []
        for i in range(len(msgs) - 1):
            tgt_path.append(msgs[i]['value'])
        choices = totalchoice[0]
        choices_image = totalchoice[1]
        if choices:
            for c_idx, c in enumerate(choices):
                question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
            question += "\nAnswer with the option's letter from the given choices directly."
        elif choices_image:
            # Image options: 4 trailing <image> placeholders stand for A-D.
            for c_idx in range(len(choices_image)):
                question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
            question += "\nAnswer with the option's letter from the given choices directly."
        else:
            question += '\nAnswer the question using a single word or phrase.'
        # Sentinels mark the string ends so the markers can be stripped after
        # splitting; '<start>' is 7 chars, '<end>' is 5.
        question = '<start>' + question + '<end>'
        question = question.split('<image>')
        if choices_image:
            for i in range(len(question) - 5):
                question[i] = question[i] + '\n<image>'
            for i in range(len(question) - 5, len(question) - 1):
                question[i] = question[i] + '<image>'
        else:
            for i in range(len(question) - 1):
                question[i] = question[i] + '\n<image>'
        assert len(tgt_path) + 1 == len(question)
        context = []
        for i in range(len(tgt_path)):
            context.append(question[i])
            context.append(tgt_path[i])
        context.append(question[-1])
        context[0] = context[0][7:]
        context[-1] = context[-1][:-5]
        msgs = []
        for i in range(len(context)):
            if i % 2 == 0:
                msgs.append(dict(type='text', value=context[i]))
            else:
                ROOT = LMUDataRoot()
                msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i])))
        # Fix: the original removed items from `msgs` while iterating over it,
        # which skips the element immediately after each removal; filter instead.
        msgs = [m for m in msgs if m['value'] != '']
        return msgs
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..smp import *
2
+ from ..utils import *
3
+ from .image_base import ImageBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+
6
+
7
class ImageYORNDataset(ImageBaseDataset):
    """Yes-or-No (Y/N) image QA datasets: MME, HallusionBench, POPE, AMBER.

    Predictions are normalized to Yes/No by rule-based extraction
    (YOrN_Extraction), with an optional LLM judge fallback for responses the
    rules leave as 'Unknown'.
    """

    TYPE = 'Y/N'

    # TSV download URL per dataset name.
    DATASET_URL = {
        'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
        'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
        'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
        'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
    }

    # Expected md5 checksum per dataset TSV.
    DATASET_MD5 = {
        'MME': 'b36b43c3f09801f5d368627fb92187c3',
        'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
        'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
        'AMBER': '970d94c0410916166e0a76ba75da7934',
    }

    # Returns the rating produced by the dataset-specific *_rating helper
    # (a dataframe-like score table), and dumps it to `_score.csv`.
    def evaluate(self, eval_file, **judge_kwargs):
        """Score Y/N predictions in `eval_file`.

        Pipeline: rule-extract Yes/No per row -> optionally resolve remaining
        'Unknown' rows with an LLM judge (cached in `_auxmatch.xlsx` /
        `_tmp.pkl`) -> compare against answers -> dataset-specific rating.
        """
        from .utils.yorn import YOrN_Extraction, YOrN_auxeval
        from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating

        dataset = self.dataset_name
        data = load(eval_file)
        data['prediction'] = [str(x) for x in data['prediction']]
        # Cache files: storage holds extracted answers, tmp_file holds partial
        # LLM-judge results so an interrupted run can resume.
        storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            # Rule-based extraction first; merge any earlier judge results.
            ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
            if osp.exists(tmp_file):
                tmp = load(tmp_file)
                for k in tmp:
                    if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
                        ans_map[k] = tmp[k]

            data['extracted'] = [ans_map[x] for x in data['index']]
            unknown = data[data['extracted'] == 'Unknown']

            # Fall back to exact matching when no judge model is usable.
            model = judge_kwargs.get('model', 'exact_matching')
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                model = None
                warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')

            if model is not None:
                # Only the rows still 'Unknown' are sent to the judge.
                lt = len(unknown)
                lines = [unknown.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = list(unknown['index'])
                if len(tups):
                    res = track_progress_rich(
                        YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
                    for k, v in zip(indices, res):
                        ans_map[k] = v

                data['extracted'] = [ans_map[x] for x in data['index']]
                dump(data, storage)

        data = load(storage)
        # AMBER answers are compared case-insensitively.
        if listinstr(['AMBER'], dataset):
            data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
        else:
            data['score'] = (data['answer'] == data['extracted'])
        dump(data, storage)

        # Dispatch to the dataset-specific rating scheme.
        if dataset is not None and listinstr(['MME'], dataset):
            score = MME_rating(storage)
        elif dataset is not None and listinstr(['Hallusion'], dataset):
            score = Hallusion_rating(storage)
        elif dataset is not None and listinstr(['POPE'], dataset):
            score = POPE_rating(storage)
        elif dataset is not None and listinstr(['AMBER'], dataset):
            score = AMBER_rating(storage)
        else:
            score = default_rating(storage)

        score_tgt = eval_file.replace('.xlsx', '_score.csv')
        dump(score, score_tgt)
        return score
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import snapshot_download
2
+ from ..smp import *
3
+ from .video_base import VideoBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+ from glob import glob
6
+
7
+ FAIL_MSG = 'Failed to obtain answer via API.'
8
+
9
+
10
def timestamp_to_seconds(timestamp):
    """Convert an ``"HH:MM:SS[.fff]"`` string into total seconds as a float."""
    hours, minutes, seconds = timestamp.split(":")
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
16
+
17
+
18
def uniformly_subsample(lst, K):
    """Uniformly pick at most K elements from `lst`, preserving order.

    Returns `lst` itself (not a copy) when K >= len(lst). Fix: K <= 0 now
    returns an empty list instead of raising ZeroDivisionError on `n / K`.
    """
    if K <= 0:
        return []
    n = len(lst)
    if K >= n:
        return lst
    step = n / K
    return [lst[int(i * step)] for i in range(K)]
24
+
25
+
26
def insert_subtitles_into_frames(
    frames,
    frame_timestamps,
    subtitles,
    starting_timestamp_for_subtitles,
    duration,
):
    """Interleave subtitle texts between video frames by timestamp.

    Walks `subtitles` in order; before each subtitle, emits all frames whose
    timestamp is <= the subtitle's midpoint, then emits the subtitle text if
    at least one frame falls inside its (possibly widened) time window.
    Remaining frames are appended at the end. Returns a list of
    {"type": "image"|"text", "value": ...} message dicts.

    Assumes `subtitles` is sorted by time and `frame_timestamps` is ascending
    and aligned 1:1 with `frames` — TODO confirm with callers.
    """
    interleaved_list = []
    cur_i = 0  # index of the first frame not yet emitted

    for subtitle in subtitles:
        # Two subtitle schemas: {"timestamp": (start, end), "text": ...} or
        # {"start": "HH:MM:SS", "end": "HH:MM:SS", "line": ...}.
        if "timestamp" in subtitle:
            start, end = subtitle["timestamp"]

            # A non-float end (e.g. None) means the subtitle runs to the end.
            if not isinstance(end, float):
                end = duration

            # Shift into the clip's local timeline.
            start -= starting_timestamp_for_subtitles
            end -= starting_timestamp_for_subtitles

            subtitle_timestamp = (start + end) / 2
            subtitle_text = subtitle["text"]
        else:
            start, end = subtitle["start"], subtitle["end"]
            start = timestamp_to_seconds(start)
            end = timestamp_to_seconds(end)
            start -= starting_timestamp_for_subtitles
            end -= starting_timestamp_for_subtitles

            subtitle_timestamp = (start + end) / 2
            subtitle_text = subtitle["line"]

        # Emit every not-yet-emitted frame up to the subtitle midpoint.
        for i, (frame, frame_timestamp) in enumerate(
            zip(frames[cur_i:], frame_timestamps[cur_i:])
        ):
            if frame_timestamp <= subtitle_timestamp:
                # print("frame:", frame_timestamp)
                interleaved_list.append({"type": "image", "value": frame})
                cur_i += 1
            else:
                break

        # Widen very short subtitle windows to 1s around the midpoint so the
        # coverage test below has a chance to match a frame.
        if end - start < 1:
            end = subtitle_timestamp + 0.5
            start = subtitle_timestamp - 0.5

        # Only keep subtitles whose window overlaps at least one sampled frame
        # (scans all frames, not just the remaining ones).
        covering_frames = False
        for frame, frame_timestamp in zip(frames, frame_timestamps):
            if frame_timestamp < end and frame_timestamp > start:
                covering_frames = True
                break

        if covering_frames:
            interleaved_list.append({"type": "text", "value": subtitle_text + "\n"})
        else:
            # Subtitle has no frame inside its window: drop it silently.
            pass

    # Append all frames after the last subtitle.
    for i, (frame, frame_timestamp) in enumerate(
        zip(frames[cur_i:], frame_timestamps[cur_i:])
    ):
        interleaved_list.append({"type": "image", "value": frame})
    return interleaved_list
88
+
89
+
90
class LongVideoBench(VideoBaseDataset):
    """LongVideoBench: long-video multiple-choice QA (Video-MCQ).

    Handles dataset download/extraction from HF (or ModelScope), frame
    sampling, subtitle interleaving, and MCQ scoring.
    """

    # Expected md5 of the generated <dataset>.tsv index file.
    MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9'
    # System prompt prepended to every message (currently empty).
    SYS = ''

    TYPE = 'Video-MCQ'

    def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1):
        """`nframe` > 0 samples a fixed frame count; `fps` > 0 samples by rate.

        NOTE(review): despite the name, subtitles are inserted when
        `use_subtitle` is False (see build_prompt) — looks inverted; confirm.
        """
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
        self.use_subtitle = use_subtitle
        self.dataset_name = dataset

    @classmethod
    def supported_datasets(cls):
        return ['LongVideoBench']

    def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'):
        """Ensure the dataset (tsv index + videos) exists locally; download,
        reassemble split tars, and extract if needed. Returns
        dict(data_file=..., root=...)."""

        def check_integrity(pth):
            # Valid cache = tsv present, md5 matches, every referenced video exists.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                print("md5 mismatch", md5(data_file), self.MD5)
                return False
            data = load(data_file)
            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    print(video_pth, "is not found")
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/LongVideoBench"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Build the tsv index from lvb_val.json unless a valid one exists.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return

                data_file = pd.read_json(osp.join(pth, 'lvb_val.json'))
                data_file = data_file.assign(index=range(len(data_file)))
                data_file['video'] = data_file['video_id']
                data_file['video_path'] = data_file['video_path'].apply(lambda x: f'./videos/{x}')

                data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_snapshot_download(dataset_id=repo_id)
            else:
                snapshot_download(repo_id=repo_id, repo_type='dataset')
            print("All videos are downloaded for LongVideoBench")

            # Videos ship as multi-part tar archives; reassemble and extract
            # them unless a videos/ directory already exists.
            if not glob(osp.join(cache_path, "videos")):
                tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True)

                def untar_video_data(tar_file, cache_dir):
                    import tarfile
                    with tarfile.open(tar_file, "r") as tar_ref:
                        tar_ref.extractall(cache_dir)
                        print(f"Extracted all files from {tar_file} to {cache_dir}")

                def concat_tar_parts(tar_parts, output_tar):
                    # Byte-concatenate sorted .tar.partXX files into one tar.
                    with open(output_tar, "wb") as out_tar:
                        from tqdm import tqdm
                        for part in tqdm(sorted(tar_parts)):
                            with open(part, "rb") as part_file:
                                out_tar.write(part_file.read())
                    print(f"Concatenated parts {tar_parts} into {output_tar}")

                tar_parts_dict = {}

                # Group tar parts together
                for tar_file in tar_files:
                    base_name = tar_file.split(".tar")[0]
                    if base_name not in tar_parts_dict:
                        tar_parts_dict[base_name] = []
                    tar_parts_dict[base_name].append(tar_file)

                # Concatenate and untar split parts
                for base_name, parts in tar_parts_dict.items():
                    print(f"Extracting following tar files: {parts}")
                    output_tar = base_name + ".tar"
                    if not osp.exists(output_tar):
                        print('Start concatenating tar files')

                        concat_tar_parts(parts, output_tar)
                        print('Finish concatenating tar files')

                    if not osp.exists(osp.join(cache_path, osp.basename(base_name))):
                        untar_video_data(output_tar, cache_path)

                print('All videos are extracted for LongVideoBench')

            dataset_path = cache_path
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        return dict(data_file=data_file, root=dataset_path)

    def save_video_frames(self, video_path, video_llm=False):
        """Sample frame indices from the video and (unless `video_llm`) save
        them as images. Returns (frame_paths, indices, video_info).

        NOTE(review): if both nframe <= 0 and fps <= 0, neither branch runs
        and `indices`/`frame_paths` are unbound (NameError) — confirm callers
        always set one of them.
        """
        vid_path = osp.join(self.data_root, video_path)
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Fixed frame count: evenly spaced interior samples.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video_path[:-4])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video_path[:-4], len(indices))

        # Only decode and write images when some frame file is missing.
        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth) and not video_llm:
                    im.save(pth)

        return frame_paths, indices, video_info

    # def save_video_into_images(self, line, num_frames=8):
    #     frame_paths, indices, video_info = self.save_video_frames(line['video_path'], num_frames)
    #     return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the MCQ message: raw video for video-LLMs, else sampled
        frames (optionally interleaved with subtitles) plus lettered options."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm)
        fps = video_info["fps"]

        message = [dict(type='text', value=self.SYS)]
        if video_llm:
            message.append(dict(type='video', value=osp.join(self.data_root, line['video_path'])))
        else:
            # NOTE(review): subtitles are inserted on `not use_subtitle` —
            # likely an inverted condition; confirm intended semantics.
            if not self.use_subtitle:
                with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f:
                    subtitles = json.load(f)

                frame_message = insert_subtitles_into_frames(
                    frames,
                    [ind_ / fps for ind_ in indices],
                    subtitles,
                    line["starting_timestamp_for_subtitles"],
                    line["duration"]
                )

                message += frame_message
            else:
                for im in frames:
                    message.append(dict(type='image', value=im))

        # Append lettered candidates (A., B., ...) to the question text.
        line['question'] += '\n' + '\n'.join(
            ["{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(eval(line['candidates']))]
        )
        prompt = line["question"] + "\nAnswer with the option's letter from the given choices directly."
        message.append(dict(type='text', value=prompt))
        return message

    # Returns the rating dictionary produced by get_dimension_rating.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score MCQ predictions: regex letter extraction first, LLM-judge
        fallback for unparseable outputs; dumps `_score.xlsx` and
        `_rating.json`, returns the rating dict."""
        from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.get('model', 'exact_matching')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # Drop cached judge answers that were API failures.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                # correct_choice is stored as a 0-based option number.
                ans = data.loc[data['index'] == idx, 'correct_choice'].values[0]
                ans = chr(ord("A") + ans)
                pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])

                # NOTE(review): writes use `data.loc[idx, ...]` (row label)
                # while reads filter on the 'index' column — this assumes the
                # dataframe label equals the 'index' column; confirm.
                if extract_characters_regex(pred) == '':
                    extract_pred = extract_option(
                        model,
                        data.loc[data['index'] == idx].to_dict(orient='records')[0],
                        'LongVideoBench'
                    )
                    data.loc[idx, 'score'] = int(extract_pred == ans)
                else:
                    data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from .image_base import ImageBaseDataset
7
+ from ..smp import *
8
+ from .utils import build_judge, DEBUG_MESSAGE
9
+ from ..utils import track_progress_rich
10
+
11
+
12
def generate_prompt(d):
    """Build the grading prompt sent to the judge LLM for one MIA-Bench sample.

    Args:
        d: Mapping (e.g. a pandas row) with keys 'question',
            'component_weight' and 'components' (stringified Python lists),
            'num_of_component' and 'prediction'.

    Returns:
        str: The full instruction string asking the judge to score each
        component and report a total score out of 10.
    """
    question = d['question']
    # NOTE: dataset-provided fields are trusted; eval() parses the
    # stringified lists stored in the TSV.
    weights = eval(d['component_weight'])
    components = eval(d['components'])
    num_of_component = int(d['num_of_component'])
    response = d['prediction']

    if num_of_component == 1:
        components = f"The first component is: '{components[0]}'. "
        score = f"The first component is worth: {weights[0]} scores. "
    elif num_of_component == 2:
        components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
        score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
    elif num_of_component == 3:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}'. "
        )
        score = (
            "The first, second, and third component is each worth "
            f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
        )
    elif num_of_component == 4:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
        )
        score = (
            "The first, second, third, and fourth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
        )
    elif num_of_component == 5:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
            f"and the fifth component is '{components[4]}'. "
        )
        score = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
        )
    else:
        # Generalized fallback: previously any component count outside 1-5
        # crashed with an UnboundLocalError on `score`. Build the two
        # descriptions generically so the function works for any N >= 1.
        components = ''.join(
            f"Component {i + 1} is: '{c}'. " for i, c in enumerate(components)
        )
        score = (
            "Each component is worth "
            + ', '.join(str(w) for w in weights)
            + " scores respectively. "
        )

    return (
        "Here is an instruction for a multimodal LLM: '"
        f"{question}"
        "'. You need to grade if the response from the model follows each component of the instruction. "
        f"{components}"
        "The response is: '"
        f"{response}"
        "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{score}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )
67
+
68
+
69
def process_rawscore(component_type, raw_score):
    """Parse the judge's first sentence into per-component score fractions.

    The judge is asked to answer in the form
    ``score of component 1: x/2, score of component 2: y/8, total score: z/10.``
    Only the text before the first period is parsed.

    Args:
        component_type: Sequence of component names, aligned with the
            per-component clauses of the sentence.
        raw_score: Raw judge response string.

    Returns:
        dict: component name -> fraction in [0, 1], plus key 'total_score'.

    Raises:
        ValueError / IndexError / ZeroDivisionError: when the response does
        not follow the expected format (callers treat this as best-effort).
    """
    clauses = raw_score.split('.')[0].split(',')
    score_dict = {}
    # All clauses except the last hold per-component scores ("...: x/2").
    for i in range(len(clauses) - 1):
        parts = clauses[i].split(':')[1][1:].split('/')
        score_dict[component_type[i]] = int(parts[0]) / int(parts[1])
    # The last clause is the total ("total score: z/10"). Indexing with [-1]
    # fixes the old NameError (`i` unbound) when the sentence had a single
    # clause and the loop above never ran.
    total_parts = clauses[-1].split(':')[1][1:].split('/')
    score_dict['total_score'] = int(total_parts[0]) / int(total_parts[1])
    return score_dict
80
+
81
+
82
def get_score_dict(data, score_raw):
    """Aggregate raw judge responses into per-category average scores.

    Args:
        data: DataFrame with a stringified 'component_type' list per row.
        score_raw: Sequence of raw judge response strings aligned with rows.

    Returns:
        dict: component type (plus 'total_score') -> mean score over all
        rows whose response could be parsed.
    """
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            # Strip the "['" prefix and "']" suffix of the stringified list,
            # then split on the "', '" separators.
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                cat_score_dict.setdefault(key, []).append(val)
        except Exception:
            # Best effort: a malformed judge response just skips this row.
            # (Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt / SystemExit.)
            pass
    return {key: sum(val) / len(val) for key, val in cat_score_dict.items()}
100
+
101
+
102
class MIABench(ImageBaseDataset):
    """MIA-Bench instruction-following benchmark, scored by an LLM judge."""

    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score the predictions in ``eval_file`` with a multimodal LLM judge.

        Args:
            eval_file: Result file holding a 'prediction' column plus the
                original sample fields used by ``generate_prompt``.
            **judge_kwargs: Judge configuration; 'model' picks the judge
                (default 'gpt-4o'), 'nproc' the request parallelism.

        Returns:
            dict: component type (plus 'total_score') -> mean score, as
            produced by ``get_score_dict``; also dumped as a CSV next to
            the judge-response xlsx.
        """
        judge_name = judge_kwargs.pop('model', 'gpt-4o')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        # `storage` keeps the judged rows, `tmp_file` caches raw responses
        # so an interrupted run can resume.
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 4)  # noqa: F841

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]
            # Prediction files do not carry the images; fetch them from the
            # original dataset TSV and match rows by 'index'.
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)

            # Only query the judge for samples not already cached.
            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp
            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)

        return results
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ from huggingface_hub import snapshot_download
3
+ from ..smp import *
4
+ from .video_concat_dataset import ConcatVideoDataset
5
+ from .video_base import VideoBaseDataset
6
+ from .utils import build_judge, DEBUG_MESSAGE
7
+ from ..utils import track_progress_rich
8
+ import torchvision.transforms as T
9
+ from torchvision import transforms
10
+ from torchvision.transforms.functional import InterpolationMode
11
+ from decord import VideoReader, cpu
12
+ import pandas as pd
13
+ import imageio
14
+ import cv2
15
+ import zipfile
16
+ import os
17
+ import glob
18
+ from .utils.mlvu import *
19
+
20
+ FAIL_MSG = 'Failed to obtain answer via API.'
21
+
22
+
23
class MLVU(ConcatVideoDataset):
    """Composite MLVU benchmark: concatenation of the MCQ and open-ended subsets."""

    def __init__(self, dataset='MLVU', nframe=0, fps=-1):
        self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded']
        # Task names grouped into the two headline metrics:
        # M-Avg (multiple-choice, reported as a percentage) and
        # G-Avg (generation/open-ended, reported on the raw score scale).
        self.type_data_dict = {
            'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'],
            'G-Avg':['sub_scene', 'summary']
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU']

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate both subsets, then append the M-Avg / G-Avg summary rows.

        Returns the per-task accuracy table (also dumped to ``*_acc.csv``).
        """
        result = super().evaluate(eval_file=eval_file, **judge_kwargs)
        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        for key in self.type_data_dict:
            # Add a zeroed summary row, then accumulate the tasks belonging
            # to this metric group.
            result.loc[key] = 0.0
            for name, item in result.iterrows():
                if name in self.type_data_dict[key]:
                    result.loc[key, 'success'] += item['success']
                    result.loc[key, 'overall'] += item['overall']
            if key == 'G-Avg':
                # Open-ended: keep the raw average, rounded to 2 decimals.
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'], 2
                )
            else:
                # MCQ: report percentage accuracy rounded to 1 decimal.
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'] * 100, 1
                )
        result = result.reset_index().rename(columns={'index': 'task'})
        dump(result, score_file)
        return result
57
+
58
+
59
class MLVU_MCQ(VideoBaseDataset):
    """Multiple-choice subset of the MLVU long-video understanding benchmark."""

    MD5 = 'bb5c37e7cf8d43fc9a25c23d2b4633f5'
    BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
    SYS = BASE_SYS + 'Based on your observations, select the best option that accurately addresses the question.'
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MLVU_MCQ', nframe=0, fps=-1):
        # task name -> (annotation json, video directory, question type)
        self.type_data_list = {
            'plotQA': ('1_plotQA.json', './MLVU/video/1_plotQA', 'MCQ'),
            'needle': ('2_needle.json', './MLVU/video/2_needle', 'MCQ'),
            'ego': ('3_ego.json', './MLVU/video/3_ego', 'MCQ'),
            'count': ('4_count.json', './MLVU/video/4_count', 'MCQ'),
            'order': ('5_order.json', './MLVU/video/5_order', 'MCQ'),
            'anomaly_reco': ('6_anomaly_reco.json', './MLVU/video/6_anomaly_reco', 'MCQ'),
            'topic_reasoning': ('7_topic_reasoning.json', './MLVU/video/7_topic_reasoning', 'MCQ'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU_MCQ']

    def prepare_dataset(self, dataset_name='MLVU_MCQ', repo_id='MLVU/MVLU'):
        """Download the dataset (HF or ModelScope), build the TSV index, and
        return dict(root=..., data_file=...)."""
        def check_integrity(pth):
            # A cache is valid when the TSV exists with the expected MD5 and
            # every referenced video file is present.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/MLVU"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Flatten all per-task JSON annotations into one TSV.
                # NOTE(review): this closure reads `dataset_path` from the
                # enclosing scope (assigned below, before the call) rather
                # than its own `pth` argument for the json dir — verify
                # intentional.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': v[1],
                            'duration': data['duration'],
                            'video': data['video'],
                            'question': data['question'],
                            'answer': data['answer'],
                            'candidates': data['candidates'],
                        })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')

            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Format one row into (question-with-options, '(X) answer') strings."""
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video (by nframe or fps) and cache
        them as images; returns the frame paths."""
        suffix = line['video'].split('.')[-1]
        video = line['video'].replace(f'.{suffix}','')
        vid_path = osp.join(self.data_root, line['prefix'], line['video'])
        # NOTE(review): `decord` is referenced as a module here although the
        # visible import is `from decord import VideoReader, cpu`; presumably
        # the star import provides it — verify.
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        # NOTE(review): if nframe <= 0 and fps <= 0, `frame_paths`/`indices`
        # stay unbound and the code below raises — confirm callers always set
        # one of them.
        if self.nframe > 0 and self.fps < 0:
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video)
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video, len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # Decode and cache only the missing frames.
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the chat message list; pass the raw video to video LLMs,
        otherwise attach sampled frames as images."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score MCQ predictions (exact matching, optionally with an LLM
        judge fallback) and return the per-dimension rating."""
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            # `model` becomes None whenever the judge cannot be used; the
            # scoring helper then falls back to exact matching.
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # NOTE(review): `res` (the tmp cache) is loaded/filtered but never
            # consulted below — confirm whether resume support was intended.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                # Expand candidates into A/B/C/... columns and rewrite the
                # answer as its option letter for the judging helper.
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    # NOTE(review): `data.loc[idx, ...]` is label-based while
                    # `idx` comes from the 'index' column — this assumes the
                    # frame's row labels equal that column; verify.
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MLVU_MCQ'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
273
+
274
+
275
class MLVU_OpenEnded(VideoBaseDataset):
    """Open-ended (generation) subset of MLVU, scored by a GPT judge."""

    MD5 = 'cee573a3627c6ac434ded704c60511ba'
    BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
    SYS = BASE_SYS + 'Based on your observations, answer the given questions.'
    TYPE = 'Video-VQA'

    def __init__(self, dataset='MLVU_OpenEnded', nframe=0, fps=-1):
        # task name -> (annotation json, video directory, question type)
        self.type_data_list = {
            'sub_scene': ('8_sub_scene.json', './MLVU/video/8_sub_scene', 'VQA'),
            'summary': ('9_summary.json', './MLVU/video/9_summary', 'VQA')
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU_OpenEnded']

    def prepare_dataset(self, dataset_name='MLVU_OpenEnded', repo_id='MLVU/MVLU'):
        """Download the dataset (HF or ModelScope), build the TSV index, and
        return dict(root=..., data_file=...)."""
        def check_integrity(pth):
            # A cache is valid when the TSV exists with the expected MD5 and
            # every referenced video file is present.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/MLVU"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                # Flatten the per-task JSON annotations into one TSV.
                # NOTE(review): reads `dataset_path` from the enclosing scope
                # for the json dir rather than its own `pth` — verify.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': v[1],
                            'duration': data['duration'],
                            'video': data['video'],
                            'question': data['question'],
                            'answer': data['answer'],
                            # 'scoring_points' only exists for some tasks.
                            'scoring_points': data['scoring_points'] if 'scoring_points' in data else ''
                        })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')

            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return the row's (question, reference answer) as plain strings."""
        question = f"{data['question']}"
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video (by nframe or fps) and cache
        them as images; returns the frame paths."""
        suffix = line['video'].split('.')[-1]
        video = line['video'].replace(f'.{suffix}','')
        vid_path = osp.join(self.data_root, line['prefix'], line['video'])
        # NOTE(review): `decord` module reference relies on a star import;
        # the visible import is `from decord import VideoReader, cpu`.
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        # NOTE(review): `frame_paths`/`indices` stay unbound when nframe <= 0
        # and fps <= 0 — confirm callers always set one of them.
        if self.nframe > 0 and self.fps < 0:
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video)
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video, len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # Decode and cache only the missing frames.
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the chat message list; pass the raw video to video LLMs,
        otherwise attach sampled frames as images."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge open-ended predictions with gpt-4-0125 (forced) and return
        the per-dimension rating."""

        model = judge_kwargs['model'] if 'model' in judge_kwargs else judge_kwargs.setdefault('model', 'gpt-4-0125')
        # The official MLVU open-ended protocol is pinned to gpt-4-0125.
        if model != 'gpt-4-0125':
            print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125')
            judge_kwargs['model'] = 'gpt-4-0125'

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            # Each task type gets its own judge with a dedicated system
            # prompt (both prompts come from utils.mlvu via star import).
            model_dict = {
                'sub_scene': build_judge(system_prompt=system_prompt_sub_scene, **judge_kwargs),
                'summary': build_judge(system_prompt=system_prompt_summary, **judge_kwargs)
            }
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model_dict[line['task_type']], line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume from the tmp cache: skip rows already judged.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    MLVU_OpenEnded_generate,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
            ans = load(tmp_file)
            data = MLVU_OpenEnded_extract(ans, data)
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import snapshot_download
2
+ from ..smp import *
3
+ from .video_base import VideoBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+ from ..utils import track_progress_rich
6
+
7
+
8
+ FAIL_MSG = 'Failed to obtain answer via API.'
9
+
10
+
11
def unwrap_hf_pkl(pth, suffix='.mp4'):
    """Restore the benchmark videos that ship packed inside pickle files.

    Reads every pickle under ``<pth>/video_pkl/`` (in sorted order) and
    writes each contained video blob to ``<pth>/video/<name><suffix>``.
    Does nothing when the target directory already exists.
    """
    pkl_dir = os.path.join(pth, 'video_pkl/')
    out_dir = os.path.join(pth, 'video/')
    archives = sorted(os.path.join(pkl_dir, name) for name in os.listdir(pkl_dir))

    if os.path.exists(out_dir):
        print('The video file already exists.')
        return

    os.makedirs(out_dir, exist_ok=True)
    for archive in archives:
        with open(archive, 'rb') as fin:
            blobs = pickle.load(fin)
        # Each pickle maps video names to raw bytes; dump every entry as a
        # standalone video file.
        for name, payload in blobs.items():
            with open(os.path.join(out_dir, f'{name}{suffix}'), 'wb') as fout:
                fout.write(payload)
    print('The video file has been restored and stored from the pickle file.')
30
+
31
+
32
class MMBenchVideo(VideoBaseDataset):
    """MMBench-Video benchmark: frame-sampled video VQA scored by a GPT judge."""

    MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
    SYS = 'You are an AI assistant responsible for answering questions about videos.'
    # Template used in 'pack' mode: all questions of one video answered in a
    # single JSON reply.
    FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
'I1': 'Answer to Question I1',
'I2': 'Answer to Question I2',
...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""

    # Template used when each question is asked individually.
    FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""

    TYPE = 'Video-VQA'

    def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1):
        super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MMBench-Video']

    def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='opencompass/MMBench-Video'):
        """Download (HF or ModelScope), unpack the pickled videos, and return
        dict(data_file=..., root=...)."""
        def check_integrity(pth):
            # Valid cache: TSV with the expected MD5 and all videos present.
            data_file = osp.join(pth, f'{dataset_name}.tsv')
            if md5(data_file) != self.MD5:
                return False
            data = load(data_file)
            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            # Videos are shipped as pickles; restore them into video/.
            unwrap_hf_pkl(dataset_path)
        self.video_path = osp.join(dataset_path, 'video/')
        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))

    def build_prompt_pack(self, line):
        """Build one message covering ALL questions of a single video (JSON
        answers requested)."""
        if isinstance(line, int):
            assert line < len(self)
            video = self.videos[line]
        elif isinstance(line, pd.Series):
            video = line['video']
        elif isinstance(line, str):
            video = line

        frames = self.save_video_frames(video)
        sub = self.data[self.data['video'] == video]
        sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(len(frames))
        message = [dict(type='text', value=sys_prompt)]
        for im in frames:
            message.append(dict(type='image', value=im))
        nq = len(sub)
        prompt = 'Questions: \n{}\nAnswers: \n'
        # Keyed by the row 'index' so answers can be matched back later.
        qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
        prompt = prompt.format(json.dumps(qs))
        message.append(dict(type='text', value=prompt))
        return message

    def build_prompt_nopack(self, line, video_llm):
        """Build a message for a single question: raw video for video LLMs,
        sampled frames otherwise."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]
        if video_llm:
            question = line['question']
            prefix, video_idx_path = os.path.split(line['video_path'])
            message = [dict(type='text', value=question)]
            message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
            return message
        else:
            frames = self.save_video_frames(line['video'])
            sys_prompt = self.FRAMES_TMPL_NOPACK.format(len(frames))
            message = [dict(type='text', value=sys_prompt)]
            for im in frames:
                message.append(dict(type='image', value=im))
            prompt = 'Question: {}\nAnswer: '.format(line['question'])
            message.append(dict(type='text', value=prompt))
            return message

    def build_prompt(self, line, video_llm):
        # Pack mode is only usable when frames (not raw video) are fed.
        if self.pack and not video_llm:
            return self.build_prompt_pack(line)
        else:
            return self.build_prompt_nopack(line, video_llm)

    @staticmethod
    def remove_side_quote(s, syms=[',', '"', "'"]):
        """Strip surrounding quote/comma characters from a token; returns ''
        if the token is made only of such characters."""
        if np.all([x in syms for x in s]):
            return ''
        while s[0] in syms:
            s = s[1:]
        while s[-1] in syms:
            s = s[:-1]
        return s

    @staticmethod
    def robust_json_load(s):
        """Best-effort parse of the model's JSON answer block; falls back to
        line-by-line 'key: value' recovery. Returns a dict or None."""
        try:
            jsons = list(extract_json_objects(s))
            assert len(jsons) == 1
            return jsons[0]
        except:
            # Recovery path: exactly one '{' in the string — parse the lines
            # after it manually.
            if '{' in s and s.find('{') == s.rfind('{'):
                sub_str = s[s.find('{') + 1:].strip()
                lines = sub_str.split('\n')
                res = {}
                for l in lines:
                    l = l.strip()
                    if ': ' in l:
                        key = l.split(': ')[0].strip()
                        val = l.split(': ')[1].strip()
                        key = MMBenchVideo.remove_side_quote(key)
                        val = MMBenchVideo.remove_side_quote(val)
                        if len(key) and len(val):
                            res[key] = val
                return res
            return None

    def load_pack_answers(self, data_raw):
        """Unpack per-video JSON answers back into per-question rows.

        Returns (meta, vstats): the metadata frame with a 'prediction'
        column, and parse/generation statistics.
        """
        vstats = defaultdict(lambda: 0)
        data = defaultdict(lambda: {})

        for k in data_raw:
            ans = data_raw[k].strip()
            if FAIL_MSG in ans:
                vstats['GEN_FAIL'] += 1
                continue
            res = self.robust_json_load(ans)
            if res is not None:
                data[k] = res
                vstats['PARSE_OK'] += 1
            else:
                vstats['PARSE_FAIL'] += 1

        # return data
        meta = cp.deepcopy(self.data)
        lt = len(meta)
        prediction = []
        for i in range(lt):
            line = meta.iloc[i]
            vid = line['video']
            idx = str(line['index'])
            prediction.append(data[vid][idx] if idx in data[vid] else None)
        meta['prediction'] = prediction
        vstats['VALIDQ'] = len([x for x in prediction if x is not None])
        vstats['INVALIDQ'] = len([x for x in prediction if x is None])
        return meta, vstats

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions with the configured GPT judge and return the
        per-dimension rating dict (also dumped to *_rating.json)."""
        from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
        judge = judge_kwargs['model']
        nproc = judge_kwargs.pop('nproc', 4)

        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')

        model = build_judge(system_prompt=system_prompt, **judge_kwargs)
        assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE

        if not osp.exists(score_file):
            # Resume from the tmp cache, dropping failed judge calls.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if model.fail_msg not in v}

            data = load(eval_file)
            data_un = data[~data['index'].isin(res)]
            data_un = data_un[~pd.isna(data_un['prediction'])]
            lt = len(data_un)
            prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
            indices = [data_un.iloc[i]['index'] for i in range(lt)]

            if len(prompts):
                _ = track_progress_rich(
                    model.generate,
                    prompts,
                    keys=indices,
                    save=tmp_file,
                    nproc=nproc,
                    chunksize=nproc
                )
            score_map = load(tmp_file)
            data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
            rejected = [x for x in score_map.values() if FAIL_MSG in x]
            # Non-integer judge outputs are counted as -1 (invalid).
            data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ import pandas as pd
3
+ from abc import abstractmethod
4
+ from ..smp import *
5
+ from .image_base import ImageBaseDataset
6
+
7
+
8
class MMGenBench(ImageBaseDataset):
    """Inference-only wrapper for MMGenBench.

    The model is asked to produce a Text-to-Image "caption-prompt" for each
    input image; scoring happens outside VLMEvalKit (see the MMGenBench repo).
    """

    # Single shared instruction prompt; the string content is deliberately
    # unindented so the model receives it verbatim.
    prompt_list = [
        """
# Role
You are an expert in the field of image understanding, focusing on the \
understanding of images and generating the image caption-prompt.

# Definition Explanation
image caption-prompt: Refers to the caption or description of an image, \
used to provide to a Text-to-Image model to generate a new image.
Text-to-Image model: Can generate a new image based on the provided image \
caption-prompt, such as stable diffusion 3, flux, and other image generation models.

# Task Description
Generate an image caption-prompt based on the input image.

# Key Points and Requirements
1. Accurately understand the input image and precisely generate an image caption-prompt.
2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \
Text-to-Image model to generate a new image that is as consistent as possible with the input image.
3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model.
4. The generated image caption-prompt should describe the input image in as much \
detail as possible, and it should be between 20 to 60 words.

# Output Format
A string, that is the image caption-prompt. No extra output needed.
"""
    ]
    TYPE = 'GenerateImgPrompt'
    DATASET_URL = {
        'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv',
        'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv',
    }
    # Both splits use the same (and only) prompt.
    PROMPT_MAP = {
        'MMGenBench-Test': prompt_list[0],
        'MMGenBench-Domain': prompt_list[0],
    }
    DATASET_MD5 = {
        'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da",
        'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb",
    }

    def __init__(self, dataset='MMGenBench', **kwargs):
        """Load the dataset and warn that no in-kit evaluation is available."""
        super().__init__(dataset, **kwargs)
        warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')

    def load_data(self, dataset):
        """Load the TSV and inject the fixed generation prompt as `question` when absent."""
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                self.PROMPT_MAP[dataset]
            )] * len(data)
        return data

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    # NOTE(review): @abstractmethod on a concrete class is inert unless the
    # base uses ABCMeta; here it only documents "not implemented" intent.
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Unsupported: MMGenBench must be scored with the external toolkit."""
        warnings.warn('This evaluation method is not supported.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
        return None
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import math
3
+ from urllib.request import urlopen
4
+ from PIL import Image, ImageDraw, ImageFont
5
+ import torchvision.transforms as transforms
6
+
7
+ from vlmeval.dataset.utils import build_judge, levenshtein_distance
8
+ from vlmeval.smp import *
9
+ from .image_base import ImageBaseDataset
10
+
11
+ FAIL_MSG = 'Failed to obtain answer via API.'
12
+
13
+
14
def get_gpt4_ICE():
    """Return the four in-context examples appended to the GPT-4 extraction prompt.

    Each example shows a Question, a free-form Analysis, and the expected
    `Extracted answer:` / `Answer format:` footer the judge must emit.
    The literal text (including the original typos such as 'servife') is part
    of the prompt contract and must not be edited.
    """
    example_1 = """
---
Question: List the primary questions asked about the services in this report.
Analysis: The primary questions asked about the services in the report for The Limes Residential Home are:\n\n
1. Is the service safe?\n
2. Is the service effective?\n
3. Is the service caring?\n
4. Is the service responsive?\n
5. Is the service well-led?
Extracted answer: [
'Is the servife safe?',
'Is the service effective',
'Is the serve caring?',
'Is the service responsive?',
'Is the service well-led?'
]
Answer format: List\n
"""

    example_2 = """
---
Question: How many regulations of the HSCA 2008 are breached in all according to this report?
Analysis: According to the report, the provider breached 10 Health and Social Care Act 2008 (Regulated Activities)
Regulations in total. Here are the specifics:\n\n1. Regulation 13: Safeguarding service users from abuse and
improper treatment\n2. Regulation 12: Safe care and treatment\n3. Regulation 18: Staffing\n4. Regulation 11:
Need for consent\n5. Regulation 10: Dignity and respect\n6. Regulation 9: Person-centred care\n7. Regulation 17:
Good governance\n8. Regulation 18 (CQC Registration Regulations 2009): Notification of other incidents\n9.
Regulation 18: Failure to maintain an accurate and up-to-date care plan\n10. Regulation 11: Failure to implement
the Mental Capacity Act 2005 code of practice effectively\n\nThese breaches involve issues concerning staffing,
safeguarding, medicines management, dignity and respect, consent, care planning, governance, and failure to
notify the CQC of incidents.
Extracted answer: 10
Answer format: Integer\n
"""

    example_3 = """
---
Question: According to the survey that is the percentage of Chinese who are paying more or
about the same attention to politics after Trump's election?
Analysis: The survey provided does not specify the percentage of Chinese individuals specifically who are paying
more or about the same attention to politics after Trump's election. The report focuses primarily on American
demographics and does not include specific details about the Chinese population in relation to this question. If
you need information about a different demographic or a summary of the findings from the American demographic,
I can certainly help with that!
Extracted answer: Not answerable
Answer format: String\n
"""

    example_4 = """
---
Question: How many quotations from male respondent over 50 years old are included in this report?
Analysis: The image you've provided appears to be a screenshot of a document with multiple charts. However, the
text is too small and blurry to read accurately. If you can provide a clearer image or more context, I might be
able to help you with your question.
Extracted answer: Fail to answer
Answer format: String\n
"""

    return [example_1, example_2, example_3, example_4]
74
+
75
+
76
def build_mmlongbench_gpt4_prompt(line):
    """Assemble the GPT-4 answer-extraction prompt for one sample.

    Layout: task description + the four in-context examples + this sample's
    question and free-form model analysis. `line` must expose 'question' and
    'prediction'.
    """
    task_description = """
Given the question and analysis, you are tasked to extract answers with required formats from the free-form analysis.
- Your extracted answers should be one of the following formats: (1) Integer, (2) Float, (3) String and (4) List.
If you find the analysis the question can not be answered from the given documents, type "Not answerable".
Exception: If the analysis only tells you that it can not read/understand the images or documents,
type "Fail to answer".
- Please make your response as concise as possible. Also note that your response should be formatted as below:
```
Extracted answer: [answer]
Answer format: [answer format]
```
Please read the following example, then extract the answer from the model response
and type it at the end of the prompt.\n
"""
    # Build the prompt as a list of parts and join once, instead of repeated
    # string concatenation.
    parts = [task_description]
    parts.extend(get_gpt4_ICE())
    parts.append('---\nQuestion:' + line['question'] + '\n')
    parts.append('Analysis: ' + str(line['prediction']))
    return ''.join(parts)
100
+
101
+
102
def anls_compute(groundtruth, prediction, threshold=0.5):
    """Average Normalized Levenshtein Similarity between two strings.

    Similarities at or below `threshold` are clamped to 0.0, per the
    standard ANLS definition.
    """
    edit_dist = levenshtein_distance(groundtruth, prediction)
    longest = max(len(groundtruth.upper()), len(prediction.upper()))
    normalized = float(edit_dist) / float(longest) if longest != 0 else 0.0
    similarity = 1.0 - normalized
    return 0.0 if similarity <= threshold else similarity
110
+
111
+
112
def is_float_equal(reference, prediction, include_percentage: bool = False, is_close: bool = False) -> bool:
    """Compare two numeric answers, optionally allowing percentage scaling.

    Args:
        reference: ground-truth value; must be float-convertible (a failure
            here propagates, matching the original contract).
        prediction: model answer; non-numeric predictions return False.
        include_percentage: also accept reference/100 and reference*100.
        is_close: additionally accept values within 1% relative tolerance
            (fix: annotated as bool — the original annotation said float).

    Returns:
        True if prediction matches any accepted form of reference.
    """
    def get_precision(gt_ans: float) -> int:
        # Number of decimal digits in the value's string form; defaults to 3.
        precision = 3
        if '.' in str(gt_ans):
            precision = len(str(gt_ans).split('.')[-1])
        return precision

    reference = float(str(reference).strip().rstrip('%').strip())
    try:
        prediction = float(str(prediction).strip().rstrip('%').strip())
    except (ValueError, TypeError):
        # Fix: the original bare `except:` also caught SystemExit /
        # KeyboardInterrupt; only conversion failures should mean "no match".
        return False

    if include_percentage:
        gt_result = [reference / 100, reference, reference * 100]
    else:
        gt_result = [reference]
    for item in gt_result:
        try:
            if is_close:
                if math.isclose(item, prediction, rel_tol=0.01):
                    return True
            # Round both sides to the coarser of their displayed precisions
            # (never fewer than 2 decimals) before exact comparison.
            precision = max(min(get_precision(prediction), get_precision(item)), 2)
            if round(prediction, precision) == round(item, precision):
                return True
        except Exception:
            continue
    return False
140
+
141
+
142
def get_clean_string(s):
    """Normalize an answer string for comparison.

    Lower-cases, removes unit suffixes (miles/mile/million), parenthesized
    asides, surrounding quotes, a leading '$', and a trailing '%'.
    """
    s = str(s).lower().strip()
    # Fix: the original wrote `s.rstrip('mile').strip()` without assigning the
    # result (a no-op), and `rstrip` removes a trailing *character set*, not a
    # suffix. Check the longer suffix first so 'miles' wins over 'mile'.
    for unit in ('miles', 'mile', 'million'):
        if s.endswith(unit):
            s = s[:-len(unit)].strip()
            break
    # remove parenthesis
    s = re.sub(r'\s*\([^)]*\)', '', s).strip()
    # remove quotes
    s = re.sub(r"^['\"]|['\"]$", '', s).strip()
    s = s.strip().lstrip('$').strip()
    s = s.strip().rstrip('%').strip()
    return s
157
+
158
+
159
def is_exact_match(s):
    """Return True when `s` belongs to a category that must match verbatim.

    Verbatim categories (no fuzzy ANLS scoring): URLs, code filenames, page
    references, telephone-like numbers, times of day, ISO-ish dates
    (YYYY-MM-DD / YYYY-MM), and e-mail addresses.
    """
    # Website
    if 'https://' in s:
        return True
    # code file
    if s.endswith(('.py', 'ipynb')):
        return True
    if s.startswith('page'):
        return True
    # time
    if 'a.m.' in s or 'p.m.' in s:
        return True
    verbatim_patterns = (
        r'\b\d+(-\d+|\s\d+)?\b',                              # telephone number
        r'\b\d{4}[-\s]\d{2}[-\s]\d{2}\b',                     # YYYY-MM-DD
        r'\b\d{4}[-\s]\d{2}\b',                               # YYYY-MM
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',    # Email address
    )
    return any(re.fullmatch(pattern, s) for pattern in verbatim_patterns)
185
+
186
+
187
def isfloat(num):
    """Return True if `num` can be converted to float, False otherwise.

    Fix: also catch TypeError — `float(None)` or `float([])` raise TypeError,
    which the original ValueError-only handler let escape and crash the caller.
    """
    try:
        float(num)
        return True
    except (ValueError, TypeError):
        return False
193
+
194
+
195
def get_font():
    """Fetch the SimHei TTF used for page-index labels; fall back on failure.

    Performs network I/O on every call (no caching); any failure — download
    or font parsing — falls back to Pillow's bundled default font at the
    same size, so callers always get a usable font object.
    """
    try:
        truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
        ff = urlopen(truetype_url)
        font = ImageFont.truetype(ff, size=40)
    except Exception as e:
        logging.warning(f'{type(e)}: {e}')
        logging.warning("Fail to download the font. Use the default one.")
        font = ImageFont.load_default(size=40)
    return font
205
+
206
+
207
def frame2img(img_path_list, font, save_path=None, idx_start=0):
    """Stitch page images into one labelled composite image.

    Each page is resized so its longer side is 1120 px, then pages are pasted
    in sequence with an "<IMAGE k>" text header (k starting at `idx_start`)
    and a separating line between consecutive pages.

    NOTE(review): the `if w > h` orientation test below reuses `w`/`h` leaked
    from the LAST iteration of the resize loop, so the stack direction
    (vertical vs horizontal) is decided by the final page's aspect ratio —
    confirm this is intended before refactoring.
    """
    imgs = [Image.open(img_path) for img_path in img_path_list]

    # Resize every page so the longer edge is 1120 px, preserving aspect ratio.
    new_imgs = []
    for img in imgs:
        w, h = img.size
        scale = w / h
        if w > h:
            new_w = 560 * 2
            new_h = int(560 * 2 / scale)
        else:
            new_w = int(560 * 2 * scale)
            new_h = 560 * 2
        img = transforms.functional.resize(img, [new_h, new_w],)
        new_imgs.append(img)
    imgs = new_imgs
    new_w = 0
    new_h = 0
    pad = 40  # vertical space reserved for the "<IMAGE k>" label
    if w > h:
        # Landscape pages: stack vertically, one page per row.
        for im in imgs:
            w, h = im.size
            new_w = max(new_w, w)
            new_h += h + 10 + pad
        new_img = Image.new("RGB", (new_w, new_h), "white")
        draw = ImageDraw.Draw(new_img)
        curr_h = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (0, pad + curr_h))
            draw.text((0, curr_h), f"<IMAGE {idx+idx_start}>", font=font, fill="black")
            if idx + 1 < len(imgs):
                draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
            curr_h += h + 10 + pad
    else:
        # Portrait pages: lay out horizontally, one page per column.
        for im in imgs:
            w, h = im.size
            new_w += w + 10
            new_h = max(new_h, h)
        new_h += pad
        new_img = Image.new('RGB', (new_w, new_h), 'white')
        draw = ImageDraw.Draw(new_img)
        curr_w = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (curr_w, pad))
            draw.text((curr_w, 0), f"<IMAGE {idx+idx_start}>", font=font, fill='black')
            if idx + 1 < len(imgs):
                draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
            curr_w += w + 10

    if save_path is not None:
        new_img.save(save_path)

    return new_img
262
+
263
+
264
def concat_images(image_list, max_concat=1, column_num=1):
    """Group page-image paths into at most `max_concat` composite images.

    Two modes:
    * column_num == -1: let `frame2img` lay out labelled pages, raising
      `max_concat` until each batch holds at most 20 pages.
    * otherwise: paste raw pages into a plain grid with `column_num` columns
      (column_num == 1 gives a single vertical strip).

    Returns a list of PIL images (one per batch).
    NOTE(review): grid mode sizes the canvas from the FIRST page's
    width/height, so pages of differing sizes may overlap or leave gaps.
    """
    concatenated_images = []
    if column_num == -1:
        MAX_COLUMN_NUM = 20
        max_concat = 1
        # Grow max_concat until every batch has <= MAX_COLUMN_NUM pages.
        while len(image_list) / max_concat > MAX_COLUMN_NUM:
            max_concat += 1
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = image_list[i:i + interval]
            concatenated_image = frame2img(batch_images, font=get_font(), idx_start=i)
            concatenated_images.append(concatenated_image)
    else:
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = [Image.open(filename) for filename in image_list[i:i + interval]]
            if column_num == 1:
                total_height = batch_images[0].height * len(batch_images)
            else:
                total_height = batch_images[0].height * ((len(batch_images) - 1) // column_num + 1)
            concatenated_image = Image.new('RGB', (batch_images[0].width * column_num, total_height), 'white')

            # Row-major paste: advance x per page, wrap to a new row every
            # column_num pages.
            x_offset, y_offset = 0, 0
            for count, image in enumerate(batch_images):
                concatenated_image.paste(image, (x_offset, y_offset))
                x_offset += image.width
                if (count + 1) % column_num == 0:
                    y_offset += image.height
                    x_offset = 0
            concatenated_images.append(concatenated_image)
    return concatenated_images
295
+
296
+
297
def eval_score(gt, pred, answer_type):
    """Score one prediction against ground truth; returns a float in [0, 1].

    Scoring by `answer_type`:
    * 'Int'   — exact integer match.
    * 'Float' — numeric match with percentage scaling and 1% tolerance.
    * 'Str'   — exact match for verbatim categories, ANLS otherwise.
    * other   — list answers: element-wise comparison after sorting; joined
      exact match for numeric/verbatim elements, min ANLS otherwise.

    Fixes vs the original: bare `except:` narrowed to `except Exception:`
    (no longer swallows SystemExit/KeyboardInterrupt) and two leftover debug
    `print` calls removed.
    """
    if answer_type == 'Int':
        try:
            gt, pred = int(gt), int(float(pred))
        except Exception:
            pred = ''
        score = (gt == pred)
    elif answer_type == 'Float':
        try:
            gt = float(get_clean_string(str(gt)))
            pred = float(get_clean_string(str(pred)))
        except Exception:
            pred = ''
        score = is_float_equal(gt, pred, include_percentage=True, is_close=True)
    elif answer_type == 'Str':
        gt = get_clean_string(gt)
        pred = get_clean_string(pred)
        if is_exact_match(gt):
            score = (gt == pred)
        else:
            score = anls_compute(gt, pred)
    else:
        # SECURITY NOTE: `eval` executes arbitrary expressions from the
        # dataset / judge output. These strings are semi-trusted here, but
        # ast.literal_eval would be the safe replacement for list literals.
        if isinstance(gt, str) and gt.startswith('['):
            gt = eval(gt)
        if not isinstance(gt, list):
            gt = [gt]
        if isinstance(pred, str) and pred.startswith('['):
            pred = eval(pred)
        if not isinstance(pred, list):
            pred = [pred]
        if len(gt) != len(pred):
            score = 0.0
        else:
            gt = sorted([get_clean_string(a) for a in gt])
            pred = sorted([get_clean_string(a) for a in pred])
            if isfloat(gt[0]) or is_exact_match(gt[0]):
                score = ('-'.join(gt) == '-'.join(pred))
            else:
                score = min([anls_compute(gt_v, pred_v) for gt_v, pred_v in zip(gt, pred)])

    return float(score)
340
+
341
+
342
def MMLongBench_auxeval(model, line):
    """Ask the judge model to extract a structured answer for one sample.

    Retries up to 5 times with increasing temperature (0.0, 0.5, ...). On the
    first non-failing response, the answer is parsed from between the
    'Extracted answer:' and 'Answer format:' markers.

    Returns:
        dict with keys 'log' (retry trace), 'res' (raw judge output, '' on
        total failure) and 'pred' (extracted answer, '' if parsing failed).
    """
    prompt = build_mmlongbench_gpt4_prompt(line)
    log = ''
    retry = 5

    for i in range(retry):
        prediction = line['prediction']
        res = model.generate(prompt, temperature=i * 0.5)

        if FAIL_MSG in res:
            log += f'Try {i}: output is {prediction}, failed to parse.\n'
        else:
            log += 'Succeed'
            # Take the text after 'Extracted answer:' and before 'Answer format:'.
            try:
                pred = res.split('Answer format:')[0].split('Extracted answer:')[1].strip()
            except:
                pred = ''
            return dict(log=log, res=res, pred=pred)
    log += 'All 5 retries failed.\n'
    return dict(log=log, res='', pred='')
362
+
363
+
364
def get_f1(data):
    """F1 over answerability: recall on ground-truth-answerable rows,
    precision on predicted-answerable rows.

    Expects a DataFrame with 'answer', 'pred' and 'score' columns, where
    'Not answerable' marks the negative class.

    Fix: the original divided unconditionally, raising ZeroDivisionError when
    either slice was empty or when recall + precision == 0; those cases now
    return 0.0.
    """
    gt_pos_data = data[data.apply(lambda k: k['answer'] != 'Not answerable', axis=1)]
    pred_pos_data = data[data.apply(lambda k: k['pred'] != 'Not answerable', axis=1)]
    recall = sum(gt_pos_data['score'].tolist()) / len(gt_pos_data) if len(gt_pos_data) else 0.0
    precision = sum(pred_pos_data['score'].tolist()) / len(pred_pos_data) if len(pred_pos_data) else 0.0
    if recall + precision == 0:
        return 0.0
    return 2 * recall * precision / (recall + precision)
370
+
371
+
372
def MMLongBench_acc(result_file):
    """Score a judged result file and aggregate per-category accuracy.

    Writes per-row scores back into `result_file`, then slices by evidence
    source (text/layout/table/chart/image) and by number of evidence pages
    (single/multi/unanswerable).

    Returns:
        DataFrame with columns 'category', 'num', 'avg_score'.
    NOTE(review): `eval` is applied to the 'evidence_sources' /
    'evidence_pages' columns — these are trusted dataset fields, but
    ast.literal_eval would be safer. Also note a row may belong to several
    evidence-source slices at once, so the per-category counts can overlap.
    """
    data = load(result_file)
    overall_score = 0.0
    score_list = list()
    for i in range(len(data)):
        item = data.iloc[i]
        # Any scoring failure counts as 0 for that row.
        try:
            score = eval_score(item['answer'], item['pred'], item['answer_format'])
        except:
            score = 0.0
        score_list.append(score)
        overall_score += score

    data['score'] = score_list
    dump(data, result_file)

    data_chart = data[data.apply(lambda k: 'Chart' in eval(k['evidence_sources']), axis=1)]
    data_table = data[data.apply(lambda k: 'Table' in eval(k['evidence_sources']), axis=1)]
    data_image = data[data.apply(lambda k: 'Figure' in eval(k['evidence_sources']), axis=1)]
    data_text = data[data.apply(lambda k: 'Pure-text (Plain-text)' in eval(k['evidence_sources']), axis=1)]
    data_layout = data[data.apply(lambda k: 'Generalized-text (Layout)' in eval(k['evidence_sources']), axis=1)]

    data_single = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 1, axis=1)]
    data_multi = data[data.apply(lambda k: len(eval(k['evidence_pages'])) > 1, axis=1)]
    data_unans = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 0, axis=1)]

    res = dict()
    res['category'] = [
        'overall_f1', 'overall_acc', 'text', 'layout', 'table', 'chart',
        'image', 'single-page', 'multi-page', 'unanswerable'
    ]
    res['num'] = [
        len(data), len(data), len(data_text), len(data_layout), len(data_table),
        len(data_chart), len(data_image), len(data_single), len(data_multi), len(data_unans)
    ]
    res['avg_score'] = [
        get_f1(data),
        overall_score / len(data),
        sum(data_text['score'].tolist()) / len(data_text) if len(data_text) > 0 else 0.0,
        sum(data_layout['score'].tolist()) / len(data_layout) if len(data_layout) > 0 else 0.0,
        sum(data_table['score'].tolist()) / len(data_table) if len(data_table) > 0 else 0.0,
        sum(data_chart['score'].tolist()) / len(data_chart) if len(data_chart) > 0 else 0.0,
        sum(data_image['score'].tolist()) / len(data_image) if len(data_image) > 0 else 0.0,
        sum(data_single['score'].tolist()) / len(data_single) if len(data_single) > 0 else 0.0,
        sum(data_multi['score'].tolist()) / len(data_multi) if len(data_multi) > 0 else 0.0,
        sum(data_unans['score'].tolist()) / len(data_unans) if len(data_unans) > 0 else 0.0,
    ]
    res = pd.DataFrame(res)
    return res
421
+
422
+
423
class MMLongBench(ImageBaseDataset):
    """MMLongBench-DOC: long-document VQA over multi-page PDFs.

    Pages are rendered from PDF (or reused from cache) and, for non-API
    models, concatenated into a small number of composite images according
    to the per-model (concat_num, column_num) settings in SUPPORTED_MODELS.
    """

    TYPE = 'VQA'

    DATASET_URL = {
        'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
    }
    DATASET_MD5 = {
        'MMLongBench_DOC': '9b393e1f4c52718380d50586197eac9b',
    }

    # model name -> (concat_num, column_num) used when building composites;
    # column_num == -1 means "labelled auto layout" (see concat_images).
    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
        'XComposer2_4KHD': (1, 5),
        'XComposer2d5': (1, -1),
    }

    def __init__(self, dataset, **kwargs):
        """Validate the evaluated model name and cache its layout settings.

        Raises:
            AssertionError: if kwargs['model'] is not in SUPPORTED_MODELS.
        """
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on MMLongBench_DOC.".format(model_name))
        super(MMLongBench, self).__init__(dataset)

        # GPT4* models consume pages via the API and skip image concatenation.
        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def dump_image(self, origin_line):
        """Materialize the sample's page images on disk and return their paths.

        If every page image is already cached, PDF parsing is skipped;
        otherwise the base64 PDF in line['image'] is rendered page by page
        (capped at self.max_pages, 144 dpi) via PyMuPDF. For non-API models
        the pages are then merged into composite images.
        """
        os.makedirs(self.img_root, exist_ok=True)
        try:
            import fitz
        except Exception as e:
            logging.critical(f'{type(e)}: {e}')
            logging.critical('Please use `pip install pymupdf` to parse PDF files.')

        line = origin_line.copy()
        line['image_path'] = line['image_path'][:self.max_pages]
        skip_pdf_parse = True
        for im_name in line['image_path']:
            path = osp.join(self.img_root, im_name)
            if not read_ok(path):
                skip_pdf_parse = False
                break

        # Just for being compatible with the zooped loop: zip(line['image'], line['image_path'])
        if skip_pdf_parse:
            line['image'] = line['image_path']
        else:
            pdf_data = base64.b64decode(line['image'])
            pdf_file = io.BytesIO(pdf_data)
            encoded_images = []
            with fitz.open(stream=pdf_file, filetype='pdf') as doc:
                doc = doc[:self.max_pages]
                for page in doc:
                    image = page.get_pixmap(dpi=144)
                    image_file = io.BytesIO(image.tobytes(output='png'))
                    image = Image.open(image_file)
                    encoded_image = encode_image_to_base64(image)
                    encoded_images.append(encoded_image)
            line['image'] = encoded_images
            print('process {}'.format(line['doc_id']))

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            # Derive cache filenames for the composites from the first page's name.
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all_{}.jpg'.format(i)
                    for i in range(len(concatenated_images))
                ]

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    # NOTE(review): declared @classmethod but the first parameter is named
    # `self`; it receives the class object. Also `model` is rebound from the
    # judge *name* (str) to the judge *object* below — confusing but working.
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions with GPT-4, score them, and dump a CSV summary.

        NOTE(review): when `tmp_file` holds cached judge answers, `new_results`
        only covers the uncached rows but is zipped against ALL indices
        (`all_inds`) — cached runs appear to misalign/KeyError. Confirm before
        relying on resume-from-pkl.
        """
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
        else:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = list()
                for model, line in tqdm(tups):
                    res = MMLongBench_auxeval(model, line)
                    new_results.append(res)

                log_map, res_map, pred_map = {}, {}, {}
                all_inds = [line['index'] for line in lines]
                for k, v in zip(all_inds, new_results):
                    log_map[k] = v['log']
                    res_map[k] = v['res']
                    pred_map[k] = v['pred']
                data['res'] = [res_map[idx] for idx in data['index']]
                data['log'] = [log_map[idx] for idx in data['index']]
                data['pred'] = [pred_map[idx] for idx in data['index']]
                dump(data, storage)

        score = MMLongBench_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import sympy as sp
4
+ import numpy as np
5
+ from sympy import simplify, Eq, sympify, Pow, pi
6
+ from sympy.parsing.latex import parse_latex
7
+ import sys
8
+ import math
9
+ import os
10
+ import argparse
11
+
12
+ from .image_base import ImageBaseDataset
13
+ from ..utils import track_progress_rich
14
+ from ..smp import load, dump
15
+
16
+
17
+ class AutoScoringJudge:
18
    def __init__(self):
        """Initialize the LaTeX-normalization table and default tolerance."""
        # Map of special symbols to their replacements
        # (presumably applied by a `preprocess` method defined elsewhere in
        # this class — not visible here; commented-out entries were
        # deliberately disabled upstream and are kept for reference).
        self.special_signal_map = {
            "\\left": "",
            "\\right": "",
            "厘米":"",  # drops the Chinese unit suffix for "centimeters"
            # "∶": ":",
            ",": ",",  # full-width comma -> ASCII comma
            "$": "",
            "(":"(",  # full-width parens -> ASCII parens
            ")":")",
            "\\infty":"oo",
            "\\colon ":":",
            # "\\approx": "=",
            # "\\simeq": "=",
            # "\\sim": "=",
            # "^\\prime": "'",
            # "^{\\prime}": "'",
            "+":"+",  # full-width plus -> ASCII plus
            "\\, ": "",
            "\\,":"",
            "^\\circ": "",
            "^{\\circ}": "",
            # "%": "",
        }
        # Symbolic pi parsed from LaTeX, substituted numerically by sympy_sub_pi.
        self.pi = parse_latex("\\pi")
        # MM-Math default precision
        self.precision = 1e-2
46
+
47
+ def trans_greater_sign_to_interval(self, expr:str):
48
+ expr_tmp = expr.split("<")
49
+ return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"
50
+
51
+ def split_by_comma(self, expr: str):
52
+ # Splits expressions by commas outside of brackets
53
+ in_bracket_num = 0
54
+ splitted_expr = []
55
+ start_idx = 0
56
+ for i, char in enumerate(expr):
57
+ if char in ["(", "["]:
58
+ in_bracket_num += 1
59
+ elif char in [")", "]"]:
60
+ in_bracket_num -= 1
61
+ elif char == "," and in_bracket_num == 0:
62
+ splitted_expr.append(expr[start_idx:i].strip())
63
+ start_idx = i + 1
64
+
65
+ if start_idx < len(expr):
66
+ splitted_expr.append(expr[start_idx:].strip())
67
+
68
+ return splitted_expr
69
+
70
+ def trans_plus_minus_sign(self, expr_list: list):
71
+ # Translates plus-minus signs into separate expressions
72
+ new_expr_list = []
73
+ for expr in expr_list:
74
+ if "\\pm" in expr:
75
+ new_expr_list.append(expr.replace("\\pm", "+"))
76
+ new_expr_list.append(expr.replace("\\pm", "-"))
77
+ else:
78
+ new_expr_list.append(expr)
79
+
80
+ return new_expr_list
81
+
82
    def judge(self, expression1, expression2, precision=1e-2):
        """Top-level equivalence check between ground truth and prediction.

        Pipeline: preprocess (defined elsewhere in this class) -> strip
        Chinese text -> rewrite double inequalities as intervals -> split on
        top-level commas -> expand '\\pm' -> greedily pair elements with
        is_equal. `precision` may be a scalar or a per-element list.

        NOTE(review): self.precision is mutated per element as a side channel
        read by is_equal/numerical_equal, and temp_list1/temp_list2/precision
        are mutated while being iterated via a modulo index — behavior is
        order-sensitive; kept verbatim.
        """
        # Judge if two expressions are equal (expression1 is considered as the Ground Truth)
        # Default precision is a list for supporting multiple expressions
        precision = precision if isinstance(precision, list) else [precision]

        try:
            expression1, expression2 = self.preprocess(expression1, expression2)
        except:
            return False
        if expression1 == expression2:
            # print("Exactly equal")
            return True

        # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
        expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1)  # noqa: E501
        expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2)  # noqa: E501
        # Check if two < or > in expression
        if self.is_two_greater_sign(expression1):
            expression1 = self.trans_greater_sign_to_interval(expression1)

        if self.is_two_greater_sign(expression2):
            expression2 = self.trans_greater_sign_to_interval(expression2)

        expression1 = self.split_by_comma(expression1)
        expression2 = self.split_by_comma(expression2)

        temp_list1 = self.trans_plus_minus_sign(expression1)
        temp_list2 = self.trans_plus_minus_sign(expression2)

        # Set up a list for allowed errors
        if len(precision) <= 1:
            precision = precision * len(temp_list1)

        if len(temp_list1) != len(temp_list2):
            return False

        # Check if elements in both lists can be paired and are equal
        idx = -1
        while len(temp_list1) != 0:
            idx = (idx + 1) % len(temp_list1)

            item1 = temp_list1[idx]
            self.precision = precision[idx]

            for item2 in temp_list2:
                if self.is_equal(item1, item2):
                    temp_list1.remove(item1)
                    temp_list2.remove(item2)
                    precision.remove(self.precision)
                    break
            else:
                # If no match was found, return False
                return False

        # If all elements are matched, return True
        return True
138
+
139
+ def is_interval(self, expr):
140
+ # Checks if an expression is an interval
141
+ return expr.startswith(("(", "[")) and expr.endswith((")", "]"))
142
+
143
+ def is_two_greater_sign(self, expr):
144
+ match = re.findall(r'<', expr)
145
+ return len(match) == 2
146
+
147
+ def sympy_sub_pi(self, expression_sympy):
148
+ # Replaces the symbol for pi in sympy expressions with its numerical value
149
+ return expression_sympy.subs(self.pi, math.pi)
150
+
151
+ def is_equal(self, expression1, expression2):
152
+ # Default first expression is ground truth. Check if expressions are equal in different aspects
153
+ if expression1 == expression2 and expression1 != "" and expression2 != "":
154
+ # print("Equivalent natively")
155
+ return True
156
+
157
+ # First check if both are intervals
158
+ if self.is_interval(expression1) and self.is_interval(expression2):
159
+ try:
160
+ if self.interval_equal(expression1, expression2):
161
+ # print("Interval equivalent")
162
+ return True
163
+ except:
164
+ return False
165
+
166
+ # Then check for numerical equality
167
+ try:
168
+ if self.numerical_equal(expression1, expression2):
169
+ # print("Numerically equivalent")
170
+ return True
171
+ except:
172
+ pass
173
+ # Then check if expressions are mathematically equal
174
+ try:
175
+ if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
176
+ # print("Expression equivalent")
177
+ return True
178
+ except:
179
+ pass
180
+
181
+ # Lastly, check for equation equality
182
+ try:
183
+ if self.equation_equal(expression1, expression2):
184
+ # print("Equation equivalent")
185
+ return True
186
+ except:
187
+ pass
188
+
189
+ return False
190
+
191
+ def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
192
+ # Check if two numerical values are equal within an allowed error range
193
+ # Includes possible percentage cases
194
+ reference = float(expression1)
195
+ prediction = float(expression2)
196
+
197
+ if include_percentage:
198
+ gt_result = [reference / 100, reference, reference * 100]
199
+ else:
200
+ gt_result = [reference]
201
+
202
+ for item in gt_result:
203
+ if abs(item - prediction) <= self.precision * 1.01:
204
+ return True
205
+ return False
206
+
207
+ def expression_equal(self, exp1, exp2):
208
+ # Check if two expressions are mathematically equivalent
209
+ # Extract expression and use sympy for equivalence checking
210
+ def extract_expression(expression):
211
+ if "=" in expression:
212
+ expression = expression.split("=")[1]
213
+ return expression.strip()
214
+
215
+ exp1 = extract_expression(exp1)
216
+ exp2 = extract_expression(exp2)
217
+
218
+ exp_too_long = len(exp1) > 300 or len(exp2) > 300
219
+
220
+ expr1_sym = sympify(parse_latex(exp1))
221
+ expr2_sym = sympify(parse_latex(exp2))
222
+ if expr1_sym == expr2_sym:
223
+ return True
224
+ else:
225
+ expr1_sym = self.sympy_sub_pi(expr1_sym)
226
+ expr2_sym = self.sympy_sub_pi(expr2_sym)
227
+
228
+ if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
229
+ (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
230
+ return False
231
+ elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
232
+ try:
233
+ if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
234
+ print("These two numbers cannot be calculated by the current computer for: "
235
+ f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
236
+ return False
237
+ if exp_too_long:
238
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
239
+ return False
240
+ if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
241
+ return True
242
+ else:
243
+ return False
244
+ except:
245
+ return False
246
+ elif exp_too_long:
247
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
248
+ return False
249
+ else:
250
+ try:
251
+ simplified_expr = simplify(expr1_sym - expr2_sym)
252
+ num_value = simplified_expr.evalf()
253
+ return abs(num_value) < 1e-3
254
+ except:
255
+ return False
256
+
257
+ def equation_equal(self, expression1, expression2):
258
+ # Check if two equations are mathematically equivalent
259
+ # Simplify equations and use sympy for equivalence checking
260
+ def simplify_equation(latex_eq):
261
+ lhs, rhs = latex_eq.split('=')
262
+
263
+ lhs_expr = parse_latex(lhs)
264
+ rhs_expr = parse_latex(rhs)
265
+
266
+ equation = Eq(lhs_expr, rhs_expr)
267
+
268
+ simplified_eq = simplify(equation.lhs - equation.rhs)
269
+
270
+ return simplified_eq
271
+
272
+ expr1_sym = simplify_equation(expression1)
273
+ expr2_sym = simplify_equation(expression2)
274
+
275
+ division_result_1 = simplify(expr1_sym / expr2_sym)
276
+ division_result_2 = simplify(expr2_sym / expr1_sym)
277
+
278
+ if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504
279
+ (division_result_2.is_Integer and division_result_2 != 0)):
280
+ return True
281
+ else:
282
+ return False
283
+
284
+ def interval_equal(self, expression1, expression2):
285
+ # Check if two intervals are mathematically equivalent
286
+ def compare_two_interval(inter1, inter2):
287
+ if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
288
+ return False
289
+
290
+ inter1 = inter1.strip('[]()')
291
+ inter2 = inter2.strip('[]()')
292
+
293
+ items_1 = inter1.split(',')
294
+ items_2 = inter2.split(',')
295
+
296
+ for item_1, item_2 in zip(items_1, items_2):
297
+ if not self.expression_equal(item_1, item_2):
298
+ return False
299
+ return True
300
+
301
+ interval1 = expression1
302
+ interval2 = expression2
303
+
304
+ if interval1 == interval2:
305
+ return True
306
+ else:
307
+ inter_list1 = interval1.split("\\cup")
308
+ inter_list2 = interval2.split("\\cup")
309
+
310
+ if len(inter_list1) != len(inter_list2):
311
+ return False
312
+ else:
313
+ for inter1, inter2 in zip(inter_list1, inter_list2):
314
+ if not compare_two_interval(inter1, inter2):
315
+ return False
316
+ return True
317
+
318
+ def preprocess(self, expression1, expression2):
319
+ # Preprocess expressions to extract and replace special symbols
320
+ def extract_boxed_content(latex_str):
321
+ boxed_matches = re.finditer(r'\\boxed{', latex_str)
322
+ results = ""
323
+
324
+ for match in boxed_matches:
325
+ start_index = match.end()
326
+ end_index = start_index
327
+ stack = 1
328
+
329
+ while stack > 0 and end_index < len(latex_str):
330
+ if latex_str[end_index] == '{':
331
+ stack += 1
332
+ elif latex_str[end_index] == '}':
333
+ stack -= 1
334
+ end_index += 1
335
+
336
+ if stack == 0:
337
+ content = latex_str[start_index:end_index - 1]
338
+ results += content + ","
339
+ else:
340
+ raise ValueError("Mismatched braces in LaTeX string.")
341
+
342
+ if results == "":
343
+ last_line_ans = latex_str.strip().split("\n")[-1]
344
+ dollar_pattern = r"\$(.*?)\$"
345
+ answers = re.findall(dollar_pattern, last_line_ans)
346
+
347
+ if answers:
348
+ for ans in answers:
349
+ results += ans + ","
350
+ else:
351
+ results = latex_str
352
+
353
+ return results
354
+
355
+ def sepcial_symbol_replace(expression):
356
+
357
+ expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501
358
+
359
+ expression = re.sub(r"(.+)m$", r"\1", expression)
360
+
361
+ if "\\in " in expression:
362
+ expression = expression.split("\\in ")[1]
363
+
364
+ for signal in self.special_signal_map:
365
+ expression = expression.replace(signal, self.special_signal_map[signal])
366
+
367
+ expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)
368
+
369
+ expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")
370
+
371
+ pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
372
+ expression = re.sub(pattern, r'\1', expression)
373
+
374
+ return expression
375
+
376
+ exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
377
+
378
+ exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)
379
+
380
+ return exp1, exp2
381
+
382
+ def can_compute_power(self, expr):
383
+ # Checks if a power expression can be computed
384
+ if isinstance(expr, Pow):
385
+ base, exp = expr.as_base_exp()
386
+ if base.is_number and exp.is_number:
387
+ MAX_EXP = 1000 # Adjust based on computing environment
388
+ if abs(exp.evalf()) > MAX_EXP:
389
+ return False
390
+ else:
391
+ return True
392
+ else:
393
+ return False
394
+ else:
395
+ return True # Not a power expression, can compute
396
+
397
+
398
class MMMath(ImageBaseDataset):
    """MM-Math free-form math VQA dataset, scored with AutoScoringJudge."""

    TYPE = 'VQA'

    DATASET_URL = {
        'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
    }
    DATASET_MD5 = {
        'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
    }

    @classmethod
    def evaluate(cls, eval_file, **kwargs):
        """Score the predictions in *eval_file* and return a dict of
        accuracies: overall and broken down by difficulty, year, and
        knowledge point (L1/L2). Also writes a ``*_score.json`` next to
        the evaluation file."""
        data = load(eval_file)
        judger = AutoScoringJudge()

        pairs = [dict(expression1=gt, expression2=pred)
                 for gt, pred in zip(data['answer'], data['prediction'])]
        # Judge every (ground-truth, prediction) pair in parallel.
        data['hit'] = track_progress_rich(judger.judge, pairs, nproc=16)
        dump(data, eval_file)

        score = {'overall': np.mean(data['hit'])}
        # Per-category accuracies, in the same section order as before.
        for column, label in (
            ('difficulty', 'Difficulty'),
            ('year', 'Year'),
            ('knowledge_l1', 'Knowledge-L1'),
            ('knowledge_l2', 'Knowledge-L2'),
        ):
            for value in set(data[column]):
                score[f'{label}-{value}'] = np.mean(data[data[column] == value]['hit'])

        score_file = eval_file.replace('.xlsx', '_score.json')
        dump(score, score_file)
        return score
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ from huggingface_hub import snapshot_download
3
+ from ..smp import *
4
+ from .video_base import VideoBaseDataset
5
+ from .utils import build_judge, DEBUG_MESSAGE
6
+ from ..utils import track_progress_rich
7
+ import torchvision.transforms as T
8
+ from torchvision import transforms
9
+ from torchvision.transforms.functional import InterpolationMode
10
+ from decord import VideoReader, cpu
11
+ import imageio
12
+ import cv2
13
+ import zipfile
14
+ import os
15
+ import glob
16
+ from .utils.mvbench import *
17
+
18
+ FAIL_MSG = 'Failed to obtain answer via API.'
19
+
20
+
21
class MVBench(VideoBaseDataset):
    """MVBench multiple-choice video QA benchmark (OpenGVLab/MVBench).

    Each of the 20 task types maps to (annotation json, data prefix inside
    the dataset repo, media type 'video'/'frame', whether the clip is
    bounded by start/end timestamps).
    """

    MD5 = 'fd21d36522cdedd46d84dc46715ad832'
    SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""

    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MVBench', nframe=0, fps=-1):
        # task name -> (annotation file, data prefix, media type, has bound)
        self.type_data_list = {
            'Action Sequence': ('action_sequence.json',
                                'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Action Prediction': ('action_prediction.json',
                                  'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Action Antonym': ('action_antonym.json',
                               'your_data_path/ssv2_video/', 'video', False),
            'Fine-grained Action': ('fine_grained_action.json',
                                    'your_data_path/Moments_in_Time_Raw/videos/', 'video', False),
            'Unexpected Action': ('unexpected_action.json',
                                  'your_data_path/FunQA_test/test/', 'video', False),
            'Object Existence': ('object_existence.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'Object Interaction': ('object_interaction.json',
                                   'your_data_path/star/Charades_v1_480/', 'video', True),  # has start & end
            'Object Shuffle': ('object_shuffle.json',
                               'your_data_path/perception/videos/', 'video', False),
            'Moving Direction': ('moving_direction.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'Action Localization': ('action_localization.json',
                                    'your_data_path/sta/sta_video/', 'video', True),  # has start & end
            'Scene Transition': ('scene_transition.json',
                                 'your_data_path/scene_qa/video/', 'video', False),
            'Action Count': ('action_count.json',
                             'your_data_path/perception/videos/', 'video', False),
            'Moving Count': ('moving_count.json',
                             'your_data_path/clevrer/video_validation/', 'video', False),
            'Moving Attribute': ('moving_attribute.json',
                                 'your_data_path/clevrer/video_validation/', 'video', False),
            'State Change': ('state_change.json',
                             'your_data_path/perception/videos/', 'video', False),
            'Fine-grained Pose': ('fine_grained_pose.json',
                                  'your_data_path/nturgbd/', 'video', False),
            'Character Order': ('character_order.json',
                                'your_data_path/perception/videos/', 'video', False),
            'Egocentric Navigation': ('egocentric_navigation.json',
                                      'your_data_path/vlnqa/', 'video', False),
            'Episodic Reasoning': ('episodic_reasoning.json',
                                   'your_data_path/tvqa/frames_fps3_hq/', 'frame', True),  # has start & end, read frame
            'Counterfactual Inference': ('counterfactual_inference.json',
                                         'your_data_path/clevrer/video_validation/', 'video', False),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MVBench']

    def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'):
        """Download the dataset if not cached, build the index tsv, and
        return ``dict(root=..., data_file=...)``."""
        def check_integrity(pth):
            # Cached copy is valid when the tsv exists, matches the known
            # MD5, and every referenced media file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')
            if not os.path.exists(data_file):
                return False
            if md5(data_file) != self.MD5:
                return False
            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = 'modelscope/MVBench'

        cache_path = get_cache_path(repo_id, branch='main')
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def unzip_hf_zip(pth):
                # Extract every zip shipped under video/ in place.
                pth = os.path.join(pth, 'video/')
                for filename in os.listdir(pth):
                    if filename.endswith('.zip'):
                        zip_path = os.path.join(pth, filename)
                        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                            zip_ref.extractall(pth)

            def generate_tsv(pth):
                # Build the flat tsv index from the per-task annotation jsons.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(pth, 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])):
                                self.data_list.append({
                                    'task_type': k,
                                    'prefix': v[1].replace('your_data_path', 'video'),
                                    'data_type': v[2],
                                    'bound': v[3],
                                    'start': data['start'] if 'start' in data.keys() else None,
                                    'end': data['end'] if 'end' in data.keys() else None,
                                    'video': data['video'],
                                    'question': data['question'],
                                    'answer': data['answer'],
                                    'candidates': data['candidates']
                                })
                            else:
                                print(
                                    'NTURGB-D zip file is removed according to MVBench, you can view it at '
                                    'https://huggingface.co/datasets/OpenGVLab/MVBench for detailed reason.'
                                )
                                raise Exception(
                                    f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist"
                                )

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            def move_files(pth):
                # Flatten video/data0613/<a>/<b>/* into video/<a>/<b>/*.
                src_folder = os.path.join(pth, 'video/data0613')
                if not os.path.exists(src_folder):
                    return
                for subdir in os.listdir(src_folder):
                    subdir_path = os.path.join(src_folder, subdir)
                    if not os.path.isdir(subdir_path):
                        continue
                    for subsubdir in os.listdir(subdir_path):
                        subsubdir_path = os.path.join(subdir_path, subsubdir)
                        if not os.path.isdir(subsubdir_path):
                            continue
                        for item in os.listdir(subsubdir_path):
                            item_path = os.path.join(subsubdir_path, item)
                            target_folder = os.path.join(pth, 'video', subdir, subsubdir)
                            if not os.path.exists(target_folder):
                                os.makedirs(target_folder)
                            target_path = os.path.join(target_folder, item)
                            try:
                                shutil.move(item_path, target_path)
                            except Exception as e:
                                print(f"Error moving {item_path} to {target_path}: {e}")

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master')
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            unzip_hf_zip(dataset_path)
            move_files(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        # Per-media-type frame readers.
        self.decord_method = {
            'video': self.read_video,
            'gif': self.read_gif,
            'frame': self.read_frame,
        }

        self.nframe = 8
        self.frame_fps = 3

        # Stacks PIL frames and converts them into a single torch tensor.
        self.transform = T.Compose([
            Stack(),
            ToTorchFormatTensor()
        ])

        return dict(root=dataset_path, data_file=data_file)

    def get_index(self, bound, fps, max_frame, first_idx=0):
        """Uniformly sample ``self.num_segments`` frame indices, optionally
        constrained to a (start, end) second bound."""
        if bound:
            start, end = bound[0], bound[1]
        else:
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        # Take the midpoint of each segment.
        return np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])

    def read_video(self, video_path, bound=None):
        """Decode sampled frames from a video file into a torch tensor."""
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        images_group = [Image.fromarray(vr[i].asnumpy()) for i in frame_indices]
        return self.transform(images_group)

    def read_gif(self, video_path, bound=None, fps=25):
        """Decode sampled frames from a gif into a torch tensor."""
        gif = imageio.get_reader(video_path)
        max_frame = len(gif) - 1
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        images_group = []
        for index, frame in enumerate(gif):
            if index in frame_indices:
                images_group.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)))
        return self.transform(images_group)

    def read_frame(self, video_path, bound=None, fps=3):
        """Read pre-extracted jpg frames (00001.jpg, ...) from a folder."""
        max_frame = len(os.listdir(video_path))
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=1)  # frame_idx starts from 1
        images_group = [
            Image.open(os.path.join(video_path, f'{i:05d}.jpg'))
            for i in frame_indices
        ]
        return self.transform(images_group)

    def save_video_frames(self, imgs, video_name, frames):
        """Dump the sampled frame tensor to per-frame image files; cached
        on disk, returns the list of frame paths."""
        frame_paths = self.frame_paths(video_name)
        if not np.all([osp.exists(p) for p in frame_paths]):
            block_size = imgs.size(0) // frames
            split_tensors = torch.split(imgs, block_size)
            to_pil = transforms.ToPILImage()
            for tensor, pth in zip(split_tensors, frame_paths):
                if not osp.exists(pth):
                    to_pil(tensor).save(pth)
        return frame_paths

    def qa_template(self, data):
        """Format the question with lettered options and return
        (question, lettered answer)."""
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def load_into_video_and_process(self, line):
        """Convert the sample's media (gif/webm/frame folder) into an mp4,
        trimmed to its start/end bound when present; return its path."""
        try:
            from moviepy.editor import VideoFileClip, ImageSequenceClip
        except:
            raise ImportError(
                'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"'
            )
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])

        if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']:
            processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4')
            if not os.path.exists(processed_video_path):
                # Transcode GIF / webm into mp4 via MoviePy.
                gif_clip = VideoFileClip(video_path)
                gif_clip.write_videofile(processed_video_path, codec='libx264')
                gif_clip.close()
        elif line['data_type'] in ['frame']:
            input_images = os.path.join(video_path, '*.jpg')
            processed_video_path = f'{video_path}.mp4'
            if not os.path.exists(processed_video_path):
                # Assemble the frame folder into an mp4.
                image_files = sorted(glob.glob(input_images))
                image_clip = ImageSequenceClip(image_files, fps=self.frame_fps)
                image_clip.write_videofile(processed_video_path, codec='libx264')
                image_clip.close()
        else:
            processed_video_path = video_path

        if line['bound']:
            # Cut the clip down to [start, min(end, duration)].
            base_name, suffix = os.path.splitext(processed_video_path)
            output_video_path = f'{base_name}_processed{suffix}'
            if not os.path.exists(output_video_path):
                video_clip = VideoFileClip(processed_video_path)
                clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration))
                clip.write_videofile(output_video_path)
                clip.close()
        else:
            output_video_path = processed_video_path

        return output_video_path

    def save_video_into_images(self, line):
        """Sample frames for one sample (honouring its optional bound) and
        cache them to disk; return the frame file paths."""
        bound = (line['start'], line['end']) if line['bound'] else None
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        decord_method = self.decord_method[line['data_type']]
        self.num_segments = self.nframe
        torch_imgs = decord_method(video_path, bound)
        return self.save_video_frames(torch_imgs, line['video'], self.num_segments)

    def build_prompt(self, line, video_llm):
        """Build the multimodal prompt for one sample; *video_llm* switches
        between passing the video file and passing sampled frames."""
        if self.fps > 0:
            raise ValueError('MVBench does not support fps setting, please transfer to MVBench_MP4!')
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        if video_llm:
            new_video_path = self.load_into_video_and_process(line)
            message.append(dict(type='video', value=new_video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        message.append(dict(type='text', value='Best option:(', role='assistant'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score an xlsx of predictions (LLM judge or exact matching) and
        dump per-dimension ratings."""
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            # Fall back to exact matching when no working judge is available.
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data_un['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                # Expand candidates into lettered columns for the judge.
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MVBench'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
+ return rating
424
+
425
+
426
+ class MVBench_MP4(VideoBaseDataset):
427
+
428
+ MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5'
429
+ SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
430
+ the detail and movement of objects, and the action and pose of persons. \
431
+ Based on your observations, select the best option that accurately addresses the question.
432
+ """
433
+ TYPE = 'Video-MCQ'
434
+
435
+ def __init__(self, dataset='MVBench_MP4', nframe=0, fps=-1):
436
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
437
+
438
+ @classmethod
439
+ def supported_datasets(cls):
440
+ return ['MVBench_MP4']
441
+
442
+ def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'):
443
+ def check_integrity(pth):
444
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
445
+
446
+ if not os.path.exists(data_file):
447
+ return False
448
+
449
+ if md5(data_file) != self.MP4_MD5:
450
+ return False
451
+
452
+ data = load(data_file)
453
+ for idx, item in data.iterrows():
454
+ if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
455
+ return False
456
+ return True
457
+
458
+ if modelscope_flag_set():
459
+ repo_id = 'modelscope/MVBench'
460
+
461
+ cache_path = get_cache_path(repo_id, branch='video')
462
+ if cache_path is not None and check_integrity(cache_path):
463
+ dataset_path = cache_path
464
+ else:
465
+ def generate_tsv(pth):
466
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
467
+ if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5:
468
+ return
469
+ json_data_path = os.path.join(dataset_path, 'test.json')
470
+ json_data = load(json_data_path)
471
+ root_data_dict = json_data['root']
472
+ self.data_list = []
473
+ for k, v in json_data['meta'].items():
474
+ for item in v:
475
+ self.data_list.append({
476
+ 'task_type': k,
477
+ 'prefix': root_data_dict[k],
478
+ 'video': item['video'],
479
+ 'question': item['question'],
480
+ 'answer': item['answer'],
481
+ 'candidates': item['candidates']
482
+ })
483
+ data_df = pd.DataFrame(self.data_list)
484
+ data_df = data_df.assign(index=range(len(data_df)))
485
+ data_df.to_csv(data_file, sep='\t', index=False)
486
+
487
+ if modelscope_flag_set():
488
+ from modelscope import dataset_snapshot_download
489
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video')
490
+ else:
491
+ hf_token = os.environ.get('HUGGINGFACE_TOKEN')
492
+ huggingface_hub.login(hf_token)
493
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video')
494
+ generate_tsv(dataset_path)
495
+
496
+ data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
497
+
498
+ # transform
499
+ self.transform = T.Compose([
500
+ Stack(),
501
+ ToTorchFormatTensor()
502
+ ])
503
+
504
+ return dict(root=dataset_path, data_file=data_file)
505
+
506
+ def qa_template(self, data):
507
+ question = f"Question: {data['question']}\n"
508
+ question += 'Options:\n'
509
+ answer = data['answer']
510
+ answer_idx = -1
511
+ for idx, c in enumerate(eval(data['candidates'])):
512
+ question += f"({chr(ord('A') + idx)}) {c}\n"
513
+ if c == answer:
514
+ answer_idx = idx
515
+ question = question.rstrip()
516
+ answer = f"({chr(ord('A') + answer_idx)}) {answer}"
517
+ return question, answer
518
+
519
+ def get_index_by_frame(self, max_frame):
520
+ seg_size = float(max_frame) / self.num_segments
521
+ frame_indices = np.array([
522
+ int((seg_size / 2) + np.round(seg_size * idx))
523
+ for idx in range(self.num_segments)
524
+ ])
525
+ return frame_indices
526
+
527
+ def get_index_by_fps(self, vid, fps):
528
+ total_frames = len(vid)
529
+ video_fps = vid.get_avg_fps()
530
+ total_duration = total_frames / video_fps
531
+ required_frames = int(total_duration * fps)
532
+ step_size = video_fps / fps
533
+ frame_indices = np.array([int(i * step_size) for i in range(required_frames)])
534
+ self.num_segments = len(frame_indices)
535
+ return frame_indices
536
+
537
    def read_video(self, video_path):
        """Decode *video_path* and return the sampled frames stacked by self.transform.

        Sampling mode: uniform (self.num_segments segments) when self.fps < 0,
        otherwise fixed-rate via get_index_by_fps.
        """
        # Single-threaded CPU decoding keeps decord behaviour deterministic.
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1

        images_group = list()
        if self.fps < 0:
            frame_indices = self.get_index_by_frame(max_frame)
        else:
            # NOTE: get_index_by_fps also updates self.num_segments as a side effect.
            frame_indices = self.get_index_by_fps(vr, self.fps)

        for frame_index in frame_indices:
            img = Image.fromarray(vr[frame_index].asnumpy())
            images_group.append(img)
        torch_imgs = self.transform(images_group)
        return torch_imgs
552
+
553
    def save_video_frames(self, imgs, video_name, frames):
        """Split the stacked frame tensor back into images and cache them on disk.

        Returns the list of per-frame file paths; files that already exist are
        reused, so repeated calls for the same video are cheap.
        """
        if self.fps > 0:
            frame_paths = self.frame_paths_fps(video_name, frames)
        else:
            frame_paths = self.frame_paths(video_name)
        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            # imgs stacks `frames` images along dim 0; split it back into
            # per-frame chunks before converting to PIL.
            block_size = imgs.size(0) // frames
            split_tensors = torch.split(imgs, block_size)
            to_pil = transforms.ToPILImage()
            images = [to_pil(arr) for arr in split_tensors]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths
570
+
571
+ def save_video_into_images(self, line):
572
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
573
+ if self.fps <= 0:
574
+ self.num_segments = self.nframe
575
+ else:
576
+ self.num_segments = 0
577
+ torch_imgs = self.read_video(video_path)
578
+ img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
579
+ return img_frame_paths
580
+
581
+ def build_prompt(self, line, video_llm):
582
+ if isinstance(line, int):
583
+ assert line < len(self)
584
+ line = self.data.iloc[line]
585
+
586
+ question, answer = self.qa_template(line)
587
+ message = [dict(type='text', value=self.SYS, role='system')]
588
+ message.append(dict(type='text', value=question))
589
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
590
+ if video_llm:
591
+ message.append(dict(type='video', value=video_path))
592
+ else:
593
+ img_frame_paths = self.save_video_into_images(line)
594
+ for im in img_frame_paths:
595
+ message.append(dict(type='image', value=im))
596
+ message.append(dict(type='text', value='\nOnly give the best option.'))
597
+ message.append(dict(type='text', value='Best option:(', role='assistant'))
598
+ return message
599
+
600
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Score an MVBench_MP4 prediction xlsx and return per-dimension ratings.

        NOTE(review): decorated @classmethod but the first parameter is named
        `self` (it actually receives the class); kept as-is for compatibility.
        """

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        # Derived artefact paths: cached judge outputs, final rating, per-row scores.
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            # Fall back to exact matching whenever an LLM judge is unavailable.
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            # NOTE(review): `res` (cached judge results minus failures) is built
            # here but never consumed below — confirm whether resume support was
            # intended.
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            # Rows that actually received a prediction.
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data_un['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                # Re-format the gold answer as a lettered option, e.g. "(B) text".
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                # Expose options as A/B/C... keys and convert the answer to its letter.
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1  # API-failure marker
                else:
                    # NOTE(review): `data.loc[idx, ...]` addresses the DataFrame
                    # label, while the lookups above match the 'index' column —
                    # assumes the two coincide; confirm.
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MVBench_MP4'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/slidevqa.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import math
3
+ from typing import List
4
+
5
+ from vlmeval.dataset.utils.judge_util import build_judge
6
+ from vlmeval.smp import *
7
+ from .image_base import ImageBaseDataset
8
+ from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
9
+
10
+
11
+ FAIL_MSG = 'Failed to obtain answer via API.'
12
+
13
+
14
def get_f1(gt, pred):
    """Token-level F1 between ground truth and prediction (whitespace tokens)."""
    gt_tokens = gt.strip().split()
    pred_tokens = pred.strip().split()
    if not gt_tokens or not pred_tokens:
        return 0.0

    overlap = sum(1 for tok in pred_tokens if tok in gt_tokens)
    recall = overlap / len(gt_tokens)
    precision = overlap / len(pred_tokens)
    if (recall + precision) <= 1e-4:
        return 0.0
    return 2 * recall * precision / (recall + precision)
23
+
24
+
25
def SlideVQA_acc(result_file):
    """Score a judged SlideVQA result file with ANLS, exact match and token F1.

    Writes per-row 'anls'/'em'/'f1' columns back into *result_file* and
    returns a small summary DataFrame with the dataset-level averages.
    """
    data = load(result_file)
    anls_list, em_list, f1_list = list(), list(), list()
    for i in range(len(data)):
        item = data.iloc[i]
        # NaN ground truth marks unanswerable questions.
        if isinstance(item['answer'], float) and math.isnan(item['answer']):
            item['answer'] = 'Not answerable'

        # Normalise both sides: strip newlines, lowercase.
        item['answer'] = re.sub('\n', '', item['answer']).lower()
        item['pred'] = str(item['pred']).lower()
        anls_score = anls_compute(item['answer'], item['pred'])
        em_score = (item['answer'].strip() == item['pred'].strip())
        f1_score = get_f1(item['answer'], item['pred'])
        anls_list.append(anls_score)
        em_list.append(em_score)
        f1_list.append(f1_score)
        print('---------------------')
        print(item['answer'], item['pred'], anls_score, em_score, f1_score)

    # Persist the per-row metrics alongside the predictions.
    data['anls'] = anls_list
    data['em'] = em_list
    data['f1'] = f1_list
    dump(data, result_file)

    res = dict()
    res['category'], res['num'] = ['anls', 'EM', 'F1'], [len(data), len(data), len(data)]
    res['avg'] = [sum(anls_list) / len(data), sum(em_list) / len(data), sum(f1_list) / len(data)]
    res = pd.DataFrame(res)
    return res
54
+
55
+
56
class SlideVQA(ImageBaseDataset):
    """SlideVQA: VQA over multi-page slide decks.

    Each sample may span many page images; for non-API models the pages are
    stitched into a small number of composite images, using the per-model
    (concat_num, column_num) settings in SUPPORTED_MODELS.
    """

    TYPE = 'VQA'

    DATASET_URL = {
        'SLIDEVQA_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA_MINI.tsv',
        'SLIDEVQA': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA.tsv',
    }
    DATASET_MD5 = {
        'SLIDEVQA_MINI': '6d9a8d8814fa5b7669deb2af3a3208eb',
        'SLIDEVQA': '5e822c2f800e94c1e23badfd478326b6',
    }

    # model name -> (concat_num, column_num) used when stitching pages together.
    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'XComposer2d5': (1, -1),
        'XComposer2_4KHD': (1, -1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
    }

    def __init__(self, dataset, **kwargs):
        """Requires kwargs['model']; only models in SUPPORTED_MODELS are accepted."""
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on SlideVQA.".format(model_name))
        super(SlideVQA, self).__init__(dataset)

        # API models receive raw pages; local models get concatenated composites.
        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def dump_image(self, origin_line):
        """Materialise the slide pages of one row as image files on disk.

        Pages beyond self.max_pages are dropped. For non-API models the pages
        are stitched into composite images via concat_images; returns the list
        of final image paths.
        """
        os.makedirs(self.img_root, exist_ok=True)

        line = origin_line.copy()
        if not isinstance(line['image_path'], List):
            line['image_path'] = [line['image_path']]
        line['image_path'] = line['image_path'][:self.max_pages]

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            # column_num == -1 means "stack everything into one tall image".
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Judge predictions with an LLM (MMLongBench_auxeval) then score with SlideVQA_acc.

        Fixes over the previous revision: cached judge results in the tmp pkl
        are now merged correctly (results were zipped against the FULL index
        list, mis-aligning rows on resume), partial progress is persisted after
        every item, and the storage xlsx is written even when every item was
        already cached.
        """
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. ')
        else:
            data = load(eval_file)
            judge = build_judge(max_tokens=128, **judge_kwargs)
            lines = [data.iloc[i] for i in range(len(data))]
            all_inds = [line['index'] for line in lines]

            # Resume from previously judged items if a tmp pkl exists.
            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            todo = [(idx, line) for idx, line in zip(all_inds, lines) if idx not in ans]

            for idx, line in tqdm(todo):
                ans[idx] = MMLongBench_auxeval(judge, line)
                # Persist progress so interrupted runs can resume.
                dump(ans, tmp_file)

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            data['pred'] = [ans[idx]['pred'] for idx in data['index']]
            dump(data, storage)

        score = SlideVQA_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
+ logger.info(score)
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ from huggingface_hub import snapshot_download
3
+ from ..smp import *
4
+ from .video_concat_dataset import ConcatVideoDataset
5
+ from .video_base import VideoBaseDataset
6
+ from .utils import build_judge, DEBUG_MESSAGE
7
+ from ..utils import track_progress_rich
8
+ import torchvision.transforms as T
9
+ from torchvision import transforms
10
+ from torchvision.transforms.functional import InterpolationMode
11
+ from decord import VideoReader, cpu
12
+ from .utils.tempcompass import *
13
+
14
+
15
+ FAIL_MSG = 'Failed to obtain answer via API.'
16
+
17
+
18
class TempCompass(ConcatVideoDataset):
    """Umbrella TempCompass dataset: concatenation of the MCQ, captioning and
    yes/no splits, with accuracy aggregated per temporal dimension / task type."""

    def __init__(self, dataset='TempCompass', nframe=0, fps=-1):
        self.DATASET_SETS[dataset] = ['TempCompass_MCQ', 'TempCompass_Captioning', 'TempCompass_YorN']
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass']

    def evaluate(self, eval_file, **judge_kwargs):
        """Evaluate the concatenated splits, append per-dimension, per-task-type
        and overall accuracy rows, and dump the result to *_acc.csv."""
        result = super().evaluate(eval_file=eval_file, **judge_kwargs)
        ext = eval_file.split('.')[-1]
        result = result.reset_index().rename(columns={'index': 'dim.task_type'})
        score_file = eval_file.replace(f'.{ext}', '_acc.csv')

        # Accumulate success/overall counts per dimension, per task type and overall.
        totals = {}
        for row_idx, row in result.iterrows():
            dim, task_type = row['dim.task_type'].split('. ')
            for bucket_key in (dim, task_type, 'overall'):
                bucket = totals.setdefault(bucket_key, {'success': 0.0, 'overall': 0.0})
                bucket['success'] += row['success']
                bucket['overall'] += row['overall']
            result.loc[row_idx, 'acc'] = round(row['success'] / row['overall'] * 100, 2)

        # Append one aggregate row per accumulated bucket (insertion order preserved).
        for bucket_key, bucket in totals.items():
            result.loc[len(result)] = {
                'dim.task_type': bucket_key,
                'success': bucket['success'],
                'overall': bucket['overall'],
                'acc': round(bucket['success'] / bucket['overall'] * 100, 2)
            }
        dump(result, score_file)
        return result
+ return result
58
+
59
+
60
class TempCompass_MCQ(VideoBaseDataset):
    """Multiple-choice split of TempCompass (multi-choice + caption_matching tasks)."""

    MD5 = '7efbb9e6d9dabacd22daf274852691dd'
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='TempCompass_MCQ', nframe=0, fps=-1):
        # task -> (annotation json, video dir prefix, video file suffix)
        self.type_data_list = {
            'multi-choice': ('multi-choice.json', './videos', '.mp4'),
            'caption_matching': ('caption_matching.json', './videos', '.mp4'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass_MCQ']

    def prepare_dataset(self, dataset_name='TempCompass_MCQ', repo_id='lmms-lab/TempCompass'):
        """Download (HF or ModelScope), unpack and index the dataset.

        Returns dict(root=dataset_path, data_file=<tsv path>).
        """
        def check_integrity(pth):
            # A cached copy is valid only if the tsv matches MD5 and every video exists.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert the per-task parquet shards into plain json once.
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            # First line of 'question' is the stem; the rest are options.
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'suffix': v[2],
                                'video': data['video_id'],
                                'question': data['question'].split('\n')[0],
                                'answer': data['answer'],
                                'dim': data['dim'],
                                'candidates': data['question'].split('\n')[1:],
                            })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return (stem + options joined by newlines, gold answer)."""
        question = data['question'] + '\n' + '\n'.join(eval(data['candidates']))
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video and cache them as images; returns paths."""
        vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        # Fix: use the VideoReader imported at module top — the bare `decord`
        # module name is not imported by this file.
        vid = VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Uniform sampling of self.nframe frames.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(line['video'])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(line['video'], len(indices))
        else:
            # Fix: previously fell through with `frame_paths` unbound (UnboundLocalError).
            raise ValueError('Either nframe (> 0) or fps (> 0) must be set to sample frames.')

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        """Thin wrapper kept for interface parity with sibling datasets."""
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the message list: question, then video path or frame images, then cue."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = []
        message.append(dict(type='text', value=question))
        video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nPlease directly give the best option:'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rate predictions (LLM judge or exact matching) and aggregate per-dimension.

        NOTE(review): declared @classmethod but the first parameter is named
        `self` (it receives the class) — kept for interface compatibility.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: skip items already judged in the tmp pkl.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_mcq,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            # NOTE(review): assumes the DataFrame label equals the 'index'
            # column used as ans keys — confirm upstream ordering.
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
+ return rating
253
+
254
+
255
class TempCompass_Captioning(VideoBaseDataset):
    """Captioning split of TempCompass (free-form caption generation task)."""

    MD5 = '35be9bf2581ea7767f02e9a8f37ae1ab'
    TYPE = 'Video-VQA'

    def __init__(self, dataset='TempCompass_Captioning', nframe=0, fps=-1):
        # task -> (annotation json, video dir prefix, video file suffix)
        self.type_data_list = {
            'captioning': ('captioning.json', './videos', '.mp4'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['TempCompass_Captioning']

    def prepare_dataset(self, dataset_name='TempCompass_Captioning', repo_id='lmms-lab/TempCompass'):
        """Download (HF or ModelScope), unpack and index the dataset.

        Returns dict(root=dataset_path, data_file=<tsv path>).
        """
        def check_integrity(pth):
            # A cached copy is valid only if the tsv matches MD5 and every video exists.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert the per-task parquet shards into plain json once.
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            # mc_question/mc_answer are used by the judge to verify captions.
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'suffix': v[2],
                                'video': data['video_id'],
                                'question': data['question'],
                                'answer': data['answer'],
                                'dim': data['dim'],
                                'mc_question': data['mc_question'],
                                'mc_answer': data['mc_answer'],
                            })

                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        """Return (question, gold answer) for a captioning row."""
        question = data['question']
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        """Sample frames from the row's video and cache them as images; returns paths."""
        vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        # Fix: use the VideoReader imported at module top — the bare `decord`
        # module name is not imported by this file.
        vid = VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            # Uniform sampling of self.nframe frames.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(line['video'])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(line['video'], len(indices))
        else:
            # Fix: previously fell through with `frame_paths` unbound (UnboundLocalError).
            raise ValueError('Either nframe (> 0) or fps (> 0) must be set to sample frames.')

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        """Thin wrapper kept for interface parity with sibling datasets."""
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        """Build the message list: question, then video path or frame images."""
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = []
        message.append(dict(type='text', value=question))
        video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rate generated captions (LLM judge or exact matching) and aggregate.

        NOTE(review): declared @classmethod but the first parameter is named
        `self` (it receives the class) — kept for interface compatibility.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume: skip items already judged in the tmp pkl.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_captioning,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            # NOTE(review): assumes the DataFrame label equals the 'index'
            # column used as ans keys — confirm upstream ordering.
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
+ return rating
447
+
448
+
449
+ class TempCompass_YorN(VideoBaseDataset):
450
+
451
+ MD5 = 'c72c046d7fa0e82c8cd7462f2e844ea8'
452
+ TYPE = 'Video-Y/N'
453
+
454
+ def __init__(self, dataset='TempCompass_YorN', nframe=0, fps=-1):
455
+ self.type_data_list = {
456
+ 'yes_no': ('yes_no.json', './videos', '.mp4'),
457
+ }
458
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
459
+
460
+ @classmethod
461
+ def supported_datasets(cls):
462
+ return ['TempCompass_YorN']
463
+
464
    def prepare_dataset(self, dataset_name='TempCompass_YorN', repo_id='lmms-lab/TempCompass'):
        """Ensure the TempCompass Yes/No data is available locally.

        If a cached snapshot passes the integrity check it is reused;
        otherwise the dataset is downloaded (HuggingFace or ModelScope),
        the parquet splits are converted to JSON, the videos are unzipped,
        and a flat TSV index is generated.

        Returns:
            dict with keys `root` (dataset directory) and `data_file`
            (path of the generated TSV).
        """
        def check_integrity(pth):
            # A cached copy is valid only if the TSV exists, its md5 matches
            # the expected class-level checksum, and every referenced video
            # file is present on disk.
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def read_parquet(pth):
                # Convert each task's parquet split to a JSON file (only once).
                import pandas as pd
                for task_name in self.type_data_list.keys():
                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)

            def unzip_videos(pth):
                # Extract the bundled videos archive if not already extracted.
                import zipfile
                if not osp.exists(osp.join(pth, 'videos')):
                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(pth)

            def generate_tsv(pth):
                # Flatten all task JSON files into a single TSV with a fresh
                # integer index; skip if an up-to-date TSV already exists.
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return
                self.data_list = []
                for k, v in self.type_data_list.items():
                    # v looks like (json_file, video_prefix, video_suffix) --
                    # presumably; confirm against the class-level
                    # type_data_list definition.
                    with open(osp.join(pth, v[0]), 'r') as f:
                        json_data = json.load(f)
                    for data in json_data:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': v[1],
                            'suffix': v[2],
                            'video': data['video_id'],
                            # Keep only the first line of the question text.
                            'question': data['question'].split('\n')[0],
                            'answer': data['answer'],
                            'dim': data['dim']
                        })

                # NOTE(review): relies on a module-level `pd` import (only
                # read_parquet imports pandas locally) -- verify it exists.
                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            read_parquet(dataset_path)
            unzip_videos(dataset_path)
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)
532
+
533
+ def qa_template(self, data):
534
+ question = data['question']
535
+ answer = data['answer']
536
+ return question, answer
537
+
538
+ def save_video_frames(self, line):
539
+ vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
540
+ vid = decord.VideoReader(vid_path)
541
+ video_info = {
542
+ 'fps': vid.get_avg_fps(),
543
+ 'n_frames': len(vid),
544
+ }
545
+ if self.nframe > 0 and self.fps < 0:
546
+ step_size = len(vid) / (self.nframe + 1)
547
+ indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
548
+ frame_paths = self.frame_paths(line['video'])
549
+ elif self.fps > 0:
550
+ # not constrained by num_frames, get frames by fps
551
+ total_duration = video_info['n_frames'] / video_info['fps']
552
+ required_frames = int(total_duration * self.fps)
553
+ step_size = video_info['fps'] / self.fps
554
+ indices = [int(i * step_size) for i in range(required_frames)]
555
+ frame_paths = self.frame_paths_fps(line['video'], len(indices))
556
+
557
+ flag = np.all([osp.exists(p) for p in frame_paths])
558
+
559
+ if not flag:
560
+ images = [vid[i].asnumpy() for i in indices]
561
+ images = [Image.fromarray(arr) for arr in images]
562
+ for im, pth in zip(images, frame_paths):
563
+ if not osp.exists(pth):
564
+ im.save(pth)
565
+
566
+ return frame_paths
567
+
568
+ def save_video_into_images(self, line):
569
+ frame_paths = self.save_video_frames(line)
570
+ return frame_paths
571
+
572
+ def build_prompt(self, line, video_llm):
573
+ if isinstance(line, int):
574
+ assert line < len(self)
575
+ line = self.data.iloc[line]
576
+
577
+ question, answer = self.qa_template(line)
578
+ message = []
579
+ message.append(dict(type='text', value=question))
580
+ video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
581
+ if video_llm:
582
+ message.append(dict(type='video', value=video_path))
583
+ else:
584
+ img_frame_paths = self.save_video_into_images(line)
585
+ for im in img_frame_paths:
586
+ message.append(dict(type='image', value=im))
587
+ message.append(dict(type='text', value='\nPlease answer yes or no:'))
588
+ return message
589
+
590
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Rate the predictions in `eval_file` and return per-dimension scores.

        NOTE(review): decorated with @classmethod yet the first parameter is
        named `self` (it actually receives the class). No instance state is
        read, so it works either way, but confirm the decorator is intended.
        """
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-1106', 'exact_matching']
        # Generation settings forwarded to the LLM judge.
        judge_kwargs.update({
            "max_tokens": 128,
            "temperature": 1.0,
            "top_p": 1,
            "presence_penalty": 1,
        })

        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        # Skip rating entirely if a score file already exists.
        if not osp.exists(score_file):
            data = load(eval_file)
            if model != 'exact_matching':
                # NOTE(review): `sys_prompt` is presumably a module-level
                # constant defined elsewhere in this file -- verify.
                model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
            else:
                model = None

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume support: drop items already rated in the pickle cache.
            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    evaluate_tempcompass_YorN,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
            # Reload the full rating cache and merge scores into the table.
            ans = load(tmp_file)
            for idx, item in data.iterrows():
                data.loc[idx, 'score'] = ans[idx]['rating']
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_base.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+ from ..smp import *
3
+
4
+
5
class TextBaseDataset:
    """Base class for text-only evaluation datasets.

    Subclasses register downloadable TSVs via ``DATASET_URL`` (and optional
    ``DATASET_MD5`` checksums) and typically override ``build_prompt`` and
    ``evaluate``.
    """
    MODALITY = 'TEXT'
    # dataset name -> download URL of the TSV file
    DATASET_URL = {}
    # dataset name -> expected md5 of the TSV (optional; check skipped if absent)
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', **kwargs):
        self.dataset_name = dataset

        data = self.load_data(dataset)

        # Normalize the index column: keep integer indices as ints when every
        # value parses as one, otherwise fall back to strings.
        data['index'] = [str(x) for x in data['index']]

        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        """Download the dataset TSV if missing/corrupted, then load it.

        Args:
            url: download URL; its basename is used as the local file name.
            file_md5: expected md5 checksum, or None to skip verification.
        """
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        # Large TSVs are converted to a "localized" copy once so later loads
        # are fast; force regeneration after a fresh download.
        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def dump_image(self, line):
        # Text-only datasets carry no images.
        return []

    def display(self, line):
        """Pretty-print one record, given by position or as a record object."""
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, pd.Series) or isinstance(line, dict)
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        url = self.DATASET_URL[dataset]
        # Use .get so datasets registered without a checksum skip verification
        # instead of raising KeyError.
        file_md5 = self.DATASET_MD5.get(dataset)
        return self.prepare_tsv(url, file_md5)

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']

        msgs = []
        msgs.append(dict(type='text', value=question))
        return msgs

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/text_mcq.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .text_base import TextBaseDataset
2
+ from .utils import build_judge, DEBUG_MESSAGE
3
+ from ..smp import *
4
+
5
+
6
class TextMCQDataset(TextBaseDataset):
    """Text-only multiple-choice QA dataset.

    Builds "Hint / Question / Options" prompts from lettered option columns
    and scores predictions with exact matching or an LLM judge.
    """
    TYPE = 'MCQ'

    # dataset name -> TSV url / md5, filled in by subclasses
    DATASET_URL = {}

    DATASET_MD5 = {}

    def build_prompt(self, line):
        """Build the text prompt for one record (positional int or record)."""
        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']
        # Options live in single-letter columns 'A'..'Z'; skip missing/NaN.
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above. \n'

        msgs = []

        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score the predictions in `eval_file`.

        Defaults to exact matching; when an LLM judge is requested it falls
        back to exact matching if the OpenAI API is unavailable. Returns the
        accuracy dataframe (also dumped next to `eval_file`).
        """
        from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
        # assert dataset is not None
        # TEST splits share answer keys with their corresponding base datasets.
        dataset_map = {
            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
            'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
        }
        dataset = self.dataset_name
        if dataset in dataset_map:
            dataset = dataset_map[dataset]
        nproc = judge_kwargs.pop('nproc', 4)

        # Circular (option-rotating) evaluation is disabled for text MCQ.
        circular = False

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        # Short names used in result-file suffixes.
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        # NOTE(review): pops columns while iterating data.keys(); appears to
        # work with pandas, but confirm the iteration snapshot semantics.
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        # Sanity check: every evaluated question must exist in the dataset.
        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        if circular:
            data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
        else:
            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)

        # load split
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        # May have different report acc functions for different datasets
        if 'MMT' in dataset:
            acc = report_acc_MMT(data)
        else:
            acc = report_acc(data)

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(acc, score_file)

        return acc
110
+
111
+
112
class CustomTextMCQDataset(TextMCQDataset):
    """Text MCQ dataset backed by a user-provided local TSV file."""

    def load_data(self, dataset):
        # User-supplied TSVs live directly under the LMU data root.
        tsv_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        # For files above 1 GB, build (or reuse) a localized copy to speed up
        # subsequent loads; FORCE_LOCAL forces regeneration.
        if file_size(tsv_path, 'GB') > 1:
            localized = tsv_path.replace('.tsv', '_local.tsv')
            if not osp.exists(localized) or os.environ.get('FORCE_LOCAL', None):
                from ..tools import LOCALIZE
                LOCALIZE(tsv_path, localized)
            tsv_path = localized
        return load(tsv_path)