1f committed on
Commit 1ccf6d6 · verified · 1 Parent(s): fa29beb

Add files using upload-large-folder tool

Files changed (20)
  1. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py +106 -0
  2. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/__init__.py +4 -0
  3. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/file.py +344 -0
  4. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/log.py +47 -0
  5. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/misc.py +291 -0
  6. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/vlm.py +179 -0
  7. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/__init__.py +7 -0
  8. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/matching_util.py +69 -0
  9. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py +72 -0
  10. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/result_transfer.py +97 -0
  11. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py +6 -0
  12. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/vlm/base.py +198 -0
  13. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py +727 -0
  14. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/README.md +3 -0
  15. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/datasets/__init__.py +0 -0
  16. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/datasets/vqa_dataset.py +116 -0
  17. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/eval.py +106 -0
  18. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/eval_utils/cal_metric.py +40 -0
  19. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/requirements.txt +49 -0
  20. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/transform_docvqatest_for_submission.py +16 -0
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py ADDED
@@ -0,0 +1,106 @@
+ import os
+ import sys
+ import json
+ import re
+ from collections import Counter
+
+ # local import
+ from .common import BaseMetric
+
+
+ def token_normalize(token_text, is_lower=False, is_alphanum_only=False):
+     """Normalize a single token: optionally lowercase it and keep alphanumeric characters only."""
+     if is_lower:
+         token_text = token_text.lower()
+     if is_alphanum_only:
+         token_text = re.sub('[^A-Za-z0-9]+', '', token_text)
+     return token_text
+
+
+ def text_normalize_and_tokenize(text, is_keep_blank=True, is_lower=True, is_alphanum_only=False):
+     text = text.replace("\t", " ").replace("\n", " ").replace("###", "").replace("***", "")
+     text = re.sub(r'\s+', ' ', text)
+     if not is_keep_blank:
+         text = text.replace(" ", "")
+     text_tokens = text.split(" ") if is_keep_blank else list(text)
+     text_token_normalized = [token_normalize(t, is_lower, is_alphanum_only) for t in text_tokens]
+     text_token_normalized = [x for x in text_token_normalized if len(x) > 0]
+     return text_token_normalized
+
+
+ def evaluate_single_sample(gts, preds):
+     right_num = 0
+     gt_counter_info = dict(Counter(gts))
+     pdt_counter_info = dict(Counter(preds))
+     for gt_token, gt_count in gt_counter_info.items():
+         pred_count = pdt_counter_info.get(gt_token, 0)
+         right_num += min(gt_count, pred_count)
+     return right_num
+
+
+ def calculate_metrics(response_info, gt_info, is_verbose=False):
+     """Compute macro- and micro-averaged recall, precision and F1 over all samples."""
+     macro_recall_list, macro_precision_list, macro_f1_list = [], [], []
+     total_gt_num, total_pred_num, total_right_num = 0, 0, 0
+     for file_name, fullbox_gts in gt_info.items():
+         fullbox_preds = response_info.get(file_name, [])
+         right_num = evaluate_single_sample(fullbox_gts, fullbox_preds)
+         total_right_num += right_num
+         total_gt_num += len(fullbox_gts)
+         total_pred_num += len(fullbox_preds)
+
+         macro_recall = right_num / (len(fullbox_gts) + 1e-9)
+         macro_precision = right_num / (len(fullbox_preds) + 1e-9)
+         macro_f1 = 2 * macro_recall * macro_precision / (macro_recall + macro_precision + 1e-9)
+         macro_recall_list.append(macro_recall)
+         macro_precision_list.append(macro_precision)
+         macro_f1_list.append(macro_f1)
+
+     # macro
+     final_macro_recall = sum(macro_recall_list) / (len(macro_recall_list) + 1e-9)
+     final_macro_precision = sum(macro_precision_list) / (len(macro_precision_list) + 1e-9)
+     final_macro_f1 = sum(macro_f1_list) / (len(macro_f1_list) + 1e-9)
+
+     # micro
+     recall_acc = total_right_num / (total_gt_num + 1e-9)
+     preci_acc = total_right_num / (total_pred_num + 1e-9)
+     hmean = 2 * recall_acc * preci_acc / (recall_acc + preci_acc + 1e-9)
+     vbs_eval_result = {
+         'macro_recall': final_macro_recall, 'macro_precision': final_macro_precision, 'macro_f1_score': final_macro_f1,
+         'micro_recall': recall_acc, 'micro_precision': preci_acc, 'micro_f1_score': hmean
+     }
+     eval_result = vbs_eval_result if is_verbose else {'macro_f1_score': final_macro_f1, 'micro_f1_score': hmean}
+     return eval_result
+
+
+ class OcrEvaluator(BaseMetric):
+     def response_post_func(self, response_text, **kwargs):
+         return response_text
+
+     def evaluate(self, response_info, gt_info, **kwargs):
+         # hard code here
+         dataset_name = kwargs['dataset']
+         is_word_level, is_lower, is_alphanum_only = True, True, False
+         if dataset_name in ["Arabic", "Japanese", "Korean"] or "zh" in dataset_name:
+             is_word_level = False
+         if "multi_scene_ocr" in self.group_name and is_word_level:
+             is_alphanum_only = True
+         eval_config = {"word_level": is_word_level, "alphanum_only": is_alphanum_only, "lowercase": is_lower}
+
+         image_pdt_info, image_gt_info = {}, {}
+         for file_name, gt_src in gt_info.items():
+             pred_src = response_info.get(file_name, "")
+             pdt_token_list = text_normalize_and_tokenize(
+                 str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
+             gt_token_list = text_normalize_and_tokenize(
+                 str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
+             image_pdt_info[file_name] = pdt_token_list
+             image_gt_info[file_name] = gt_token_list
+         eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False)
+         return {"summary": eval_result, "metric_config": eval_config}
+
+
+ if __name__ == '__main__':
+     pass
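A minimal usage sketch for the metrics above (toy, illustrative data; assumes `calculate_metrics` is importable from this module):

    gts = {'img_1.jpg': ['hello', 'world'], 'img_2.jpg': ['foo', 'bar', 'baz']}
    preds = {'img_1.jpg': ['hello', 'world'], 'img_2.jpg': ['foo', 'qux']}
    print(calculate_metrics(preds, gts, is_verbose=True))
    # micro_recall = 3/5 and micro_precision = 3/4 here; the macro scores average the per-image values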
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .file import *
+ from .vlm import *
+ from .misc import *
+ from .log import *
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/file.py ADDED
@@ -0,0 +1,344 @@
+ import json
+ import pickle
+ import pandas as pd
+ import os
+ import csv
+ import hashlib
+ import os.path as osp
+ import time
+ import numpy as np
+ import validators
+ import mimetypes
+ import multiprocessing as mp
+ from .misc import toliststr
+ from .vlm import decode_base64_to_image_file
+
+
+ def decode_img_omni(tup):
+     root, im, p = tup
+     images = toliststr(im)
+     paths = toliststr(p)
+     if len(images) > 1 and len(paths) == 1:
+         paths = [osp.splitext(p)[0] + f'_{i}' + osp.splitext(p)[1] for i in range(len(images))]
+
+     assert len(images) == len(paths)
+     paths = [osp.join(root, p) for p in paths]
+     for p, im in zip(paths, images):
+         if osp.exists(p):
+             continue
+         if isinstance(im, str) and len(im) > 64:
+             decode_base64_to_image_file(im, p)
+     return paths
+
+
+ def localize_df(data, dname, nproc=32):
+     assert 'image' in data
+     indices = list(data['index'])
+     indices_str = [str(x) for x in indices]
+     images = list(data['image'])
+     image_map = {x: y for x, y in zip(indices_str, images)}
+
+     root = LMUDataRoot()
+     root = osp.join(root, 'images', dname)
+     os.makedirs(root, exist_ok=True)
+
+     if 'image_path' in data:
+         img_paths = list(data['image_path'])
+     else:
+         img_paths = []
+         for i in indices_str:
+             if len(image_map[i]) <= 64:
+                 idx = image_map[i]
+                 assert idx in image_map and len(image_map[idx]) > 64
+                 img_paths.append(f'{idx}.jpg')
+             else:
+                 img_paths.append(f'{i}.jpg')
+
+     tups = [(root, im, p) for p, im in zip(img_paths, images)]
+
+     pool = mp.Pool(nproc)
+     ret = pool.map(decode_img_omni, tups)
+     pool.close()
+     data.pop('image')
+     if 'image_path' not in data:
+         data['image_path'] = [x[0] if len(x) == 1 else x for x in ret]
+     return data
+
+
+ def LMUDataRoot():
+     if 'LMUData' in os.environ and osp.exists(os.environ['LMUData']):
+         return os.environ['LMUData']
+     home = osp.expanduser('~')
+     root = osp.join(home, 'LMUData')
+     os.makedirs(root, exist_ok=True)
+     return root
+
+
+ def HFCacheRoot():
+     cache_list = ['HUGGINGFACE_HUB_CACHE', 'HF_HOME']
+     for cache_name in cache_list:
+         if cache_name in os.environ and osp.exists(os.environ[cache_name]):
+             if os.environ[cache_name].split('/')[-1] == 'hub':
+                 return os.environ[cache_name]
+             else:
+                 return osp.join(os.environ[cache_name], 'hub')
+     home = osp.expanduser('~')
+     root = osp.join(home, '.cache', 'huggingface', 'hub')
+     os.makedirs(root, exist_ok=True)
+     return root
+
+
+ def MMBenchOfficialServer(dataset_name):
+     root = LMUDataRoot()
+
+     if dataset_name in ['MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11']:
+         ans_file = f'{root}/{dataset_name}.tsv'
+         if osp.exists(ans_file):
+             data = load(ans_file)
+             if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
+                 return True
+
+     if dataset_name in ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11']:
+         ans_file1 = f'{root}/{dataset_name}.tsv'
+         mapp = {
+             'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_CN': 'MMBench_CN',
+             'MMBench_TEST_EN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11',
+         }
+         ans_file2 = f'{root}/{mapp[dataset_name]}.tsv'
+         for f in [ans_file1, ans_file2]:
+             if osp.exists(f):
+                 data = load(f)
+                 if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
+                     return True
+     return False
+
+
+ class NumpyEncoder(json.JSONEncoder):
+     def default(self, obj):
+         if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
+                             np.int16, np.int32, np.int64, np.uint8,
+                             np.uint16, np.uint32, np.uint64)):
+             return int(obj)
+         elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
+             return float(obj)
+         elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
+             return {'real': obj.real, 'imag': obj.imag}
+         elif isinstance(obj, (np.ndarray,)):
+             return obj.tolist()
+         elif isinstance(obj, np.bool_):
+             return bool(obj)
+         elif isinstance(obj, np.void):
+             return None
+         return json.JSONEncoder.default(self, obj)
+
+
+ # LOAD & DUMP
+ def dump(data, f, **kwargs):
+     def dump_pkl(data, pth, **kwargs):
+         pickle.dump(data, open(pth, 'wb'))
+
+     def dump_json(data, pth, **kwargs):
+         json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)
+
+     def dump_jsonl(data, f, **kwargs):
+         lines = [json.dumps(x, ensure_ascii=False, cls=NumpyEncoder) for x in data]
+         with open(f, 'w', encoding='utf8') as fout:
+             fout.write('\n'.join(lines))
+
+     def dump_xlsx(data, f, **kwargs):
+         data.to_excel(f, index=False, engine='xlsxwriter')
+
+     def dump_csv(data, f, quoting=csv.QUOTE_ALL):
+         data.to_csv(f, index=False, encoding='utf-8', quoting=quoting)
+
+     def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
+         data.to_csv(f, sep='\t', index=False, encoding='utf-8', quoting=quoting)
+
+     handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv)
+     suffix = f.split('.')[-1]
+     return handlers[suffix](data, f, **kwargs)
+
+
+ def load(f, fmt=None):
+     def load_pkl(pth):
+         return pickle.load(open(pth, 'rb'))
+
+     def load_json(pth):
+         return json.load(open(pth, 'r', encoding='utf-8'))
+
+     def load_jsonl(f):
+         lines = open(f, encoding='utf-8').readlines()
+         lines = [x.strip() for x in lines]
+         if lines[-1] == '':
+             lines = lines[:-1]
+         data = [json.loads(x) for x in lines]
+         return data
+
+     def load_xlsx(f):
+         return pd.read_excel(f)
+
+     def load_csv(f):
+         return pd.read_csv(f)
+
+     def load_tsv(f):
+         return pd.read_csv(f, sep='\t')
+
+     handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv)
+     if fmt is not None:
+         return handlers[fmt](f)
+
+     suffix = f.split('.')[-1]
+     return handlers[suffix](f)
+
+
+ def download_file(url, filename=None):
+     import urllib.request
+     from tqdm import tqdm
+
+     class DownloadProgressBar(tqdm):
+         def update_to(self, b=1, bsize=1, tsize=None):
+             if tsize is not None:
+                 self.total = tsize
+             self.update(b * bsize - self.n)
+
+     if filename is None:
+         filename = url.split('/')[-1]
+
+     try:
+         with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
+             urllib.request.urlretrieve(url, filename=filename, reporthook=t.update_to)
+     except Exception as e:
+         import logging
+         logging.warning(f'{type(e)}: {e}')
+         # Handle failed downloads from huggingface.co by retrying via the mirror
+         if 'huggingface.co' in url:
+             url_new = url.replace('huggingface.co', 'hf-mirror.com')
+             try:
+                 download_file(url_new, filename)
+                 return filename
+             except Exception as e:
+                 logging.warning(f'{type(e)}: {e}')
+                 raise Exception(f'Failed to download {url}')
+         else:
+             raise Exception(f'Failed to download {url}')
+
+     return filename
+
+
+ def ls(dirname='.', match=[], mode='all', level=1):
+     if isinstance(level, str):
+         assert '+' in level
+         level = int(level[:-1])
+         res = []
+         for i in range(1, level + 1):
+             res.extend(ls(dirname, match=match, mode=mode, level=i))
+         return res
+
+     if dirname == '.':
+         ans = os.listdir(dirname)
+     else:
+         ans = [osp.join(dirname, x) for x in os.listdir(dirname)]
+     assert mode in ['all', 'dir', 'file']
+     assert level >= 1 and isinstance(level, int)
+     if level == 1:
+         if isinstance(match, str):
+             match = [match]
+         for m in match:
+             if len(m) == 0:
+                 continue
+             if m[0] != '!':
+                 ans = [x for x in ans if m in x]
+             else:
+                 ans = [x for x in ans if m[1:] not in x]
+         if mode == 'dir':
+             ans = [x for x in ans if osp.isdir(x)]
+         elif mode == 'file':
+             ans = [x for x in ans if not osp.isdir(x)]
+         return ans
+     else:
+         dirs = [x for x in ans if osp.isdir(x)]
+         res = []
+         for d in dirs:
+             res.extend(ls(d, match=match, mode=mode, level=level - 1))
+         return res
+
+
+ def mrlines(fname, sp='\n'):
+     f = open(fname).read().split(sp)
+     while f != [] and f[-1] == '':
+         f = f[:-1]
+     return f
+
+
+ def mwlines(lines, fname):
+     with open(fname, 'w') as fout:
+         fout.write('\n'.join(lines))
+
+
+ def md5(s):
+     hash = hashlib.new('md5')
+     if osp.exists(s):
+         with open(s, 'rb') as f:
+             for chunk in iter(lambda: f.read(2**20), b''):
+                 hash.update(chunk)
+     else:
+         hash.update(s.encode('utf-8'))
+     return str(hash.hexdigest())
+
+
+ def last_modified(pth):
+     stamp = osp.getmtime(pth)
+     m_ti = time.ctime(stamp)
+     t_obj = time.strptime(m_ti)
+     t = time.strftime('%Y%m%d%H%M%S', t_obj)[2:]
+     return t
+
+
+ def parse_file(s):
+     if osp.exists(s) and s != '.':
+         assert osp.isfile(s)
+         suffix = osp.splitext(s)[1].lower()
+         mime = mimetypes.types_map.get(suffix, 'unknown')
+         return (mime, s)
+     elif s.startswith('data:image/'):
+         # To be compatible with the OpenAI base64 format
+         content = s[11:]
+         mime = content.split(';')[0]
+         content = ';'.join(content.split(';')[1:])
+         dname = osp.join(LMUDataRoot(), 'files')
+         assert content.startswith('base64,')
+         b64 = content[7:]
+         os.makedirs(dname, exist_ok=True)
+         tgt = osp.join(dname, md5(b64) + '.png')
+         decode_base64_to_image_file(b64, tgt)
+         return parse_file(tgt)
+     elif validators.url(s):
+         suffix = osp.splitext(s)[1].lower()
+         if suffix in mimetypes.types_map:
+             mime = mimetypes.types_map[suffix]
+             dname = osp.join(LMUDataRoot(), 'files')
+             os.makedirs(dname, exist_ok=True)
+             tgt = osp.join(dname, md5(s) + suffix)
+             download_file(s, tgt)
+             return (mime, tgt)
+         else:
+             return ('url', s)
+     else:
+         return (None, s)
+
+
+ def file_size(f, unit='GB'):
+     stats = os.stat(f)
+     div_map = {
+         'GB': 2 ** 30,
+         'MB': 2 ** 20,
+         'KB': 2 ** 10,
+     }
+     return stats.st_size / div_map[unit]
+
+
+ def parquet_to_tsv(file_path):
+     data = pd.read_parquet(file_path)
+     pth = '/'.join(file_path.split('/')[:-1])
+     data_name = file_path.split('/')[-1].split('.')[0]
+     data.to_csv(osp.join(pth, f'{data_name}.tsv'), sep='\t', index=False)
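A quick sketch of the suffix-dispatched `load`/`dump` round trip (file names below are illustrative):

    records = [{'index': 0, 'answer': 'A'}, {'index': 1, 'answer': 'B'}]
    dump(records, 'demo.jsonl')              # handler chosen from the .jsonl suffix
    assert load('demo.jsonl') == records
    dump(pd.DataFrame(records), 'demo.tsv')  # DataFrame handlers: xlsx / csv / tsv
    print(load('demo.tsv').shape)            # (2, 2)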
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/log.py ADDED
@@ -0,0 +1,47 @@
+ import logging
+
+ logging.basicConfig(
+     format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S')
+
+ logger_initialized = {}
+
+
+ def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
+     logger = logging.getLogger(name)
+     if name in logger_initialized:
+         return logger
+
+     for logger_name in logger_initialized:
+         if name.startswith(logger_name):
+             return logger
+
+     stream_handler = logging.StreamHandler()
+     handlers = [stream_handler]
+
+     try:
+         import torch.distributed as dist
+         if dist.is_available() and dist.is_initialized():
+             rank = dist.get_rank()
+         else:
+             rank = 0
+     except ImportError:
+         rank = 0
+
+     if rank == 0 and log_file is not None:
+         file_handler = logging.FileHandler(log_file, file_mode)
+         handlers.append(file_handler)
+
+     formatter = logging.Formatter(
+         '[%(asctime)s] %(levelname)s - %(name)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s')
+     for handler in handlers:
+         handler.setFormatter(formatter)
+         handler.setLevel(log_level)
+         logger.addHandler(handler)
+
+     if rank == 0:
+         logger.setLevel(log_level)
+     else:
+         logger.setLevel(logging.ERROR)
+
+     logger_initialized[name] = True
+     return logger
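Usage sketch (logger name and file are illustrative); a second call with the same name returns the cached logger without duplicating handlers:

    logger = get_logger('vlmeval.demo', log_file='run.log')  # file handler attached only on rank 0
    logger.info('evaluation started')
    assert get_logger('vlmeval.demo') is logger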
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/misc.py ADDED
@@ -0,0 +1,291 @@
+ # flake8: noqa: F401, F403
+ import abc
+ import argparse
+ import csv
+ import multiprocessing as mp
+ import os
+ import os.path as osp
+ from pathlib import Path
+ import copy as cp
+ import random as rd
+ import requests
+ import shutil
+ import subprocess
+ import warnings
+ import pandas as pd
+ from collections import OrderedDict, defaultdict
+ from multiprocessing import Pool, current_process
+ from tqdm import tqdm
+ import datetime
+ import matplotlib.pyplot as plt
+ from tabulate import tabulate
+ from json import JSONDecoder
+ from huggingface_hub import scan_cache_dir
+ from huggingface_hub.utils._cache_manager import _scan_cached_repo
+ from sty import fg, bg, ef, rs
+
+
+ def modelscope_flag_set():
+     return os.environ.get('VLMEVALKIT_USE_MODELSCOPE', None) in ['1', 'True']
+
+
+ def process_punctuation(inText):
+     import re
+     outText = inText
+     punct = [
+         ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-',
+         '>', '<', '@', '`', ',', '?', '!'
+     ]
+     commaStrip = re.compile(r'(\d)(,)(\d)')
+     periodStrip = re.compile(r'(?<!\d)(\.)(?!\d)')
+     for p in punct:
+         if (p + ' ' in inText or ' ' + p in inText) or (re.search(
+                 commaStrip, inText) is not None):
+             outText = outText.replace(p, '')
+         else:
+             outText = outText.replace(p, ' ')
+     outText = periodStrip.sub('', outText)
+     return outText
+
+
+ def h2r(value):
+     if value[0] == '#':
+         value = value[1:]
+     assert len(value) == 6
+     return tuple(int(value[i:i + 2], 16) for i in range(0, 6, 2))
+
+
+ def r2h(rgb):
+     return '#%02x%02x%02x' % rgb
+
+
+ def colored(s, color):
+     if isinstance(color, str):
+         if hasattr(fg, color):
+             return getattr(fg, color) + s + fg.rs
+         color = h2r(color)
+     return fg(*color) + s + fg.rs
+
+
+ def istype(s, type):
+     if isinstance(s, type):
+         return True
+     try:
+         return isinstance(eval(s), type)
+     except Exception as _:
+         return False
+
+
+ def bincount(lst):
+     bins = defaultdict(lambda: 0)
+     for item in lst:
+         bins[item] += 1
+     return bins
+
+
+ def get_cache_path(repo_id, branch='main', repo_type='datasets'):
+     try:
+         if modelscope_flag_set():
+             from modelscope.hub.file_download import create_temporary_directory_and_cache
+             if repo_type == 'datasets':
+                 repo_type = 'dataset'
+             _, cache = create_temporary_directory_and_cache(model_id=repo_id, repo_type=repo_type)
+             cache_path = cache.get_root_location()
+             return cache_path
+         else:
+             from .file import HFCacheRoot
+             cache_path = HFCacheRoot()
+             org, repo_name = repo_id.split('/')
+             repo_path = Path(osp.join(cache_path, f'{repo_type}--{org}--{repo_name}/'))
+             hf_cache_info = _scan_cached_repo(repo_path=repo_path)
+             revs = {r.refs: r for r in hf_cache_info.revisions}
+             if branch is not None:
+                 revs = {refs: r for refs, r in revs.items() if branch in refs}
+             rev2keep = max(revs.values(), key=lambda r: r.last_modified)
+             return str(rev2keep.snapshot_path)
+     except Exception as e:
+         import logging
+         logging.warning(f'{type(e)}: {e}')
+         return None
+
+
+ def proxy_set(s):
+     import os
+     for key in ['http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY']:
+         os.environ[key] = s
+
+
+ def get_rank_and_world_size():
+     rank = int(os.environ.get('RANK', 0))
+     world_size = int(os.environ.get('WORLD_SIZE', 1))
+     return rank, world_size
+
+
+ def splitlen(s, sym='/'):
+     return len(s.split(sym))
+
+
+ def listinstr(lst, s):
+     assert isinstance(lst, list)
+     for item in lst:
+         if item in s:
+             return True
+     return False
+
+
+ def d2df(D):
+     return pd.DataFrame({x: [D[x]] for x in D})
+
+
+ def cn_string(s):
+     import re
+     if re.search(u'[\u4e00-\u9fff]', s):
+         return True
+     return False
+
+
+ try:
+     import decord
+ except ImportError:
+     pass
+
+
+ def timestr(granularity='second'):
+     s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
+     assert granularity in ['second', 'minute', 'hour', 'day']
+     if granularity == 'second':
+         return s
+     elif granularity == 'minute':
+         return s[:-2]
+     elif granularity == 'hour':
+         return s[:-4]
+     elif granularity == 'day':
+         return s[:-6]
+
+
+ def _minimal_ext_cmd(cmd, cwd=None):
+     env = {}
+     for k in ['SYSTEMROOT', 'PATH', 'HOME']:
+         v = os.environ.get(k)
+         if v is not None:
+             env[k] = v
+     env['LANGUAGE'] = 'C'
+     env['LANG'] = 'C'
+     env['LC_ALL'] = 'C'
+     out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env, cwd=cwd).communicate()[0]
+     return out
+
+
+ def githash(fallback='unknown', digits=8):
+     if digits is not None and not isinstance(digits, int):
+         raise TypeError('digits must be None or an integer')
+     try:
+         import vlmeval
+     except ImportError as e:
+         import logging
+         logging.error(f'ImportError: {str(e)}')
+         return fallback
+     try:
+         out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'], cwd=vlmeval.__path__[0])
+         sha = out.strip().decode('ascii')
+         if digits is not None:
+             sha = sha[:digits]
+     except OSError:
+         sha = fallback
+     return sha
+
+
+ def dict_merge(dct, merge_dct):
+     for k, _ in merge_dct.items():
+         if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)):  # noqa
+             dict_merge(dct[k], merge_dct[k])
+         else:
+             dct[k] = merge_dct[k]
+
+
+ def youtube_dl(idx):
+     cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4'
+     os.system(cmd)
+
+
+ def run_command(cmd):
+     if isinstance(cmd, str):
+         cmd = cmd.split()
+     return subprocess.check_output(cmd).decode()
+
+
+ def load_env():
+     import logging
+     logging.basicConfig(
+         format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s',
+         datefmt='%Y-%m-%d %H:%M:%S')
+
+     try:
+         import vlmeval
+     except ImportError:
+         logging.error('VLMEval is not installed. Failed to import environment variables from .env file. ')
+         return
+     pth = osp.realpath(vlmeval.__path__[0])
+     pth = osp.join(pth, '../.env')
+     pth = osp.realpath(pth)
+     if not osp.exists(pth):
+         logging.error(f'Did not detect the .env file at {pth}, failed to load. ')
+         return
+
+     from dotenv import dotenv_values
+     values = dotenv_values(pth)
+     for k, v in values.items():
+         if v is not None and len(v):
+             os.environ[k] = v
+     logging.info(f'API Keys successfully loaded from {pth}')
+
+
+ def pip_install_robust(package):
+     import sys
+     retry = 3
+     while retry > 0:
+         try:
+             package_base = package.split('=')[0]
+             __import__(package_base)
+             return True
+         except ImportError:
+             subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
+             retry -= 1
+     return False
+
+
+ def version_cmp(v1, v2, op='eq'):
+     from packaging import version
+     import operator
+     op_func = getattr(operator, op)
+     return op_func(version.parse(v1), version.parse(v2))
+
+
+ def toliststr(s):
+     if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
+         return [str(x) for x in eval(s)]
+     elif isinstance(s, str):
+         return [s]
+     elif isinstance(s, list):
+         return [str(x) for x in s]
+     raise NotImplementedError
+
+
+ def extract_json_objects(text, decoder=JSONDecoder()):
+     pos = 0
+     while True:
+         match = text.find('{', pos)
+         if match == -1:
+             break
+         try:
+             result, index = decoder.raw_decode(text[match:])
+             yield result
+             pos = match + index
+         except ValueError:
+             pos = match + 1
+
+
+ def get_gpu_memory():
+     import subprocess
+     try:
+         command = "nvidia-smi --query-gpu=memory.free --format=csv"
+         memory_free_info = subprocess.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
+         memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
+         return memory_free_values
+     except Exception as e:
+         print(f'{type(e)}: {str(e)}')
+         return []
+
+
+ def auto_split_flag():
+     flag = os.environ.get('AUTO_SPLIT', '0')
+     if flag == '1':
+         return True
+     _, world_size = get_rank_and_world_size()
+     try:
+         import torch
+         device_count = torch.cuda.device_count()
+         if device_count > world_size and device_count % world_size == 0:
+             return True
+         else:
+             return False
+     except Exception:
+         return False
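A few behavior notes on the small helpers above, as a sketch (inputs are illustrative):

    assert listinstr(['MMBench', 'MMMU'], 'MMBench_TEST_EN_V11')   # any-substring match
    assert toliststr("['a.jpg', 'b.jpg']") == ['a.jpg', 'b.jpg']   # stringified list -> list of str
    assert splitlen('a/b/c') == 3
    print(timestr('day'))                                          # e.g. '20250101'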
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/smp/vlm.py ADDED
@@ -0,0 +1,179 @@
+ import os
+ import io
+ import pandas as pd
+ import numpy as np
+ import string
+ from uuid import uuid4
+ import os.path as osp
+ import base64
+ from PIL import Image
+ import sys
+
+ Image.MAX_IMAGE_PIXELS = 1e9
+
+
+ def rescale_img(img, tgt=None):
+     assert isinstance(tgt, tuple) and -1 in tgt
+     w, h = img.size
+     if tgt[0] != -1:
+         new_w, new_h = tgt[0], int(tgt[0] / w * h)
+     elif tgt[1] != -1:
+         new_w, new_h = int(tgt[1] / h * w), tgt[1]
+     img = img.resize((new_w, new_h))
+     return img
+
+
+ def concat_images_vlmeval(images, target_size=-1, mode='h', return_image=False):
+     from .file import md5
+
+     ims = [Image.open(im) for im in images]
+     if target_size != -1:
+         ims = [
+             rescale_img(im, (-1, target_size) if mode == 'h' else (target_size, -1))
+             for im in ims
+         ]
+
+     ws, hs = [x.width for x in ims], [x.height for x in ims]
+     if mode == 'h':
+         new_w, new_h = sum(ws), max(hs)
+         dst = Image.new('RGB', (new_w, new_h))
+         for i, im in enumerate(ims):
+             dst.paste(im, (sum(ws[:i]), 0))
+     elif mode == 'v':
+         new_w, new_h = max(ws), sum(hs)
+         dst = Image.new('RGB', (new_w, new_h))
+         for i, im in enumerate(ims):
+             dst.paste(im, (0, sum(hs[:i])))
+     if return_image:
+         return dst
+     else:
+         _str = '\n'.join(images)
+         str_md5 = md5(_str)
+         tgt = osp.join('/tmp', str_md5 + '.jpg')
+         dst.save(tgt)
+         return tgt
+
+
+ def mmqa_display(question, target_size=512):
+     question = {k.lower(): v for k, v in question.items()}
+     keys = list(question.keys())
+     keys = [k for k in keys if k not in ['index', 'image']]
+
+     images = question['image']
+     if isinstance(images, str):
+         images = [images]
+
+     idx = question.pop('index', 'XXX')
+     print(f'INDEX: {idx}')
+
+     for im in images:
+         image = decode_base64_to_image(im, target_size=target_size)
+         display(image)  # noqa: F821
+
+     for k in keys:
+         try:
+             if not pd.isna(question[k]):
+                 print(f'{k.upper()}. {question[k]}')
+         except ValueError:
+             if False in pd.isna(question[k]):
+                 print(f'{k.upper()}. {question[k]}')
+
+
+ def encode_image_to_base64(img, target_size=-1, fmt='JPEG'):
+     # if target_size == -1, do not resize; otherwise, bound the image to (target_size, target_size)
+     if img.mode in ('RGBA', 'P'):
+         img = img.convert('RGB')
+     if target_size > 0:
+         img.thumbnail((target_size, target_size))
+     img_buffer = io.BytesIO()
+     img.save(img_buffer, format=fmt)
+     image_data = img_buffer.getvalue()
+     ret = base64.b64encode(image_data).decode('utf-8')
+     return ret
+
+
+ def encode_image_file_to_base64(image_path, target_size=-1):
+     image = Image.open(image_path)
+     return encode_image_to_base64(image, target_size=target_size)
+
+
+ def decode_base64_to_image(base64_string, target_size=-1):
+     image_data = base64.b64decode(base64_string)
+     image = Image.open(io.BytesIO(image_data))
+     if image.mode in ('RGBA', 'P'):
+         image = image.convert('RGB')
+     if target_size > 0:
+         image.thumbnail((target_size, target_size))
+     return image
+
+
+ def decode_base64_to_image_file(base64_string, image_path, target_size=-1):
+     image = decode_base64_to_image(base64_string, target_size=target_size)
+     image.save(image_path)
+
+
+ def build_option_str(option_dict):
+     s = 'There are several options: \n'
+     for c, content in option_dict.items():
+         if not pd.isna(content):
+             s += f'{c}. {content}\n'
+     return s
+
+
+ def isimg(s):
+     return osp.exists(s) or s.startswith('http')
+
+
+ def read_ok(img_path):
+     if not osp.exists(img_path):
+         return False
+     try:
+         im = Image.open(img_path)
+         assert im.size[0] > 0 and im.size[1] > 0
+         return True
+     except Exception:
+         return False
+
+
+ def gpt_key_set():
+     openai_key = os.environ.get('OPENAI_API_KEY', None)
+     return isinstance(openai_key, str) and openai_key.startswith('sk-')
+
+
+ def apiok(wrapper):
+     s = wrapper.generate('Hello!')
+     return wrapper.fail_msg not in s
+
+
+ def circular_pred(df, extract_func=None):
+     if extract_func is None:
+         extract_func = lambda x: x  # noqa: E731
+     df = df.sort_values('index')
+     from vlmeval.utils import can_infer_option
+
+     shift = int(1e6)
+
+     choices = [extract_func(x) for x in df['prediction']]
+     pred_map = {i: c for i, c in zip(df['index'], choices)}
+     flag_map = {i: True for i in pred_map if i < 1e6}
+     valid_map = {i: True for i in pred_map if i < 1e6}
+     for i in df['index']:
+         if i >= shift and pred_map[i] and pred_map[i - shift]:
+             if (pred_map[i] not in list(string.ascii_uppercase)
+                     or pred_map[i - shift] not in list(string.ascii_uppercase)):
+                 valid_map[i % shift] = False
+                 continue
+             if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1:
+                 continue
+             else:
+                 flag_map[i % shift] = False
+     flag_map = {k: v for k, v in flag_map.items() if valid_map[k]}
+     flags = list(flag_map.values())
+     return np.mean(flags)
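A base64 round-trip sketch for the codec helpers above (sizes and paths are illustrative):

    img = Image.new('RGB', (64, 64), 'red')
    b64 = encode_image_to_base64(img, target_size=32)   # thumbnail to fit within (32, 32)
    restored = decode_base64_to_image(b64)
    assert max(restored.size) <= 32
    decode_base64_to_image_file(b64, '/tmp/demo.png')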
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .matching_util import can_infer, can_infer_option, can_infer_text
+ from .mp_util import track_progress_rich
+
+
+ __all__ = [
+     'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich',
+ ]
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/matching_util.py ADDED
@@ -0,0 +1,69 @@
+ import string
+ import copy as cp
+ import os
+ from ..smp import *
+
+
+ def can_infer_option(answer, choices):
+     verbose = os.environ.get('VERBOSE', 0)
+     # `choices` is a dictionary mapping option letters to option contents
+     if 'Failed to obtain answer via API' in answer:
+         return False
+
+     reject_to_answer = [
+         "Sorry, I can't help with images of people yet.",
+         "I can't process this file.",
+         "I'm sorry, but without the image provided",
+         'Cannot determine the answer'
+     ]
+     for err in reject_to_answer:
+         if err in answer:
+             return 'Z'
+
+     def count_choice(splits, choices, prefix='', suffix=''):
+         cnt = 0
+         for c in choices:
+             if prefix + c + suffix in splits:
+                 cnt += 1
+         return cnt
+
+     answer_mod = cp.copy(answer)
+     chars = '.()[],:;!*#{}'
+     for c in chars:
+         answer_mod = answer_mod.replace(c, ' ')
+
+     splits = [x.strip() for x in answer_mod.split()]
+     count = count_choice(splits, choices)
+
+     if count == 1:
+         for ch in choices:
+             if 'A' in splits and len(splits) > 3 and verbose:
+                 logger = get_logger('Evaluation')
+                 logger.info(f'A might be a quantifier in the string: {answer}.')
+                 return False
+             if ch in splits:
+                 return ch
+     elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
+         return 'Z'
+     return False
+
+
+ def can_infer_text(answer, choices):
+     answer = answer.lower()
+     assert isinstance(choices, dict)
+     for k in choices:
+         assert k in string.ascii_uppercase
+         choices[k] = str(choices[k]).lower()
+     cands = []
+     for k in choices:
+         if choices[k] in answer:
+             cands.append(k)
+     if len(cands) == 1:
+         return cands[0]
+     return False
+
+
+ def can_infer(answer, choices):
+     answer = str(answer)
+     copt = can_infer_option(answer, choices)
+     return copt if copt else can_infer_text(answer, choices)
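How the two inference paths compose in `can_infer` (toy choices, illustrative):

    choices = {'A': 'cat', 'B': 'dog', 'C': 'bird'}
    assert can_infer('The answer is B.', choices) == 'B'      # option letter found in the text
    assert can_infer('It is clearly a dog.', choices) == 'B'  # falls back to matching option contents
    assert can_infer('no idea', choices) is False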
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py ADDED
@@ -0,0 +1,72 @@
+ import os.path as osp
+ import time
+ from typing import Callable, Iterable
+
+ from ..smp import load, dump
+
+
+ def track_progress_rich(
+         func: Callable,
+         tasks: Iterable = tuple(),
+         nproc: int = 1,
+         save=None,
+         keys=None,
+         **kwargs) -> list:
+
+     from concurrent.futures import ThreadPoolExecutor
+     from tqdm import tqdm
+     if save is not None:
+         assert osp.exists(osp.dirname(save)) or osp.dirname(save) == ''
+         if not osp.exists(save):
+             dump({}, save)
+     if keys is not None:
+         assert len(keys) == len(tasks)
+     if not callable(func):
+         raise TypeError('func must be a callable object')
+     if not isinstance(tasks, Iterable):
+         raise TypeError(
+             f'tasks must be an iterable object, but got {type(tasks)}')
+     assert nproc > 0, 'nproc must be a positive number'
+     res = load(save) if save is not None else {}
+     results = [None for _ in range(len(tasks))]
+
+     with ThreadPoolExecutor(max_workers=nproc) as executor:
+         futures = []
+
+         for inputs in tasks:
+             if not isinstance(inputs, (tuple, list, dict)):
+                 inputs = (inputs, )
+             if isinstance(inputs, dict):
+                 future = executor.submit(func, **inputs)
+             else:
+                 future = executor.submit(func, *inputs)
+             futures.append(future)
+
+         unfinished = set(range(len(tasks)))
+         pbar = tqdm(total=len(unfinished))
+         while len(unfinished):
+             new_finished = set()
+             for idx in unfinished:
+                 if futures[idx].done():
+                     results[idx] = futures[idx].result()
+                     new_finished.add(idx)
+                     if keys is not None:
+                         res[keys[idx]] = results[idx]
+             if len(new_finished):
+                 if save is not None:
+                     dump(res, save)
+                 pbar.update(len(new_finished))
+                 for k in new_finished:
+                     unfinished.remove(k)
+             time.sleep(0.1)
+         pbar.close()
+
+     if save is not None:
+         dump(res, save)
+     return results
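A usage sketch (the worker and cache file are illustrative); results come back in task order, and finished items are checkpointed to `save` keyed by `keys`:

    def square(x):
        time.sleep(0.1)  # simulate work
        return x * x

    out = track_progress_rich(square, tasks=list(range(8)), nproc=4,
                              save='cache.json', keys=[str(i) for i in range(8)])
    assert out == [i * i for i in range(8)]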
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/utils/result_transfer.py ADDED
@@ -0,0 +1,97 @@
+ from ..smp import *
+ from ..dataset.utils.judge_util import build_judge
+ from ..dataset.utils.multiple_choice import extract_answer_from_item
+ from .matching_util import can_infer
+ from .mp_util import track_progress_rich
+
+
+ def MMMU_result_transfer(result_path):
+     res = {}
+     result_data = load(result_path)
+     mcq = result_data['A'].notna()
+     lt = len(result_data)
+     for i in range(lt):
+         line = result_data.iloc[i]
+         if mcq[i]:
+             options = {
+                 cand: line[cand]
+                 for cand in string.ascii_uppercase
+                 if cand in line and not pd.isna(line[cand])
+             }
+             prediction = line['prediction']
+             infer_prediction = can_infer(prediction, options)
+             res[line['id']] = infer_prediction
+         else:
+             res[line['id']] = line['prediction']
+     result_json = result_path.replace('.xlsx', '.json')
+     dump(res, result_json)
+     return result_json
+
+
+ def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs):
+     logger = get_logger('Evaluation')
+     nproc = judge_kwargs.pop('nproc', 4)
+
+     rd.seed(2680)
+     suffix = eval_file.split('.')[-1]
+     model = judge_kwargs['model']
+     assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
+     name_str_map = {
+         'chatgpt-0125': 'openai',
+         'gpt-4-0125': 'gpt4'
+     }
+     name_str = name_str_map[model] if model in name_str_map else model
+
+     if model == 'exact_matching':
+         model = None
+     elif gpt_key_set():
+         model = build_judge(**judge_kwargs)
+         if not model.working():
+             logger.error('The OPENAI API is not working properly, will use exact matching for evaluation')
+             model = None
+     else:
+         logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
+         model = None
+
+     logger.info(f'Evaluating {eval_file}')
+     result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl')
+     result = {}
+     if osp.exists(result_file):
+         result = load(result_file)
+
+     data = load(eval_file)
+     assert 'index' in data, 'Essential columns missing in the eval_file.'
+
+     data = data.sort_values(by='index')
+     data['prediction'] = [str(x) for x in data['prediction']]
+     for k in data.keys():
+         data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
+
+     idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))}
+     idx2lines = {k: v for k, v in idx2lines.items() if k not in result}
+
+     indices = list(idx2lines.keys())
+     lines = [idx2lines[i] for i in indices]
+     tups = [(model, line) for line in lines]
+     res = track_progress_rich(
+         extract_answer_from_item,
+         tups,
+         nproc=nproc,
+         chunksize=nproc,
+         save=result_file,
+         keys=indices)
+
+     for i, r in zip(indices, res):
+         if i in result:
+             assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log']
+         else:
+             result[i] = r
+
+     indices = list(data['index'])
+     data['opt'] = [result[i]['opt'] for i in data['index']]
+     data['log'] = [result[i]['log'] for i in data['index']]
+
+     # dump the per-question options and logs as the submission file
+     output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')
+     dump(data, output_path)
+     return output_path
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py ADDED
@@ -0,0 +1,6 @@
+ import torch
+
+ torch.set_grad_enabled(False)
+ torch.manual_seed(1234)
+ from .base import BaseModel
+ from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6, MiniCPM_o_2_6
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/vlm/base.py ADDED
@@ -0,0 +1,198 @@
+ from ..smp import *
+ from ..dataset import img_root_map, DATASET_TYPE
+ from abc import abstractmethod
+
+
+ class BaseModel:
+
+     INTERLEAVE = False
+     allowed_types = ['text', 'image', 'video']
+
+     def __init__(self):
+         self.dump_image_func = None
+
+     def use_custom_prompt(self, dataset):
+         """Whether to use a custom prompt for the given dataset.
+
+         Args:
+             dataset (str): The name of the dataset.
+
+         Returns:
+             bool: Whether to use a custom prompt. If True, will call `build_prompt` of the VLM to build the prompt.
+                 Defaults to False.
+         """
+         return False
+
+     @abstractmethod
+     def build_prompt(self, line, dataset):
+         """Build custom prompts for a specific dataset. Called only if `use_custom_prompt` returns True.
+
+         Args:
+             line (line of pd.DataFrame): The raw input line.
+             dataset (str): The name of the dataset.
+
+         Returns:
+             str: The built message.
+         """
+         raise NotImplementedError
+
+     def set_dump_image(self, dump_image_func):
+         self.dump_image_func = dump_image_func
+
+     def dump_image(self, line, dataset):
+         return self.dump_image_func(line)
+
+     @abstractmethod
+     def generate_inner(self, message, dataset=None):
+         raise NotImplementedError
+
+     def check_content(self, msgs):
+         """Check the content type of the input. Four types are allowed: str, dict, liststr, listdict."""
+         if isinstance(msgs, str):
+             return 'str'
+         if isinstance(msgs, dict):
+             return 'dict'
+         if isinstance(msgs, list):
+             types = [self.check_content(m) for m in msgs]
+             if all(t == 'str' for t in types):
+                 return 'liststr'
+             if all(t == 'dict' for t in types):
+                 return 'listdict'
+         return 'unknown'
+
+     def preproc_content(self, inputs):
+         """Convert the raw input messages to a list of dicts.
+
+         Args:
+             inputs: raw input messages.
+
+         Returns:
+             list(dict): The preprocessed input messages. Will return None if it fails to preprocess the input.
+         """
+         if self.check_content(inputs) == 'str':
+             return [dict(type='text', value=inputs)]
+         elif self.check_content(inputs) == 'dict':
+             assert 'type' in inputs and 'value' in inputs
+             return [inputs]
+         elif self.check_content(inputs) == 'liststr':
+             res = []
+             for s in inputs:
+                 mime, pth = parse_file(s)
+                 if mime is None or mime == 'unknown':
+                     res.append(dict(type='text', value=s))
+                 else:
+                     res.append(dict(type=mime.split('/')[0], value=pth))
+             return res
+         elif self.check_content(inputs) == 'listdict':
+             for item in inputs:
+                 assert 'type' in item and 'value' in item
+                 mime, s = parse_file(item['value'])
+                 if mime is None:
+                     assert item['type'] == 'text'
+                 else:
+                     assert mime.split('/')[0] == item['type']
+                     item['value'] = s
+             return inputs
+         else:
+             return None
+
+     def generate(self, message, dataset=None):
+         """Generate the output message.
+
+         Args:
+             message (list[dict]): The input message.
+             dataset (str, optional): The name of the dataset. Defaults to None.
+
+         Returns:
+             str: The generated message.
+         """
+         assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}'
+         message = self.preproc_content(message)
+         assert message is not None and self.check_content(message) == 'listdict'
+         for item in message:
+             assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
+         return self.generate_inner(message, dataset)
+
+     def chat(self, messages, dataset=None):
+         """The main function for multi-turn chatting. Will call `chat_inner` with the preprocessed input messages."""
+         assert hasattr(self, 'chat_inner'), 'The API model should have the `chat_inner` method. '
+         for msg in messages:
+             assert isinstance(msg, dict) and 'role' in msg and 'content' in msg, msg
+             assert self.check_content(msg['content']) in ['str', 'dict', 'liststr', 'listdict'], msg
+             msg['content'] = self.preproc_content(msg['content'])
+
+         while len(messages):
+             try:
+                 return self.chat_inner(messages, dataset=dataset)
+             except Exception as e:
+                 logging.info(f'{type(e)}: {e}')
+                 messages = messages[1:]
+                 while len(messages) and messages[0]['role'] != 'user':
+                     messages = messages[1:]
+                 continue
+         return 'Chat Mode: Failed with all possible conversation turns.'
+
+     def message_to_promptimg(self, message, dataset=None):
+         assert not self.INTERLEAVE
+         model_name = self.__class__.__name__
+         warnings.warn(
+             f'Model {model_name} does not support interleaved input. '
+             'Will use the first image and aggregated texts as prompt. ')
+         num_images = len([x for x in message if x['type'] == 'image'])
+         if num_images == 0:
+             prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+             image = None
+         else:
+             prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+             images = [x['value'] for x in message if x['type'] == 'image']
+             if dataset == 'BLINK':
+                 image = concat_images_vlmeval(images, target_size=512)
+             else:
+                 image = images[0]
+         return prompt, image
+
+     def message_to_promptvideo(self, message):
+         if self.VIDEO_LLM:
+             num_videos = len([x for x in message if x['type'] == 'video'])
+             if num_videos == 0:
+                 prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+                 video = None
+             else:
+                 prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+                 video = [x['value'] for x in message if x['type'] == 'video'][0]
+             return prompt, video
+         else:
+             logging.critical('Model does not support video input.')
+             raise NotImplementedError
+
+     def message_to_promptvideo_withrole(self, message, dataset=None):
+         if self.VIDEO_LLM:
+             system, user, assistant, video_list = '', '', '', []
+             for msg in message:
+                 if msg['type'] == 'text':
+                     if 'role' in msg and msg['role'] == 'system':
+                         system += msg['value']
+                     elif 'role' in msg and msg['role'] == 'assistant':
+                         assistant += msg['value']
+                     else:
+                         user += msg['value']
+                 elif msg['type'] == 'video':
+                     video_list.append(msg['value'])
+             question = {
+                 'system': system,
+                 'user': user,
+                 'assistant': assistant
+             }
+             if assistant == '':
+                 if listinstr(['MCQ'], DATASET_TYPE(dataset)):
+                     question['assistant'] = 'Best Option: ('
+                 else:
+                     del question['assistant']
+             if len(video_list) > 1:
+                 print('VLMEvalKit only supports a single video as input; using the first video.')
+             video = video_list[0]
+             return question, video
+         else:
+             logging.critical('Model does not support video input.')
+             raise NotImplementedError
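To illustrate the message-normalization contract, a sketch with a hypothetical subclass (`DummyModel` is not part of the codebase):

    class DummyModel(BaseModel):
        def generate_inner(self, message, dataset=None):
            return ' | '.join(x['value'] for x in message)

    m = DummyModel()
    print(m.preproc_content('hello'))   # [{'type': 'text', 'value': 'hello'}]
    print(m.generate('hello'))          # 'hello'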
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py ADDED
@@ -0,0 +1,727 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import random
4
+ import numpy as np
5
+ from PIL import Image
6
+ from transformers import AutoModel, AutoTokenizer
7
+
8
+ from .base import BaseModel
9
+ from ..smp import *
10
+ from ..dataset import DATASET_TYPE, DATASET_MODALITY
11
+
12
+ import re
13
+
14
+
15
+ class MiniCPM_V(BaseModel):
16
+
17
+ INSTALL_REQ = False
18
+ INTERLEAVE = False
19
+
20
+ def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs):
21
+ assert model_path is not None
22
+ self.model_path = model_path
23
+ print(f'load from {self.model_path}')
24
+ self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
25
+ self.model = self.model.to(dtype=torch.bfloat16)
26
+ self.model.eval().cuda()
27
+ self.kwargs = kwargs
28
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
29
+ torch.cuda.empty_cache()
30
+ self.num_beams = 3
31
+
32
+ def use_custom_prompt(self, dataset):
33
+ assert dataset is not None
34
+ if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
35
+ # For Multi-Turn we don't have custom prompt
36
+ return False
37
+ return False
38
+
39
+ def build_prompt(self, line, dataset=None):
40
+ assert dataset is None or isinstance(dataset, str)
41
+ assert self.use_custom_prompt(dataset)
42
+ tgt_path = self.dump_image(line, dataset)
43
+
44
+ question = line['question']
45
+ options = {
46
+ cand: line[cand]
47
+ for cand in string.ascii_uppercase
48
+ if cand in line and not pd.isna(line[cand])
49
+ }
50
+ options_prompt = 'Options:\n'
51
+ for key, item in options.items():
52
+ options_prompt += f'{key}. {item}\n'
53
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
54
+ prompt = ''
55
+ if hint is not None:
56
+ prompt += f'Hint: {hint}\n'
57
+ prompt += f'{question}\n'
58
+ if len(options):
59
+ prompt += options_prompt
60
+ prompt = 'Study the image carefully and pick the option associated with the correct answer. \
61
+ Focus solely on selecting the option and avoid including any other content.\n' + prompt
62
+ message = [dict(type='text', value=prompt)]
63
+ message.extend([dict(type='image', value=p) for p in tgt_path])
64
+
65
+ return message
66
+
67
+ def generate_inner(self, message, dataset=None):
68
+ prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
69
+ image = Image.open(image_path).convert('RGB')
70
+ msgs = [{'role': 'user', 'content': prompt}]
71
+ if DATASET_TYPE(dataset) == 'MCQ':
72
+ max_new_tokens = 20
73
+ elif DATASET_TYPE(dataset) == 'Y/N':
74
+ max_new_tokens = 100
75
+ else:
76
+ max_new_tokens = 1024
77
+
78
+ default_kwargs = dict(
79
+ max_new_tokens=max_new_tokens,
80
+ sampling=False,
81
+ num_beams=self.num_beams
82
+ )
83
+ default_kwargs.update(self.kwargs)
84
+ res, _, _ = self.model.chat(
85
+ image=image,
86
+ msgs=msgs,
87
+ context=None,
88
+ tokenizer=self.tokenizer,
89
+ **default_kwargs
90
+ )
91
+ return res
92
+
93
+
94
+ class MiniCPM_Llama3_V(BaseModel):
95
+
96
+ INSTALL_REQ = False
97
+ INTERLEAVE = True
98
+
99
+ def __init__(self, model_path='openbmb/MiniCPM-Llama3-V-2_5', **kwargs):
100
+ assert model_path is not None
101
+ self.model_path = model_path
102
+ print(f'load from {self.model_path}')
103
+ self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
104
+ self.model = self.model.to(dtype=torch.float16)
105
+ self.model.eval().cuda()
106
+ self.kwargs = kwargs
107
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
108
+ torch.cuda.empty_cache()
109
+ self.num_beams = 3
110
+ self.options_system_prompt = ('Carefully read the following question and select the letter corresponding '
111
+ 'to the correct answer. Highlight the applicable choices without giving '
112
+ 'explanations.')
113
+ self.wo_options_system_prompt = 'Carefully read the following question. Answer the question directly.'
114
+ self.detail_system_prompt = 'Answer this question in detail.'
115
+ self.vqa_prompt = 'Answer the question using a single word or phrase.'
116
+
117
+ def use_custom_prompt(self, dataset):
118
+ if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)):
119
+ return True
120
+ elif dataset is not None and listinstr(['HallusionBench'], dataset):
121
+ return True
122
+ return False
123
+
124
+ def build_prompt(self, line, dataset=None):
125
+ if isinstance(line, int):
126
+ line = self.data.iloc[line]
127
+
128
+ tgt_path = self.dump_image(line, dataset)
129
+ system_prompt = ''
130
+
131
+ question = line['question']
132
+ if DATASET_TYPE(dataset) == 'MCQ':
133
+ options = {
134
+ cand: line[cand]
135
+ for cand in string.ascii_uppercase
136
+ if cand in line and not pd.isna(line[cand])
137
+ }
138
+ options_prompt = 'Options:\n'
139
+ for key, item in options.items():
140
+ options_prompt += f'{key}. {item}\n'
141
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
142
+ prompt = ''
143
+ if hint is not None:
144
+ prompt += f'Hint: {hint}\n'
145
+ prompt += f'Question: {question}\n'
146
+ if len(options):
147
+ prompt += options_prompt
148
+ system_prompt = self.options_system_prompt + '\nPlease just indicate your choice.'
149
+ else:
150
+ system_prompt = self.wo_options_system_prompt
151
+ if 'MMMU' in dataset: # Corner Case
152
+ prompt = system_prompt + '\n' + prompt
153
+ system_prompt = ''
154
+ elif dataset is not None and listinstr(['HallusionBench'], dataset):
155
+ question = line['question'] + ' Yes or No?'
156
+ prompt = question
157
+ elif dataset is not None and listinstr(['MME'], dataset):
158
+ question = line['question'] + ' Yes or No?'
159
+ prompt = question
160
+ elif dataset is not None and listinstr(['OCRBench'], dataset):
161
+ system_prompt = self.vqa_prompt
162
+ question = line['question']
163
+ prompt = question
164
+ elif DATASET_TYPE(dataset) == 'VQA':
165
+ if listinstr(['LLaVABench', 'MMLongBench_DOC'], dataset):
166
+ system_prompt = ''
167
+ prompt = question
168
+ elif listinstr(['MMVet'], dataset):
169
+ system_prompt = self.detail_system_prompt
170
+ prompt = question
171
+ else:
172
+ system_prompt = self.vqa_prompt
173
+ prompt = question
174
+
175
+ msgs = []
176
+ if system_prompt:
177
+ msgs.append(dict(type='text', value=system_prompt))
178
+ if isinstance(tgt_path, list):
179
+ msgs.extend([dict(type='image', value=p) for p in tgt_path])
180
+ else:
181
+ msgs.append(dict(type='image', value=tgt_path))  # append, so any system prompt added above is preserved
182
+ msgs.append(dict(type='text', value=prompt))
183
+ return msgs
184
+
185
+ def generate_inner(self, message, dataset=None):
186
+ if DATASET_TYPE(dataset) == 'MCQ':
187
+ max_new_tokens = 200
188
+ elif DATASET_TYPE(dataset) == 'Y/N':
189
+ max_new_tokens = 3
190
+ else:
191
+ max_new_tokens = 1024
192
+
193
+ default_kwargs = dict(
194
+ max_new_tokens=max_new_tokens,
195
+ sampling=False,
196
+ num_beams=self.num_beams,
197
+ )
198
+ default_kwargs.update(self.kwargs)
199
+
200
+ content = []
201
+ for x in message:
202
+ if x['type'] == 'text':
203
+ content.append(x['value'])
204
+ elif x['type'] == 'image':
205
+ image = Image.open(x['value']).convert('RGB')
206
+ content.append(image)
207
+ msgs = [{'role': 'user', 'content': content}]
208
+
209
+ res = self.model.chat(
210
+ msgs=msgs,
211
+ context=None,
212
+ image=None,
213
+ tokenizer=self.tokenizer,
214
+ **default_kwargs
215
+ )
216
+
217
+ if isinstance(res, tuple) and len(res) > 0:
218
+ res = res[0]
219
+ return res
220
+
221
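+ # Multi-turn variant of generate_inner: preserves the role of every turn and
+ # loads referenced images so the whole conversation is replayed via model.chat().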
+ def chat_inner(self, message, dataset=None):
222
+ max_new_tokens = 1024
223
+
224
+ default_kwargs = dict(
225
+ max_new_tokens=max_new_tokens,
226
+ sampling=False,
227
+ num_beams=self.num_beams,
228
+ )
229
+ default_kwargs.update(self.kwargs)
230
+
231
+ msgs = []
232
+ for msg in message:
233
+ content = []
234
+ if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
235
+ msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
236
+ msgs.append(msg_new)
237
+ continue
238
+
239
+ for x in msg['content']:
240
+ if x['type'] == 'text':
241
+ content.append(x['value'])
242
+ elif x['type'] == 'image':
243
+ image = Image.open(x['value']).convert('RGB')
244
+ content.append(image)
245
+ msg_new = {'role': msg['role'], 'content': content}
246
+ msgs.append(msg_new)
247
+
248
+ res = self.model.chat(
249
+ msgs=msgs,
250
+ context=None,
251
+ image=None,
252
+ tokenizer=self.tokenizer,
253
+ **default_kwargs)
254
+
255
+ if isinstance(res, tuple) and len(res) > 0:
256
+ res = res[0]
257
+ return res
258
+
259
+
260
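+ # Wrapper for MiniCPM-V 2.6: adds optional chain-of-thought prompting
+ # (use_cot) and random image upsizing (use_upsize), with all RNG seeds fixed
+ # so the upsizing is reproducible across runs.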
+ class MiniCPM_V_2_6(BaseModel):
261
+ INSTALL_REQ = False
262
+ INTERLEAVE = True
263
+
264
+ def __init__(self, model_path='openbmb/MiniCPM-V-2_6', **kwargs):
265
+ random.seed(0)
266
+ np.random.seed(0)
267
+ torch.manual_seed(0)
268
+ torch.cuda.manual_seed_all(0)
269
+
270
+ assert model_path is not None
271
+ self.model_path = model_path
272
+ print(f'load from path {self.model_path}')
273
+ self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
274
+ self.model = self.model.to(dtype=torch.bfloat16)
275
+ self.model.eval().cuda()
276
+
277
+ self.kwargs = kwargs
278
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
279
+ torch.cuda.empty_cache()
280
+ self.num_beams = 3
281
+
282
+ self.options_suffix_prompt = '''\nAnswer with the option's letter from the given choices directly.'''
283
+ self.wo_options_system_prompt = 'Carefully read the following question. Answer the question directly.'
284
+ self.detail_system_prompt = 'Answer this question in detail.'
285
+ self.vqa_prompt = 'Answer the question using a single word or phrase.'
286
+
287
+ self.multi_choice_cot_prompt = ('''Carefully read the following multichoice question, solve it step '''
288
+ '''by step and finally pick the option associated with the correct '''
289
+ '''answer in the format of "Answer: selected option".\n\n''')
290
+ self.short_ans_cot_prompt = ('''Read the following question carefully, solve it step by step, and '''
291
+ '''then output the final answer in the format of "Answer: single number '''
292
+ '''or single word or phrase".\n\n''')
293
+
294
+ def use_custom_prompt(self, dataset=None):
295
+ if dataset is None:
296
+ return False
297
+ if DATASET_TYPE(dataset) in ['MCQ', 'VQA', 'Y/N']:
298
+ return True
299
+ return False
300
+
301
+ def use_cot(self, dataset=None):
302
+ if dataset is None:
303
+ return False
304
+ if listinstr(['MMMU', 'HallusionBench', 'OCRBench', 'ChartQA'], dataset):
305
+ return True
306
+ elif listinstr(['MathVista', 'MMVet', 'MMBench', 'MMStar', 'AI2D', 'RealWorldQA',
307
+ 'POPE', 'ScienceQA', 'TextVQA', 'DocVQA'], dataset):
308
+ return False
309
+ else:
310
+ return False
311
+
312
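+ # Datasets whose images get enlarged before inference; see generate_inner,
+ # which upscales anything below 1344x1344 pixels in area so the slicing
+ # scheme receives more detail.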
+ def use_upsize(self, dataset=None):
313
+ if dataset is None:
314
+ return False
315
+ if listinstr(['MMVet', 'MMBench', 'MMStar', 'AI2D', 'OCRBench'], dataset):
316
+ return True
317
+ else:
318
+ return False
319
+
320
+ def build_prompt(self, line, dataset=None):
321
+ if isinstance(line, int):
322
+ line = self.data.iloc[line]
323
+
324
+ tgt_path = self.dump_image(line, dataset)
325
+ system_prompt, prompt = '', ''
326
+
327
+ question = line['question']
328
+
329
+ if not self.use_cot(dataset):
330
+ if DATASET_TYPE(dataset) == 'MCQ':
331
+ options = {
332
+ cand: line[cand]
333
+ for cand in string.ascii_uppercase
334
+ if cand in line and not pd.isna(line[cand])
335
+ }
336
+ options_prompt = 'Options:\n'
337
+ for key, item in options.items():
338
+ options_prompt += f'{key}. {item}\n'
339
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
340
+ if hint is not None:
341
+ prompt += f'Hint: {hint}\n'
342
+ prompt += f'Question: {question}\n'
343
+ if len(options):
344
+ prompt += options_prompt
345
+ prompt += self.options_suffix_prompt
346
+ else:
347
+ system_prompt = self.wo_options_system_prompt
348
+
349
+ if 'MMMU' in dataset:
350
+ if len(system_prompt) > 0:
351
+ prompt = system_prompt + '\n' + prompt
352
+ system_prompt = ''
353
+ elif dataset is not None and listinstr(['HallusionBench'], dataset):
354
+ question += ' Yes or No?'
355
+ prompt = question
356
+ elif dataset is not None and listinstr(['OCRBench'], dataset):
357
+ system_prompt = self.vqa_prompt
358
+ prompt = question
359
+ elif DATASET_TYPE(dataset) == 'VQA':
360
+ if listinstr(['LLaVABench'], dataset):
361
+ system_prompt = ''
362
+ elif listinstr(['MMVet'], dataset):
363
+ system_prompt = self.detail_system_prompt
364
+ else:
365
+ system_prompt = self.vqa_prompt
366
+ prompt = question
367
+ else:
368
+ prompt = question
369
+ else:
370
+ has_options = True
371
+ if DATASET_TYPE(dataset) == 'MCQ':
372
+ options = {
373
+ cand: line[cand]
374
+ for cand in string.ascii_uppercase
375
+ if cand in line and not pd.isna(line[cand])
376
+ }
377
+ options_prompt = ''
378
+ for key, item in options.items():
379
+ options_prompt += f'{key}. {item}\n'
380
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
381
+ if hint is not None:
382
+ prompt += f'Hint: {hint}\n'
383
+ prompt += f'{question}\n'
384
+
385
+ if len(options):
386
+ prompt += options_prompt
387
+ else:
388
+ has_options = False
389
+
390
+ if 'MMMU' in dataset:
391
+ if len(system_prompt) > 0:
392
+ prompt = system_prompt + '\n' + prompt
393
+ system_prompt = ''
394
+ else:
395
+ prompt = question
396
+
397
+ if DATASET_TYPE(dataset) in ['MCQ', 'Y/N', 'VQA']:
398
+ if DATASET_TYPE(dataset) == 'MCQ':
399
+ if has_options:
400
+ prompt = self.multi_choice_cot_prompt + prompt
401
+ else:
402
+ prompt = self.short_ans_cot_prompt + prompt
403
+ elif DATASET_TYPE(dataset) == 'Y/N':
404
+ prompt = self.short_ans_cot_prompt + prompt
405
+ else:
406
+ prompt = self.short_ans_cot_prompt + prompt
407
+
408
+ msgs = []
409
+ if system_prompt:
410
+ msgs.append(dict(type='text', value=system_prompt))
411
+ if isinstance(tgt_path, list):
412
+ msgs.extend([dict(type='image', value=p) for p in tgt_path])
413
+ else:
414
+ msgs.append(dict(type='image', value=tgt_path))  # append, so any system prompt added above is preserved
415
+ msgs.append(dict(type='text', value=prompt))
416
+
417
+ return msgs
418
+
419
+ def generate_inner(self, message, dataset=None):
420
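+ # Video inputs arrive as many frames: disable slicing and image ids and raise
+ # the input-length cap so long frame sequences fit into the context window.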
+ if DATASET_MODALITY(dataset) == 'VIDEO':
421
+ max_slice_nums = 1
422
+ use_image_id = False
423
+ max_inp_length = 2048 * 10
424
+ else:
425
+ max_slice_nums = None
426
+ use_image_id = True
427
+ max_inp_length = 8192
428
+
429
+ max_new_tokens = 2048
430
+ default_kwargs = dict(
431
+ max_new_tokens=max_new_tokens,
432
+ sampling=False,
433
+ num_beams=self.num_beams,
434
+ )
435
+ default_kwargs.update(self.kwargs)
436
+
437
+ content = []
438
+
439
+ for x in message:
440
+ if x['type'] == 'text':
441
+ content.append(x['value'])
442
+ elif x['type'] == 'image':
443
+ image = Image.open(x['value']).convert('RGB')
444
+ if not self.use_upsize(dataset):
445
+ content.append(image)
446
+ else:
447
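+ # Random upsize: choose a new width between the original and the width that
+ # brings the area up to 1344x1344, keeping the aspect ratio intact.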
+ img_width, img_height = image.width, image.height
448
+ if (img_width * img_height) >= (1344 * 1344):
449
+ content.append(image)
450
+ else:
451
+ ratio = math.sqrt((1344 * 1344) / (img_width * img_height))
452
+ max_img_width = int(img_width * ratio)
453
+ new_img_width = random.randint(img_width, max_img_width)
454
+ new_img_height = int(new_img_width / img_width * img_height)
455
+ resized_image = image.resize((new_img_width, new_img_height))
456
+ content.append(resized_image)
457
+ msgs = [{'role': 'user', 'content': content}]
458
+
459
+ res = self.model.chat(
460
+ image=None,
461
+ msgs=msgs,
462
+ context=None,
463
+ tokenizer=self.tokenizer,
464
+ max_inp_length=max_inp_length,
465
+ use_image_id=use_image_id,
466
+ max_slice_nums=max_slice_nums,
467
+ **default_kwargs
468
+ )
469
+
470
+ if isinstance(res, tuple) and len(res) > 0:
471
+ res = res[0]
472
+
473
+ return res
474
+
475
+
476
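+ # Wrapper for MiniCPM-o 2.6 in vision-only mode (audio and TTS heads are not
+ # initialized). The official hub checkpoint is pinned to 3 beams; NUM_BEAMS
+ # applies to other checkpoints, and PENALTY sets the repetition penalty.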
+ class MiniCPM_o_2_6(BaseModel):
477
+ INSTALL_REQ = False
478
+ INTERLEAVE = True
479
+
480
+ def __init__(self, model_path='openbmb/MiniCPM-o-2_6', **kwargs):
481
+ random.seed(0)
482
+ np.random.seed(0)
483
+ torch.manual_seed(0)
484
+ torch.cuda.manual_seed_all(0)
485
+
486
+ assert model_path is not None
487
+ self.model_path = model_path
488
+ print(f'load from path {self.model_path}')
489
+ self.model = AutoModel.from_pretrained(
490
+ self.model_path,
491
+ trust_remote_code=True,
492
+ attn_implementation='sdpa',
493
+ torch_dtype=torch.bfloat16,
494
+ init_vision=True,
495
+ init_audio=False,
496
+ init_tts=False
497
+ )
498
+
499
+ self.model.eval().cuda()
500
+
501
+ self.kwargs = kwargs
502
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
503
+ torch.cuda.empty_cache()
504
+
505
+ num_beams = int(os.getenv("NUM_BEAMS", "3"))
506
+ self.num_beams = 3 if self.model_path == 'openbmb/MiniCPM-o-2_6' else num_beams
507
+
508
+ repetition_penalty = float(os.getenv("PENALTY", "1.2"))
509
+ self.repetition_penalty = repetition_penalty
510
+
511
+ self.options_suffix_prompt = '''\nAnswer with the option's letter from the given choices directly.'''
512
+ self.wo_options_system_prompt = 'Carefully read the following question. Answer the question directly.'
513
+ self.detail_system_prompt = 'Answer this question in detail.'
514
+ self.vqa_prompt = 'Answer the question using a single word or phrase.'
515
+
516
+ self.multi_choice_cot_prompt = ('''Carefully read the following multichoice question, solve it step '''
517
+ '''by step and finally pick the option associated with the correct '''
518
+ '''answer in the format of "Answer: selected option".\n\n''')
519
+ self.short_ans_cot_prompt = ('''Read the following question carefully, solve it step by step, and '''
520
+ '''then output the final answer in the format of "Answer: single number '''
521
+ '''or single word or phrase".\n\n''')
522
+
523
+ def use_custom_prompt(self, dataset=None):
524
+ if dataset is None:
525
+ return False
526
+ if listinstr(['MCQ', 'VQA', 'Y/N'], DATASET_TYPE(dataset)):
527
+ return True
528
+ return False
529
+
530
+ def use_cot(self, dataset=None):
531
+ if dataset is None:
532
+ return False
533
+ if listinstr(['MMMU', 'MathVista', 'OCRBench', 'ChartQA', 'MathVision', 'MathVerse_MINI_Vision_Only'], dataset):
534
+ return True
535
+ elif listinstr(['MMVet', 'MMBench', 'MMStar', 'HallusionBench', 'AI2D', 'RealWorldQA',
536
+ 'POPE', 'ScienceQA', 'TextVQA', 'DocVQA'], dataset):
537
+ return False
538
+ else:
539
+ return False
540
+
541
+ def use_upsize(self, dataset=None):
542
+ if dataset is None:
543
+ return False
544
+ if listinstr(['MathVista', 'MMBench_TEST_CN', 'MMStar', 'AI2D', 'OCRBench', 'DynaMath'], dataset):
545
+ return True
546
+ else:
547
+ return False
548
+
549
+ def build_prompt(self, line, dataset=None):
550
+ if isinstance(line, int):
551
+ line = self.data.iloc[line]
552
+
553
+ tgt_path = self.dump_image(line, dataset)
554
+ system_prompt, prompt = '', ''
555
+
556
+ question = line['question']
557
+
558
+ if not self.use_cot(dataset):
559
+ if DATASET_TYPE(dataset) == 'MCQ':
560
+ options = {
561
+ cand: line[cand]
562
+ for cand in string.ascii_uppercase
563
+ if cand in line and not pd.isna(line[cand])
564
+ }
565
+ options_prompt = 'Options:\n'
566
+ for key, item in options.items():
567
+ options_prompt += f'{key}. {item}\n'
568
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
569
+ if hint is not None:
570
+ prompt += f'Hint: {hint}\n'
571
+ prompt += f'Question: {question}\n'
572
+ if len(options):
573
+ prompt += options_prompt
574
+ prompt += self.options_suffix_prompt
575
+ else:
576
+ system_prompt = self.wo_options_system_prompt
577
+
578
+ if 'MMMU' in dataset:
579
+ if len(system_prompt) > 0:
580
+ prompt = system_prompt + '\n' + prompt
581
+ system_prompt = ''
582
+ elif dataset is not None and listinstr(['HallusionBench'], dataset):
583
+ question += ' Yes or No?'
584
+ prompt = question
585
+ elif dataset is not None and listinstr(['OCRBench'], dataset):
586
+ system_prompt = self.vqa_prompt
587
+ prompt = question
588
+ elif DATASET_TYPE(dataset) == 'VQA':
589
+ if listinstr(['LLaVABench'], dataset):
590
+ system_prompt = ''
591
+ elif listinstr(['MMVet'], dataset):
592
+ system_prompt = self.detail_system_prompt
593
+ else:
594
+ system_prompt = self.vqa_prompt
595
+ prompt = question
596
+ else:
597
+ prompt = question
598
+ else:
599
+ has_options = True
600
+ if DATASET_TYPE(dataset) == 'MCQ':
601
+ options = {
602
+ cand: line[cand]
603
+ for cand in string.ascii_uppercase
604
+ if cand in line and not pd.isna(line[cand])
605
+ }
606
+ options_prompt = ''
607
+ for key, item in options.items():
608
+ options_prompt += f'{key}. {item}\n'
609
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
610
+ if hint is not None:
611
+ prompt += f'Hint: {hint}\n'
612
+ prompt += f'{question}\n'
613
+
614
+ if len(options):
615
+ prompt += options_prompt
616
+ else:
617
+ has_options = False
618
+
619
+ if 'MMMU' in dataset:
620
+ if len(system_prompt) > 0:
621
+ prompt = system_prompt + '\n' + prompt
622
+ system_prompt = ''
623
+ else:
624
+ prompt = question
625
+
626
+ if DATASET_TYPE(dataset) in ['MCQ', 'Y/N', 'VQA']:
627
+ if DATASET_TYPE(dataset) == 'MCQ':
628
+ if has_options:
629
+ prompt = self.multi_choice_cot_prompt + prompt
630
+ else:
631
+ prompt = self.short_ans_cot_prompt + prompt
632
+ elif DATASET_TYPE(dataset) == 'Y/N':
633
+ prompt = self.short_ans_cot_prompt + prompt
634
+ else:
635
+ prompt = self.short_ans_cot_prompt + prompt
636
+
637
+ msgs = []
638
+ if system_prompt:
639
+ msgs.append(dict(type='text', value=system_prompt))
640
+ if isinstance(tgt_path, list):
641
+ msgs.extend([dict(type='image', value=p) for p in tgt_path])
642
+ else:
643
+ msgs.append(dict(type='image', value=tgt_path))  # append, so any system prompt added above is preserved
644
+ msgs.append(dict(type='text', value=prompt))
645
+
646
+ return msgs
647
+
648
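+ # Pulls the short final answer out of a chain-of-thought response by matching
+ # the trailing "Answer: ..." pattern requested by the CoT prompts above,
+ # e.g. "...so the option is clear. Answer: B" -> "B".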
+ def extract_answer(self, res, dataset=None):
649
+ if dataset is None:
650
+ return res
651
+ if self.use_cot(dataset):
652
+ if DATASET_TYPE(dataset) == 'MCQ':
653
+ pattern = r'Answer:\s*([A-Ia-i])(?![A-Za-z])'
654
+ matches = re.findall(pattern, res, re.DOTALL)
655
+ if matches:
656
+ extracted_res = matches[-1].strip()
657
+ else:
658
+ extracted_res = res
659
+ return extracted_res
660
+ elif DATASET_TYPE(dataset) == 'VQA' and not listinstr(['OCRBench'], dataset):
661
+ pattern = r'Answer:\s*(.*)\s*$'
662
+ match = re.search(pattern, res, re.DOTALL)
663
+ if match:
664
+ extracted_res = match.group(1)
665
+ else:
666
+ extracted_res = res
667
+ return extracted_res
668
+ return res
669
+
670
+ def generate_inner(self, message, dataset=None):
671
+ if DATASET_MODALITY(dataset) == 'VIDEO':
672
+ max_slice_nums = 1
673
+ use_image_id = False
674
+ max_inp_length = 2048 * 10
675
+ else:
676
+ max_slice_nums = None
677
+ use_image_id = True
678
+ max_inp_length = 8192
679
+
680
+ max_new_tokens = 2048
681
+ default_kwargs = dict(
682
+ max_new_tokens=max_new_tokens,
683
+ sampling=False,
684
+ repetition_penalty=self.repetition_penalty,
685
+ num_beams=self.num_beams,
686
+ )
687
+ default_kwargs.update(self.kwargs)
688
+
689
+ content = []
690
+
691
+ for x in message:
692
+ if x['type'] == 'text':
693
+ content.append(x['value'])
694
+ elif x['type'] == 'image':
695
+ image = Image.open(x['value']).convert('RGB')
696
+ if not self.use_upsize(dataset):
697
+ content.append(image)
698
+ else:
699
+ img_width, img_height = image.width, image.height
700
+ if (img_width * img_height) >= (1344 * 1344):
701
+ content.append(image)
702
+ else:
703
+ ratio = math.sqrt((1344 * 1344) / (img_width * img_height))
704
+ max_img_width = int(img_width * ratio)
705
+ new_img_width = random.randint(img_width, max_img_width)
706
+ new_img_height = int(new_img_width / img_width * img_height)
707
+ resized_image = image.resize((new_img_width, new_img_height))
708
+ content.append(resized_image)
709
+ msgs = [{'role': 'user', 'content': content}]
710
+
711
+ res = self.model.chat(
712
+ image=None,
713
+ msgs=msgs,
714
+ context=None,
715
+ tokenizer=self.tokenizer,
716
+ max_inp_length=max_inp_length,
717
+ use_image_id=use_image_id,
718
+ max_slice_nums=max_slice_nums,
719
+ **default_kwargs
720
+ )
721
+
722
+ if isinstance(res, tuple) and len(res) > 0:
723
+ res = res[0]
724
+
725
+ res = self.extract_answer(res, dataset)
726
+
727
+ return res
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/README.md ADDED
@@ -0,0 +1,3 @@
1
+ # vqa-eval
2
+
3
+ Contains the VQA evaluation kit (TextVQA / DocVQA / DocVQA-test) from the server.
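+
+ A minimal invocation sketch (flag names are inferred from how eval.py reads its arguments; the full set is defined in eval_utils/getargs.py, and all paths are placeholders):
+
+ ```bash
+ torchrun --nproc_per_node=1 eval.py \
+     --model_name minicpmv26 \
+     --model_path openbmb/MiniCPM-V-2_6 \
+     --eval_textVQA
+ ```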
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/datasets/__init__.py ADDED
File without changes
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/datasets/vqa_dataset.py ADDED
@@ -0,0 +1,116 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from torch.utils.data import Dataset
5
+
6
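+ # Recovers the bare question from the known prompt templates, e.g.
+ # "OCR tokens: ... Question: what does the sign say? Short answer:"
+ # -> "what does the sign say?".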
+ def prompt_processor(prompt):
7
+ if prompt.startswith('OCR tokens: '):
8
+ pattern = r"Question: (.*?) Short answer:"
9
+ match = re.search(pattern, prompt, re.DOTALL)
10
+ question = match.group(1)
11
+ elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
12
+ if prompt.startswith('Reference OCR token:'):
13
+ question = prompt.split('\n')[1]
14
+ else:
15
+ question = prompt.split('\n')[0]
16
+ elif len(prompt.split('\n')) == 2:
17
+ question = prompt.split('\n')[0]
18
+ else:
19
+ assert False
20
+
21
+ return question.lower()
22
+
23
+ class textVQADataset(Dataset):
24
+ def __init__(
25
+ self,
26
+ image_dir="./downloads/TextVQA/train_images",
27
+ ann_path="./downloads/TextVQA/TextVQA_0.5.1_val.json",
28
+ ):
29
+ self.data = json.load(open(ann_path, "r"))["data"]
30
+ self.image_dir = image_dir
31
+
32
+ def __len__(self):
33
+ return len(self.data)
34
+
35
+ def __getitem__(self, idx):
36
+ question = self.data[idx]['question']
37
+ answers = self.data[idx]['answers']
38
+ img_id = self.data[idx]['image_id']
39
+ qid = self.data[idx]['question_id']
40
+ img_path = os.path.join(self.image_dir, f"{img_id}.jpg")
41
+
42
+ item = {
43
+ "question_id": qid,
44
+ "image_path": img_path,
45
+ "question": question,
46
+ "gt_answers": answers
47
+ }
48
+
49
+ return item
50
+
51
+ class docVQADataset(Dataset):
52
+ def __init__(
53
+ self,
54
+ image_dir="./downloads/DocVQA/spdocvqa_images",
55
+ ann_path="./downloads/DocVQA/val_v1.0_withQT.json",
56
+ ocr_token_path=None
57
+ ):
58
+
59
+ self.data = json.load(open(ann_path, "r"))["data"]
60
+ self.image_dir = image_dir
61
+ self.ann_path = ann_path
62
+ if ocr_token_path:
63
+ self.ocr_token_data = {item['image_id']: item for item in json.load(open(ocr_token_path, "r"))["data"]}
64
+
65
+ def __len__(self):
66
+ return len(self.data)
67
+
68
+ def __getitem__(self, idx):
69
+ question_id = self.data[idx]['questionId']
70
+ relative_img_path = self.data[idx]['image']
71
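+ # The annotations reference "documents/<page>" while the local folder is
+ # named "images", so swap the directory component before joining.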
+ corrected_relative_img_path = relative_img_path.replace("documents", "images")
72
+ img_path = os.path.join(self.image_dir, corrected_relative_img_path)
73
+ question = self.data[idx]['question']
74
+ answers = self.data[idx]['answers']
75
+
76
+ question_type = self.data[idx]['question_types']
77
+
78
+ return {
79
+ "question_id": question_id,
80
+ "image_path": img_path,
81
+ "question": question,
82
+ "gt_answers": answers,
83
+ 'question_type': question_type,
84
+ }
85
+
86
+
87
+ class docVQATESTDataset(Dataset):
88
+ def __init__(
89
+ self,
90
+ image_dir="./downloads/DocVQA/spdocvqa_images",
91
+ ann_path="./downloads/DocVQA/test_v1.0.json",
92
+ ocr_token_path=None
93
+ ):
94
+
95
+ self.data = json.load(open(ann_path, "r"))["data"]
96
+ self.image_dir = image_dir
97
+ self.ann_path = ann_path
98
+
99
+ def __len__(self):
100
+ return len(self.data)
101
+
102
+ def __getitem__(self, idx):
103
+ question_id = self.data[idx]['questionId']
104
+ relative_img_path = self.data[idx]['image']
105
+ corrected_relative_img_path = relative_img_path.replace("documents", "images")
106
+ img_path = os.path.join(self.image_dir, corrected_relative_img_path)
107
+ question = self.data[idx]['question']
108
+
109
+
110
+ return {
111
+ "question_id": question_id,
112
+ "image_path": img_path,
113
+ "question": question,
114
+ "gt_answers": "",
115
+ 'question_type': "",
116
+ }
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/eval.py ADDED
@@ -0,0 +1,106 @@
1
+ import sys
2
+ import datetime
3
+ import json
4
+ import os
5
+ import torch
6
+
7
+ script_dir = os.path.dirname(os.path.realpath(__file__))
8
+
9
+ sys.path.append(os.path.join(script_dir, '..'))
10
+
11
+ from datasets.vqa_dataset import docVQADataset, docVQATESTDataset, textVQADataset
12
+
13
+
14
+ print(torch.__version__)
15
+
16
+ import numpy as np
17
+
18
+ from eval_utils.getargs import parse_args
19
+ from eval_utils.vqa_evaluate import *
20
+
21
+
22
+ def get_model(args):
23
+ if args.model_name == '':
24
+ raise Exception('Model name cannot be an empty string!')
25
+ from models.MiniCPM.minicpmv import MiniCPM_V, MiniCPM_V_2_6, MiniCPM_o_2_6
26
+ model_path = args.model_path
27
+ ckpt = args.ckpt
28
+
29
+ if args.model_name == 'minicpmv':
30
+ model = MiniCPM_V(model_path=model_path, ckpt=ckpt, device=args.device)
31
+ elif args.model_name == 'minicpmv26':
32
+ model = MiniCPM_V_2_6(model_path=model_path, ckpt=ckpt, device=args.device)
33
+ elif args.model_name == 'minicpmo26':
34
+ model = MiniCPM_o_2_6(model_path=model_path, ckpt=ckpt, device=args.device)
35
+ else:
36
+ raise Exception(f"Unexpected model name {args.model_name}!")
37
+
38
+ return model
39
+
40
+
41
+ def main(args):
42
+ np.random.seed(0)
43
+ max_sample_num = None
44
+
45
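+ # Initialize NCCL from the WORLD_SIZE / RANK / LOCAL_RANK variables exported
+ # by torchrun (or a similar launcher); only rank 0 writes result.json below.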
+ torch.distributed.init_process_group(
46
+ backend='nccl',
47
+ world_size=int(os.getenv('WORLD_SIZE', '1')),
48
+ rank=int(os.getenv('RANK', '0')),
49
+ )
50
+ torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
51
+ print(f'Init Rank-{torch.distributed.get_rank()}')
52
+ if torch.distributed.is_initialized():
53
+ args.device = torch.device(f"cuda:{torch.cuda.current_device()}")
54
+
55
+ model = get_model(args)
56
+
57
+ result = {}
58
+ time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
59
+
60
+ if args.eval_textVQA or args.eval_all:
61
+ dataset = textVQADataset(args.textVQA_image_dir, args.textVQA_ann_path)
62
+ if max_sample_num is not None:
63
+ dataset = torch.utils.data.Subset(dataset, range(max_sample_num))
64
+ acc = evaluate_VQA(model, dataset, args.model_name, 'textVQA', time, \
65
+ batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path)
66
+ result['textVQA'] = acc
67
+
68
+ if args.eval_docVQA or args.eval_all:
69
+ dataset = docVQADataset(args.docVQA_image_dir, args.docVQA_ann_path)
70
+ if max_sample_num is not None:
71
+ dataset = torch.utils.data.Subset(dataset, range(max_sample_num))
72
+ acc = evaluate_VQA(model, dataset, args.model_name, 'docVQA', time, \
73
+ batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path)
74
+ result['docVQA'] = acc
75
+
76
+ if args.eval_docVQATest or args.eval_all:
77
+ dataset = docVQATESTDataset(args.docVQATest_image_dir, args.docVQATest_ann_path)
78
+ if max_sample_num is not None:
79
+ dataset = torch.utils.data.Subset(dataset, range(max_sample_num))
80
+ acc = evaluate_VQA(model, dataset, args.model_name, 'docVQATest', time, \
81
+ batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path)
82
+ result['docVQATest'] = acc
83
+
84
+ if torch.distributed.is_initialized():
85
+ torch.distributed.barrier()
86
+
87
+ if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
88
+ return None
89
+
90
+ result_path = os.path.join(args.answer_path, args.model_name, 'result.json')
91
+
92
+ output_flag = False
93
+ for k, v in result.items():
94
+ if v > 0.0:
95
+ output_flag = True
96
+ break
97
+
98
+ if output_flag:
99
+ with open(result_path, "w") as f:
100
+ f.write(json.dumps(result, indent=4))
101
+
102
+
103
+ if __name__ == "__main__":
104
+ args = parse_args()
105
+
106
+ main(args)
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/eval_utils/cal_metric.py ADDED
@@ -0,0 +1,40 @@
1
+ import json
2
+ import glob
3
+ import re
4
+
5
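+ # True iff `word` occurs in `sentence` as a whole word (regex word
+ # boundaries), e.g. has_word('the cat sat', 'cat') is True while
+ # has_word('concatenate', 'cat') is False.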
+ def has_word(sentence, word):
6
+ pattern = r"\b" + re.escape(word) + r"\b"
7
+ match = re.search(pattern, sentence)
8
+ if match:
9
+ return True
10
+ else:
11
+ return False
12
+
+ def remove_special_chars(s):
13
+ pattern = r"[^a-zA-Z0-9\s]"
14
+ s = re.sub(pattern, "", s)
15
+ return s
16
+
17
+ for model in glob.glob('./answer_save/*'):
18
+ print(model, ':')
19
+ result_list = sorted(glob.glob(f'{model}/*.json'))
20
+ for task_result_path in result_list:
21
+ taskname = task_result_path.split('/')[-1]
22
+ taskname = taskname.split('.')[0]
23
+ if taskname not in ['IIIT5K', 'svt', 'IC13_857', 'IC15_1811', 'svtp', 'ct80',
24
+ 'cocotext', 'ctw', 'totaltext', 'HOST']:
25
+ continue
26
+
27
+ correct = 0
28
+ num = 0
29
+ with open(task_result_path, 'r') as f:
30
+ results = json.load(f)[:100]  # only the first 100 predictions are scored
31
+ for i in range(len(results)):
32
+ gt_answers = results[i]['gt_answers']
33
+ answer = results[i]['answer']
34
+ gt_answers = remove_special_chars(gt_answers).lower()
35
+ answer = remove_special_chars(answer).lower()
36
+ if has_word(answer, gt_answers):
37
+ correct += 1
38
+ num += 1
39
+ print(f'{taskname:10s}:{float(correct)/num*100:.2f}')
40
+ print('=' * 32)
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/requirements.txt ADDED
@@ -0,0 +1,49 @@
1
+ accelerate
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ async-timeout==4.0.2
5
+ attrs==22.2.0
6
+ bitsandbytes==0.37.0
7
+ cchardet==2.1.7
8
+ chardet==5.1.0
9
+ contourpy==1.0.7
10
+ cycler==0.11.0
11
+ filelock==3.9.0
12
+ fonttools==4.38.0
13
+ frozenlist==1.3.3
14
+ huggingface-hub==0.13.4
15
+ importlib-resources==5.12.0
16
+ kiwisolver==1.4.4
17
+ matplotlib==3.7.0
18
+ multidict==6.0.4
19
+ openai==0.27.0
20
+ packaging==23.0
21
+ psutil==5.9.4
22
+ pycocotools==2.0.6
23
+ pyparsing==3.0.9
24
+ python-dateutil==2.8.2
25
+ pyyaml==6.0
26
+ regex==2022.10.31
27
+ tokenizers==0.13.2
28
+ tqdm==4.64.1
29
+ transformers==4.44.2
30
+ timm==0.6.13
31
+ spacy==3.5.1
32
+ webdataset==0.2.48
33
+ scikit-learn==1.2.2
34
+ scipy==1.10.1
35
+ yarl==1.8.2
36
+ zipp==3.14.0
37
+ omegaconf==2.3.0
38
+ opencv-python==4.7.0.72
39
+ iopath==0.1.10
40
+ decord==0.6.0
41
+ tenacity==8.2.2
42
+ peft
43
+ pycocoevalcap
44
+ sentence-transformers
45
+ umap-learn
46
+ notebook
47
+ gradio==3.24.1
48
+ gradio-client==0.0.8
49
+ wandb
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vqaeval/transform_docvqatest_for_submission.py ADDED
@@ -0,0 +1,16 @@
1
+ import argparse
2
+ import json
3
+
4
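+ # Reshape raw DocVQA-test answers into the leaderboard submission format: a
+ # list of {"questionId", "answer"} records with any "</s>" markers stripped.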
+ if __name__ == "__main__":
5
+ parser = argparse.ArgumentParser()
6
+ parser.add_argument("--input_file_path", type=str, default="", help="path to the originial output json.")
7
+ parser.add_argument("--output_file_path", type=str, default="", help="path to where you want to save the processed json.")
8
+ args = parser.parse_args()
9
+
10
+ with open(args.input_file_path, 'r') as f:
11
+ data = json.load(f)
12
+
13
+ transformed_data = [{"questionId": item["question_id"], "answer": item["answer"].replace("</s>", "")} for item in data]
14
+
15
+ with open(args.output_file_path, 'w') as f:
16
+ json.dump(transformed_data, f)