1f committed on
Commit 052bf16 · verified · 1 Parent(s): 885ccec

Add files using upload-large-folder tool

Files changed (20)
  1. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py +9 -0
  2. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/cgbench.py +682 -0
  3. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/crpe.py +13 -0
  4. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/hrbench.py +54 -0
  5. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py +49 -0
  6. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/llavabench.py +65 -0
  7. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/logicvista.py +150 -0
  8. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/longvideobench.py +80 -0
  9. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py +171 -0
  10. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathverse.py +193 -0
  11. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathvista.py +164 -0
  12. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mlvu.py +189 -0
  13. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmbench_video.py +70 -0
  14. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py +335 -0
  15. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py +135 -0
  16. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_concat_dataset.py +85 -0
  17. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_dataset_config.py +103 -0
  18. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py +283 -0
  19. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py +174 -0
  20. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/wildvision.py +222 -0
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .judge_util import build_judge, DEBUG_MESSAGE
+ from .multiple_choice import extract_answer_from_item, prefetch_answer
+ from .vqa_eval import levenshtein_distance
+
+
+ __all__ = [
+     'build_judge', 'extract_answer_from_item', 'prefetch_answer',
+     'levenshtein_distance', 'DEBUG_MESSAGE',
+ ]
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/cgbench.py ADDED
@@ -0,0 +1,682 @@
+ from ...smp import *
+ from .multiple_choice import extract_answer_from_item
+ import pandas as pd
+ import numpy as np
+ import re
+
+ FAIL_MSG = "Failed to obtain answer via API."
+
+ frame_tmpl = "frame-{}-of-{}.jpg"
+
+ sys_prompt_open_eval_step_1 = (
+     "You will be provided with a question, a model's prediction, and the ground "
+     "truth answer for this question.\n"
+     "Your task is to judge whether the model's prediction is correct based on the "
+     "meaning of the two texts.\n"
+     "In most cases, this can be done by determining if the meaning of the model's "
+     "prediction is consistent with, or contains, the ground truth answer. However, "
+     "in some cases where the two texts differ, they may represent different "
+     "descriptions of the same visual scene, in which case visual information is "
+     "needed for further judgment.\n"
+     "Therefore, I hope you:\n"
+     "- Output 0, if the model's prediction and the ground truth answer are neither "
+     "consistent nor related by inclusion, with fundamentally different meanings.\n"
+     "- Output 1, if the meaning of the model's prediction and the ground truth "
+     "answer is consistent, or if the model's prediction meaningfully contains the "
+     "ground truth answer.\n"
+     "- Output 2, if the model's prediction and ground truth are not consistent or "
+     "inclusive, but may be different descriptions of the same visual scene, "
+     "requiring visual information for further judgment.\n"
+     "Only output the answer in the following format:\n\n"
+     '```json\n{"result": choice}\n```\n\n'
+     "The choice is either 0, 1, or 2 as specified above."
+ )
+
+ sys_prompt_open_eval_step_2 = (
+     "You will be provided with a question, a model's prediction, and sampled "
+     "frames from the clue intervals related to this question.\n"
+     "Your task is to determine whether the model has answered the question "
+     "correctly based on the visual information provided.\n"
+     "Therefore, I hope you:\n"
+     "- Output 0, if the model's prediction does not correctly answer the question.\n"
+     "- Output 1, if the model's prediction correctly answers the question.\n"
+     "Only output the answer in the following format, without extra "
+     "explanation:\n\n"
+     '```json\n{"result": choice}\n```\n\n'
+     "The choice is either 0 or 1 as specified above."
+ )
+
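+ # Two-step open-ended evaluation: step 1 judges the prediction against the ground
+ # truth from text alone; predictions marked 2 (visual evidence needed) are
+ # re-judged in step 2 with frames sampled from the clue intervals.
+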
+ DURATIONS = ["0 ~ 10", "10 ~ 20", "20 ~ 30", "30 ~ 40", "40 ~ 50", "50 ~ 60", "60+"]
+
+ DOMAINS = [
+     "Life Record",
+     "Music & TV show",
+     "Instruction & Knowledge",
+     "Driving",
+     "Embodied Expert",
+     "Humor/funny",
+     "Electonic/Social Gaming",
+     "Security & Health",
+     "Sports & Exercise",
+     "Special Scenes",
+     "Art & Culture",
+     "GUI",
+     "News",
+     "Animal & Pet",
+ ]
+
+ SUB_CATEGORIES = [
+     "Time Cognition",
+     "Hallucination",
+     "Entity Perception",
+     "2D Spatial Perception",
+     "Time Perception",
+     "Scene Perception",
+     "Text Perception",
+     "Event Cognition",
+     "Entity Cognition",
+     "Text Cognition",
+     "Event Perception",
+     "Scene Cognition",
+ ]
+
+
+ def get_dimention_rating_open_ended(data_path):
+     # Load the data
+     df = load(data_path)
+
+     df = df[df["score"] != -1]
+
+     # Convert seconds to minutes and bin into the duration ranges
+     df["duration_minutes"] = df["duration"] / 60
+     df["duration_range"] = pd.cut(
+         df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+     )
+
+     # Initialize the result dict
+     result = {
+         "overall": 0,
+         "duration": {k: 0 for k in DURATIONS},
+         "domain": {k: 0 for k in DOMAINS},
+         "sub_category": {k: 0 for k in SUB_CATEGORIES},
+     }
+
+     # Overall
+     result["overall"] = round(df["score"].mean(), 4)
+
+     # Duration
+     for dur in DURATIONS:
+         dur_scores = df[df["duration_range"] == dur]["score"]
+         result["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+     # Domain
+     for domain in DOMAINS:
+         domain_scores = df[df["domain"] == domain]["score"]
+         result["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+     # Sub-category
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_scores = df[df["sub_category"] == sub_cat]["score"]
+         result["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+     return result
+
+
+ def get_dimention_rating_mcq_grouding(data_path):
+
+     # Load the data
+     df = load(data_path)
+
+     # df.loc[(df['task_mode'] == 'miou') & (df['score'] == -1), 'score'] = 0
+
+     df = df[df["score"] != -1]
+
+     # Convert seconds to minutes and bin into the duration ranges
+     df["duration_minutes"] = df["duration"] / 60
+     df["duration_range"] = pd.cut(
+         df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+     )
+
+     # Initialize the result dict
+     result = {
+         metric: {
+             "overall": 0,
+             "duration": {k: 0 for k in DURATIONS},
+             "domain": {k: 0 for k in DOMAINS},
+             "sub_category": {k: 0 for k in SUB_CATEGORIES},
+         }
+         for metric in ["long_acc", "clue_acc", "miou", "CRR", "acc@iou", "rec@iou"]
+     }
+
+     # Compute the base metrics
+     for metric in ["long_acc", "clue_acc", "miou"]:
+         metric_df = df[df["task_mode"] == metric]
+
+         # Overall
+         result[metric]["overall"] = round(metric_df["score"].mean(), 4)
+
+         # Duration
+         for dur in DURATIONS:
+             dur_scores = metric_df[metric_df["duration_range"] == dur]["score"]
+             result[metric]["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+         # Domain
+         for domain in DOMAINS:
+             domain_scores = metric_df[metric_df["domain"] == domain]["score"]
+             result[metric]["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+         # Sub-category
+         for sub_cat in SUB_CATEGORIES:
+             sub_cat_scores = metric_df[metric_df["sub_category"] == sub_cat]["score"]
+             result[metric]["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+     # Compute the composite metric CRR
+     def calculate_crr(scores):
+         long_acc = scores[scores["task_mode"] == "long_acc"]["score"].mean()
+         clue_acc = scores[scores["task_mode"] == "clue_acc"]["score"].mean()
+         return round(min(long_acc, clue_acc) / clue_acc, 4) if clue_acc != 0 else 0
+
+     # Overall CRR
+     result["CRR"]["overall"] = calculate_crr(df)
+
+     # Duration CRR
+     for dur in DURATIONS:
+         dur_df = df[df["duration_range"] == dur]
+         result["CRR"]["duration"][dur] = calculate_crr(dur_df)
+
+     # Domain CRR
+     for domain in DOMAINS:
+         domain_df = df[df["domain"] == domain]
+         result["CRR"]["domain"][domain] = calculate_crr(domain_df)
+
+     # Sub-category CRR
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_df = df[df["sub_category"] == sub_cat]
+         result["CRR"]["sub_category"][sub_cat] = calculate_crr(sub_cat_df)
+
+     # Compute acc@iou
+     def calculate_acc_at_iou_threshold(scores, threshold):
+
+         miou_qids = set(scores[scores["task_mode"] == "miou"]["qid"])
+
+         long_acc_qids = set(scores[scores["task_mode"] == "long_acc"]["qid"])
+
+         valid_qids = miou_qids & long_acc_qids
+
+         miou_positive = set(scores[(scores["task_mode"] == "miou") & (scores["score"] > threshold)]["qid"])
+
+         long_acc_positive = scores[
+             (scores["task_mode"] == "long_acc") & (scores["qid"].isin(miou_positive)) & (scores["score"] == 1)
+         ]
+
+         acc_at_iou_threshold = len(long_acc_positive) / len(valid_qids) if len(valid_qids) > 0 else 0
+         return round(acc_at_iou_threshold, 4)
+
+     def calculate_acc_at_iou(scores):
+         thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+         acc_at_iou_values = [calculate_acc_at_iou_threshold(scores, threshold) for threshold in thresholds]
+
+         return round(sum(acc_at_iou_values) / len(acc_at_iou_values), 4)
+
+     # Overall acc@iou
+     result["acc@iou"]["overall"] = calculate_acc_at_iou(df)
+
+     # Duration acc@iou
+     for dur in DURATIONS:
+         dur_df = df[df["duration_range"] == dur]
+         result["acc@iou"]["duration"][dur] = calculate_acc_at_iou(dur_df)
+
+     # Domain acc@iou
+     for domain in DOMAINS:
+         domain_df = df[df["domain"] == domain]
+         result["acc@iou"]["domain"][domain] = calculate_acc_at_iou(domain_df)
+
+     # Sub-category acc@iou
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_df = df[df["sub_category"] == sub_cat]
+         result["acc@iou"]["sub_category"][sub_cat] = calculate_acc_at_iou(sub_cat_df)
+
+     # Compute rec@iou
+     def calculate_rec_at_iou_threshold(scores, threshold):
+         # Collect all rows of the miou task
+         miou_scores = scores[scores["task_mode"] == "miou"]
+
+         # Count the miou scores above the threshold
+         miou_positive = miou_scores[miou_scores["score"] > threshold]
+
+         # Compute the ratio
+         rec_at_iou = len(miou_positive) / len(miou_scores) if len(miou_scores) > 0 else 0
+
+         return round(rec_at_iou, 4)
+
+     def calculate_rec_at_iou(scores):
+         thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+         rec_at_iou_values = [calculate_rec_at_iou_threshold(scores, threshold) for threshold in thresholds]
+
+         return round(sum(rec_at_iou_values) / len(rec_at_iou_values), 4)
+
+     # Overall rec@iou
+     result["rec@iou"]["overall"] = calculate_rec_at_iou(df)
+
+     # Duration rec@iou
+     for dur in DURATIONS:
+         dur_df = df[df["duration_range"] == dur]
+         result["rec@iou"]["duration"][dur] = calculate_rec_at_iou(dur_df)
+
+     # Domain rec@iou
+     for domain in DOMAINS:
+         domain_df = df[df["domain"] == domain]
+         result["rec@iou"]["domain"][domain] = calculate_rec_at_iou(domain_df)
+
+     # Sub-category rec@iou
+     for sub_cat in SUB_CATEGORIES:
+         sub_cat_df = df[df["sub_category"] == sub_cat]
+         result["rec@iou"]["sub_category"][sub_cat] = calculate_rec_at_iou(sub_cat_df)
+
+     return result
+
+
+ def milliseconds_to_seconds(milliseconds):
+     return milliseconds / 1000
+
+
+ def sample_frames_clue_average(clues_time_intervals, frame_num, fps):
+     # Convert clue time intervals to frame intervals and compute each clue's duration
+     clues_frame_intervals = [(round(interval[0] * fps), round(interval[1] * fps)) for interval in clues_time_intervals]
+     clue_durations = [interval[1] - interval[0] for interval in clues_frame_intervals]
+     total_duration = sum(clue_durations)
+     # If frame_num is at least the total number of frames, return all frames directly
+     if frame_num >= total_duration:
+         return [frame for interval in clues_frame_intervals for frame in range(interval[0], interval[1])]
+     frames_per_clue = [int(frame_num * (duration / total_duration)) for duration in clue_durations]
+     frame_indices = []
+     for i, (interval, num_frames) in enumerate(zip(clues_frame_intervals, frames_per_clue)):
+         num_frames = max(1, num_frames)
+         seg_size = (interval[1] - interval[0]) / num_frames
+         clue_frame_indices = [int(interval[0] + seg_size / 2 + seg_size * idx) for idx in range(num_frames)]
+         frame_indices.extend(clue_frame_indices)
+     return frame_indices
+
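+ # Illustrative example (hypothetical values): clues [(0.0, 2.0), (4.0, 5.0)] at
+ # fps=1 with frame_num=2 allot one frame per clue, each taken at the center of
+ # its segment, giving indices [1, 4].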
+
+ def merge_intervals(intervals):
+     """
+     Merge overlapping intervals in a list.
+     Assumes each interval is a list [start, end].
+     """
+     if not intervals:
+         return []
+
+     # Sort intervals by start time
+     intervals.sort(key=lambda x: x[0])
+
+     merged = [intervals[0]]
+
+     for current in intervals[1:]:
+         last_merged = merged[-1]
+
+         # Check if there is an overlap
+         if current[0] <= last_merged[1]:
+             # Merge the current interval with the last one
+             last_merged[1] = max(last_merged[1], current[1])
+         else:
+             # No overlap, add current interval
+             merged.append(current)
+
+     return merged
+
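+ # e.g., merge_intervals([[1, 4], [3, 6], [8, 9]]) -> [[1, 6], [8, 9]]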
+
+ def calculate_intervals_iou(intervals1, intervals2):
+     """
+     Calculate the IoU of two lists of intervals.
+     Each list contains intervals represented as [start, end].
+     """
+     # Merge overlapping intervals in both lists
+     merged1 = merge_intervals(intervals1)
+     merged2 = merge_intervals(intervals2)
+
+     # Calculate total length of intervals for both lists
+     def total_length(merged_intervals):
+         return sum(end - start for start, end in merged_intervals)
+
+     length1 = total_length(merged1)
+     length2 = total_length(merged2)
+
+     # Calculate intersection length
+     intersection_length = 0
+     for interval1 in merged1:
+         for interval2 in merged2:
+             intersection_start = max(interval1[0], interval2[0])
+             intersection_end = min(interval1[1], interval2[1])
+             intersection_length += max(0, intersection_end - intersection_start)
+     # Calculate union length
+     union_length = length1 + length2 - intersection_length
+     # IoU is intersection divided by union
+     iou = intersection_length / union_length if union_length > 0 else 0
+     return iou
+
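+ # e.g., calculate_intervals_iou([[0, 10]], [[5, 15]]) -> 5 / 15 ≈ 0.3333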
+
+ def post_process(response, right_answer, task_mode, duration):
+     result = -1
+
+     if response:
+         # Locate the ```json and ``` markers
+         json_start = response.find("```json")
+         json_end = response.find("```", json_start + len("```json"))
+
+         # If JSON content was found
+         if json_start != -1 and json_end != -1:
+             json_content = response[json_start + len("```json"):json_end].strip()
+         else:
+             json_content = ""
+
+         if json_content:
+             if task_mode in ["long_acc", "clue_acc"]:
+                 json_content = re.sub(r"(?<=:\s)([A-Za-z_]\w*)", r'"\1"', json_content)
+
+             try:
+                 model_result = json.loads(json_content)["result"]
+
+                 if task_mode in ["long_acc", "clue_acc"]:
+                     result = 1 if right_answer == model_result else 0
+                 elif task_mode == "miou":
+                     if not isinstance(model_result, list):
+                         return -1
+                     if not isinstance(model_result[0], list):
+                         model_result = [model_result]
+
+                     need_duration = all(interval[0] <= 1 and interval[1] <= 1 for interval in model_result)
+
+                     if need_duration:
+                         model_result = [[interval[0] * duration, interval[1] * duration] for interval in model_result]
+
+                     right_answer = eval(right_answer)
+
+                     result = calculate_intervals_iou(right_answer, model_result)
+
+             except Exception as e:
+                 print(f"Error in parsing JSON: {e}, {json_content}")
+
+     if result == -1:
+         if task_mode in ["long_acc", "clue_acc"]:
+             # Look for capital letters A-H and treat them as the model's answer
+             matches = re.findall(r"\b[A-H]\b", response)
+             if matches:
+                 result = 1 if right_answer in matches else 0
+         elif task_mode == "miou":
+             # Extract all real numbers and pair them up
+             numbers = re.findall(r"-?\d+\.?\d*", response)
+             if len(numbers) < 2:
+                 result = -1
+             else:
+                 if len(numbers) % 2 != 0:
+                     numbers = numbers[:-1]
+                 model_result = [[float(numbers[i]), float(numbers[i + 1])] for i in range(0, len(numbers), 2)]
+
+                 if type(right_answer) is str:
+                     right_answer = eval(right_answer)
+
+                 result = calculate_intervals_iou(right_answer, model_result)
+
+     return result
+
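+ # Illustrative example (hypothetical values): for task_mode='long_acc', a response
+ # of '```json\n{"result": B}\n```' is first quoted to {"result": "B"}, then scored
+ # 1 when right_answer == 'B'; without a JSON block, a bare capital A-H is matched.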
+
+ def get_timestampes(frame_indices, fps):
+     seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices))
+     timestamps = ", ".join(seconds)
+     return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format(
+         frame_num=len(frame_indices), timestamps=timestamps
+     )
+
+
+ def post_process_open(response):
+     model_result = -1
+
+     if response and response != FAIL_MSG:
+         json_start = response.find("```json")
+         json_end = response.find("```", json_start + len("```json"))
+
+         # If JSON content was found
+         if json_start != -1 and json_end != -1:
+             json_content = response[json_start + len("```json"):json_end].strip()
+         else:
+             json_content = ""
+
+         if json_content:
+             try:
+                 model_result = json.loads(json_content)["result"]
+             except Exception as e:
+                 print(f"Error in parsing JSON: {e}, {json_content}")
+
+     if model_result == -1:
+         model_result = response
+
+     return model_result
+
+
+ def post_process_eval_open(response, step):
+
+     model_result = -1
+
+     if response and response != FAIL_MSG:
+
+         json_start = response.find("```json")
+         json_end = response.find("```", json_start + len("```json"))
+
+         if json_start != -1 and json_end != -1:
+             json_content = response[json_start + len("```json"):json_end].strip()
+         else:
+             json_content = ""
+
+         if json_content:
+             try:
+                 model_result = json.loads(json_content)["result"]
+             except Exception as e:
+                 print(f"Error in parsing JSON: {e}, {json_content}")
+                 return -1
+     if model_result == -1:
+         if step == 1:
+             match = re.search(r"[012]", response)
+             if match:
+                 model_result = int(match.group())
+         else:
+             match = re.search(r"[01]", response)
+             if match:
+                 model_result = int(match.group())
+
+     return model_result
+
+
+ def eval_open_first(model, line):
+
+     user_prompt = ""
+
+     user_prompt += f"Question: {line['question']}\n\n"
+
+     user_prompt += f"The ground truth answer is '{line['answer']}'\n\n"
+
+     user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+     result = model.generate(user_prompt)
+
+     return result
+
+
+ def save_step_1_steps(data, step_1_results):
+
+     # Process all results
+     data["step_1_result"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 1))
+
+     # Conditional update: verdicts of -1/0/1 are final; 2 means step 2 is needed
+     mask = data["step_1_result"].isin([-1, 0, 1])
+     data.loc[mask, "step_2_result"] = data.loc[mask, "step_1_result"]
+     data.loc[mask, "score"] = data.loc[mask, "step_1_result"]
+
+     return data
+
+
+ def eval_open_second(model, line, frame_paths):
+
+     user_prompt = ""
+
+     user_prompt += f"Question: {line['question']}\n\n"
+
+     user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+     result = model.generate([user_prompt] + frame_paths)
+
+     return result
+
+
+ def save_step_2_steps(data, step_1_results):
+
+     # Process all results
+     data["score"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 2))
+
+     return data
+
+
+ def clue_frame_paths(clue_frame_root, qid, num_frames=8):
+     frame_root = osp.join(clue_frame_root, str(qid))
+     os.makedirs(frame_root, exist_ok=True)
+     return [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
+
+
+ def save_clue_video_frames(data_root, clue_frame_root, video, uid, clue_intervals=None, num_frames=8, fps=-1):
+
+     if not isinstance(uid, str):
+         uid = str(uid)
+
+     vid_path = osp.join(data_root, video)
+     vid = decord.VideoReader(vid_path)
+     vid_fps = vid.get_avg_fps()
+
+     if clue_intervals is not None:
+         # 1. Merge overlapping intervals
+         merged_intervals = merge_intervals(clue_intervals)
+
+         if num_frames > 0 and fps < 0:
+             # 2. Sample frames uniformly within the clue intervals
+             indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
+             frame_paths = clue_frame_paths(clue_frame_root, uid, len(indices))
+
+             # Save the frames
+             flag = np.all([osp.exists(p) for p in frame_paths])
+             if not flag:
+                 images = [vid[i].asnumpy() for i in indices]
+                 images = [Image.fromarray(arr) for arr in images]
+                 for im, pth in zip(images, frame_paths):
+                     if not osp.exists(pth):
+                         im.save(pth)
+
+             return frame_paths, indices, vid_fps
+
+
+ def get_chunk_number(filename):
+     try:
+         num = filename.split("chunk_")[1].split(".zip")[0]
+         return int(num)
+     except Exception:
+         return float('inf')
+
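+ # e.g., get_chunk_number('videos_chunk_3.zip') -> 3; names without a chunk number
+ # sort last via float('inf').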
+
+ def unzip_hf_zip(pth):
+
+     import zipfile
+
+     target_dir = pth
+
+     if os.path.exists(f"{target_dir}/cg_videos_720p") and os.path.exists(f"{target_dir}/cg_subtitles")\
+             and os.path.exists(f"{target_dir}/cg_clue_videos"):
+         print("all exists")
+         return
+
+     video_zip_files = [
+         os.path.join(target_dir, file)
+         for file in os.listdir(target_dir)
+         if file.endswith(".zip") and file.startswith("video")
+     ]
+
+     video_zip_files = sorted(video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
+
+     videos_temp_zip = os.path.join(target_dir, "videos_merged.zip")
+
+     print("Merging video files ...")
+
+     with open(videos_temp_zip, "wb") as outfile:
+         for video_zip_file in tqdm(video_zip_files, desc="Merging videos"):
+             with open(video_zip_file, "rb") as infile:
+                 outfile.write(infile.read())
+
+     print("Extracting video files...")
+
+     try:
+         with zipfile.ZipFile(videos_temp_zip, "r") as zip_ref:
+
+             total_files = len(zip_ref.namelist())
+
+             for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
+                 zip_ref.extract(file, target_dir)
+
+         print(f"Successfully extracted to {target_dir}")
+     except Exception as e:
+         print(f"Error during extraction: {e}")
+     finally:
+
+         if os.path.exists(videos_temp_zip):
+             os.remove(videos_temp_zip)
+             print("Cleaned up temporary video file")
+
+     clue_video_zip_files = [
+         os.path.join(target_dir, file)
+         for file in os.listdir(target_dir)
+         if file.endswith(".zip") and file.startswith("clue_video")
+     ]
+
+     clue_video_zip_files = sorted(clue_video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
+
+     clue_videos_temp_zip = os.path.join(target_dir, "clue_videos_merged.zip")
+
+     print("Merging clue video files ...")
+
+     with open(clue_videos_temp_zip, "wb") as outfile:
+         for clue_video_zip_file in tqdm(clue_video_zip_files, desc="Merging clue_videos"):
+             with open(clue_video_zip_file, "rb") as infile:
+                 outfile.write(infile.read())
+
+     print("Extracting clue video files...")
+
+     try:
+         with zipfile.ZipFile(clue_videos_temp_zip, "r") as zip_ref:
+
+             total_files = len(zip_ref.namelist())
+
+             for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
+                 zip_ref.extract(file, target_dir)
+
+         print(f"Successfully extracted to {target_dir}")
+     except Exception as e:
+         print(f"Error during extraction: {e}")
+     finally:
+
+         if os.path.exists(clue_videos_temp_zip):
+             os.remove(clue_videos_temp_zip)
+             print("Cleaned up temporary clue video file")
+
+     print("Extracting subtitle files ...")
+
+     subtitles_zip = os.path.join(target_dir, "subtitles.zip")
+
+     try:
+         with zipfile.ZipFile(subtitles_zip, "r") as zip_ref:
+
+             total_files = len(zip_ref.namelist())
+
+             for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
+                 zip_ref.extract(file, target_dir)
+
+         print(f"Successfully extracted to {target_dir}")
+     except Exception as e:
+         print(f"Error during extraction: {e}")
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/crpe.py ADDED
@@ -0,0 +1,13 @@
+ import json
+ import argparse
+ from collections import defaultdict
+
+
+ def is_correct(predict, answer):
+     # predict is the ground-truth answer; answer is the model's prediction
+     if len(answer) == 1:
+         return answer[0] == predict[0]
+     elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']:
+         return answer[0] == predict[0]
+     elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']:
+         return predict[4:].lower() in answer.lower()
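+ # e.g., is_correct('A. cat', 'A') -> True (single-letter predictions are matched on
+ # the first character); free-form predictions are checked by substring against the
+ # ground truth with its first four characters (the option prefix) stripped.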
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/hrbench.py ADDED
@@ -0,0 +1,54 @@
+ from ...smp import *
+ import os
+
+
+ def report_acc_hrbench(df):
+     cycle_group = df.groupby('cycle_category')
+     result_dic = defaultdict(list)
+     avg_dic = defaultdict(int)
+
+     count = 0
+     for key, data_value in cycle_group:
+         count += 1
+         _, resp_dic = hrbench_score(data_value)
+
+         for task_type, accuracy in resp_dic.items():
+             result_dic['cycle'].append(key)
+             result_dic['type'].append(task_type)
+             result_dic['accuracy'].append(accuracy)
+
+             avg_dic[task_type] += accuracy
+     for task_type, accuracy in avg_dic.items():
+         result_dic['cycle'].append('Average')
+         result_dic['type'].append(task_type)
+         result_dic['accuracy'].append(accuracy / count)
+     result_pd = pd.DataFrame(result_dic)
+
+     return result_pd
+
+
+ def hrbench_score(data):
+     ret = defaultdict(list)
+     resp_dic = {}
+     category_list = set(data['category'])
+     score_dict = defaultdict(list)
+
+     for i in range(len(data)):
+         d = data.iloc[i]
+         category = d['category']
+         gpt_score = d['hit']
+         score_dict[category].append(gpt_score)
+         score_dict['all'].append(gpt_score)
+
+     all_acc = np.mean(score_dict['all'])
+     ret['type'].append('all')
+     ret['acc'].append(all_acc)
+     resp_dic['all'] = all_acc
+     for cate in category_list:
+         acc = np.mean(score_dict[cate])
+         ret['type'].append(cate)
+         ret['acc'].append(acc)
+
+         resp_dic[cate] = acc
+
+     return pd.DataFrame(ret), resp_dic
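+ # Note: the 'Average' row in report_acc_hrbench is the mean accuracy over
+ # cycle_category groups, not over individual samples.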
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ from ...smp import load_env
+
+ INTERNAL = os.environ.get('INTERNAL', 0)
+
+
+ def build_judge(**kwargs):
+     from ...api import OpenAIWrapper, SiliconFlowAPI
+     model = kwargs.pop('model', None)
+     kwargs.pop('nproc', None)
+     load_env()
+     LOCAL_LLM = os.environ.get('LOCAL_LLM', None)
+     if LOCAL_LLM is None:
+         model_map = {
+             'gpt-4-turbo': 'gpt-4-1106-preview',
+             'gpt-4-0613': 'gpt-4-0613',
+             'gpt-4-0125': 'gpt-4-0125-preview',
+             'gpt-4-0409': 'gpt-4-turbo-2024-04-09',
+             'chatgpt-1106': 'gpt-3.5-turbo-1106',
+             'chatgpt-0125': 'gpt-3.5-turbo-0125',
+             'gpt-4o': 'gpt-4o-2024-05-13',
+             'gpt-4o-0806': 'gpt-4o-2024-08-06',
+             'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
+             'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct',
+             'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct',
+             'deepseek': 'deepseek-ai/DeepSeek-V2.5',
+         }
+         model_version = model_map[model]
+     else:
+         model_version = LOCAL_LLM
+
+     if model in ['qwen-7b', 'qwen-72b', 'deepseek']:
+         model = SiliconFlowAPI(model_version, **kwargs)
+     else:
+         model = OpenAIWrapper(model_version, **kwargs)
+     return model
+
+
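+ # Typical usage: judge = build_judge(model='gpt-4o-mini', temperature=0); any
+ # extra kwargs are forwarded to the underlying API wrapper.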
+ DEBUG_MESSAGE = """
+ To debug the OpenAI API, you can try the following script in Python:
+ ```python
+ from vlmeval.api import OpenAIWrapper
+ model = OpenAIWrapper('gpt-4o', verbose=True)
+ msgs = [dict(type='text', value='Hello!')]
+ code, answer, resp = model.generate_inner(msgs)
+ print(code, answer, resp)
+ ```
+ You can see the specific error if the API call fails.
+ """
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/llavabench.py ADDED
@@ -0,0 +1,65 @@
+ import numpy as np
+ import pandas as pd
+ from ...smp import *
+
+ rule_dict = {
+     'llava_bench_conv': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'},  # noqa: E501
+     'llava_bench_detail': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'},  # noqa: E501
+     'llava_bench_complex': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}  # noqa: E501
+ }
+
+
+ def get_eval(judge, content):
+     return judge.generate(content)
+
+
+ def parse_score(review):
+     logger = get_logger('Evaluation')
+     try:
+         score_pair = review.split('\n')[0]
+         score_pair = score_pair.replace(',', ' ')
+         sp = score_pair.split(' ')
+         if len(sp) == 2:
+             return [float(sp[0]), float(sp[1])]
+         else:
+             logger.error(f'Failed to parse score pair from review: {review}')
+             return [-1, -1]
+     except Exception as e:
+         logger.error(f'{e}: failed to parse review: {review}')
+         return [-1, -1]
+
+
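+ # e.g., parse_score('8 7\nBoth responses are helpful ...') -> [8.0, 7.0]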
+ def build_prompt(line):
+     cap_str = line['caption']
+     question = line['question']
+     ans1 = line['gpt4_ans']
+     ans2 = line['prediction']
+     category = 'llava_bench_' + line['category']
+     rule = rule_dict[category]
+     role, prompt = rule['role'], rule['prompt']
+
+     content = (f'[Context]\n{cap_str}\n\n'
+                f'[Question]\n{question}\n\n'
+                f'[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n'
+                f'[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n'
+                f'[System]\n{prompt}\n\n')
+     return content
+
+
+ def LLaVABench_atomeval(model, prompt):
+     review = get_eval(model, prompt)
+     scores = parse_score(review)
+     return scores
+
+
+ def LLaVABench_score(data):
+     cates = ['overall'] + list(set(data['category']))
+     ret = defaultdict(list)
+
+     for c in cates:
+         ret['split'].append(c)
+         sub = data[data['category'] == c] if c != 'overall' else data
+         ret['Relative Score (main)'].append(np.mean(sub['score']) / np.mean(sub['gpt4_score']) * 100)
+         ret['VLM Score'].append(np.mean(sub['score']) * 10)
+         ret['GPT4 Score'].append(np.mean(sub['gpt4_score']) * 10)
+     return pd.DataFrame(ret)
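+ # 'Relative Score (main)' follows the LLaVA-Bench convention: the VLM's mean judge
+ # score expressed as a percentage of the GPT-4 reference's mean judge score.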
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/logicvista.py ADDED
@@ -0,0 +1,150 @@
+ import pandas as pd
+
+ # from colorama import Fore, Back, Style
+ from ...smp import *
+
+
+ FAIL_MSG = 'Failed to obtain answer via API.'
+
+
+ def build_prompt_logicvista(line):
+     question = line['question']
+     prediction = str(line['prediction'])
+     tmpl = (
+         "You are an information extractor that extracts multiple-choice letter answer choices "
+         "from a paragraph that contains the answer choice and sometimes an explanation of why that "
+         "choice is correct for the given question.\n"
+         "What letter did the following answer choose? If the answer did not select a letter answer choice, "
+         "first try to infer the answer based off the given choices.\n"
+         "If it does not seem like the given answer corresponds to an answer choice OR if there is no selected answer, please just respond with Z.\n"
+         "Make sure you answer with ONLY the letters chosen.\n"
+         'Example 1: \n'
+         'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
+         'Answer: <start>\na cute teddy bear\n<end>\nYour output: A\n'
+         'Example 2: \n'
+         'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
+         'Answer: <start>\nSpider\n<end>\nYour output: Z\n'
+         'Example 3: \n'
+         'Question: <start>\nWhich figure is a rotation of the object?\n<end>\n'
+         'Answer: <start>\nThe figure on the right, labeled "D," is a rotation of the object shown in the top left corner.\n<end>\nYour output: D\n'
+         'Example 4: \n'
+         'Question: <start>\nWhich of the boxes comes next in the sequence? Select from A-E\n<end>\n'
+         'Answer: <start>\nThe sequence of the boxes is A, B, C, D, E.\n<end>\nYour output: ABCDE\n'
+         'Example 5: \n'
+         'Question: <start>\n{}\n<end>\nAnswer: <start>\n{}\n<end>\nYour output: '
+     )
+
+     return tmpl.format(question, prediction)
+
+
+ def LogicVista_auxeval(model, line):
+     prompt = build_prompt_logicvista(line)
+     print(prompt)
+     log = ''
+     retry = 5
+
+     for i in range(retry):
+         prediction = line['prediction']
+         res = model.generate(prompt, temperature=i * 0.5)
+         answer = line['answer'].split(", ")
+         for j in range(0, len(answer)):
+             answer[j] = answer[j].lower()
+         answer.sort()
+         answer = ''.join(answer)
+
+         if FAIL_MSG in res:
+             log += f'Try {i}: output is {prediction}, failed to parse.\n'
+         elif not res.isupper() or not res.isalpha():
+             log += f'Try {i}: output is {prediction}, failed to parse.\n'
+         else:
+             log += 'Succeed'
+             hit = 0
+             extracted = [alpha.lower() for alpha in res]
+             extracted.sort()
+             extracted = ''.join(extracted)
+             if extracted == answer:
+                 hit = 1
+             return dict(log=log, res=res, hit=hit)
+     log += 'All 5 retries failed.\n'
+     return dict(log=log, res='', hit=0)
+
+
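+ # Multi-letter answers are compared order-insensitively: a response of 'CA' and a
+ # ground truth of 'A, C' both normalize to 'ac' before the comparison above.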
+ cat = ["diagram", "ocr", "patterns", "graphs", "tables", "3d shapes", "puzzles", "sequences", "physics"]
+
+
+ def evaluate_logicvista(file_path):
+     df = pd.read_excel(file_path)
+
+     tot = defaultdict(lambda: 0)
+     hit = defaultdict(lambda: 0)
+     acc = defaultdict(lambda: 0)
+
+     lt = len(df)
+     skill_list = []
+
+     df_tot = df
+
+     df_inductive = df[df["skill"].str.contains("inductive")]
+     df_deductive = df[df["skill"].str.contains("deductive")]
+     df_numerical = df[df["skill"].str.contains("numerical")]
+     df_spatial = df[df["skill"].str.contains("spatial")]
+     df_mechanical = df[df["skill"].str.contains("mechanical")]
+
+     tot_correct = df_tot["hit"].sum()
+     tot_acc = (tot_correct / df_tot.shape[0]) * 100
+     tot['Overall'] = df_tot.shape[0]
+     hit['Overall'] = tot_correct
+     acc['Overall'] = tot_acc
+
+     inductive_correct = df_inductive["hit"].sum()
+     inductive_acc = (inductive_correct / df_inductive.shape[0]) * 100
+
+     tot["inductive"] = df_inductive.shape[0]
+     hit["inductive"] = inductive_correct
+     acc["inductive"] = inductive_acc
+
+     deductive_correct = df_deductive["hit"].sum()
+     deductive_acc = (deductive_correct / df_deductive.shape[0]) * 100
+
+     tot["deductive"] = df_deductive.shape[0]
+     hit["deductive"] = deductive_correct
+     acc["deductive"] = deductive_acc
+
+     numerical_correct = df_numerical["hit"].sum()
+     numerical_acc = (numerical_correct / df_numerical.shape[0]) * 100
+
+     tot["numerical"] = df_numerical.shape[0]
+     hit["numerical"] = numerical_correct
+     acc["numerical"] = numerical_acc
+
+     spatial_correct = df_spatial["hit"].sum()
+     spatial_acc = (spatial_correct / df_spatial.shape[0]) * 100
+
+     tot["spatial"] = df_spatial.shape[0]
+     hit["spatial"] = spatial_correct
+     acc["spatial"] = spatial_acc
+
+     mechanical_correct = df_mechanical["hit"].sum()
+     mechanical_acc = (mechanical_correct / df_mechanical.shape[0]) * 100
+
+     tot["mechanical"] = df_mechanical.shape[0]
+     hit["mechanical"] = mechanical_correct
+     acc["mechanical"] = mechanical_acc
+
+     # capability dimension, the official data json does not contain 'capability' column, so it is now ignored
+     # for i in cat:
+     #     curr = df[df["capability"].str.contains(i.replace(" ", ""))]
+     #     correct = curr["hit"].sum()
+     #     accuracy = (correct / curr.shape[0]) * 100
+     #     tot[i] = curr.shape[0]
+     #     hit[i] = correct
+     #     acc[i] = accuracy
+
+     res = defaultdict(list)
+     for k in tot.keys():
+         res['Task&Skill'].append(k)
+         res['tot'].append(tot[k])
+         res['hit'].append(hit[k])
+         res['acc'].append(acc[k])
+     res = pd.DataFrame(res)
+     return res
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/longvideobench.py ADDED
@@ -0,0 +1,80 @@
+ from ...smp import *
+ from .multiple_choice import extract_answer_from_item
+ import numpy as np
+ import re
+
+ FAIL_MSG = 'Failed to obtain answer via API.'
+
+ DURATIONS = [15, 60, 600, 3600]
+ TASK_CATEGORIES = [
+     "S2E", "S2O", "S2A",
+     "E2O", "O2E", "T2E",
+     "T2O", "T2A", "E3E",
+     "O3O", "SSS", "SOS",
+     "SAA", "T3E", "T3O",
+     "TOS", "TAA"
+ ]
+
+
+ def get_dimension_rating(data_path):
+     data = load(data_path)
+     print(data.iloc[0])
+
+     duration_rating = {k: {} for k in DURATIONS}
+     for duration in DURATIONS + ['overall']:
+         duration_rating[duration] = {
+             'overall': '',
+             'question_category': {k: [] for k in TASK_CATEGORIES}
+         }
+
+     for i in range(len(data)):
+
+         task_ctg = data.iloc[i]['question_category']
+
+         duration = data.iloc[i]['duration_group']
+         duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score'])
+
+         duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score'])
+
+     for duration in DURATIONS + ['overall']:
+         overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}'  # noqa: E501
+         duration_rating[duration]['overall'] = overall_res_dur
+         for task_ctg in TASK_CATEGORIES:
+             task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}'  # noqa: E501
+             duration_rating[duration]['question_category'][task_ctg] = task_res_dur
+
+     return duration_rating
+
+
+ def extract_option(model, input_item, dataset_name):
+     options = input_item['question'].split('\n')[1:]
+     for id, option in enumerate(options):
+         option_id = chr(ord('A') + id) + '.'
+         if option.find(option_id) >= 0:
+             input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n')
+     return extract_answer_from_item(model, input_item, dataset_name)['opt']
+
+
+ def extract_characters_regex(s):
+     s = s.strip()
+     answer_prefixes = [
+         'The best answer is',
+         'The correct answer is',
+         'The answer is',
+         'The answer',
+         'The best option is',
+         'The correct option is',
+         'Best answer:',
+         'Best option:',
+         'Answer:',
+         'Option:',
+     ]
+     for answer_prefix in answer_prefixes:
+         s = s.replace(answer_prefix, '')
+
+     if len(s.split()) > 10 and not re.search('[ABCDE]', s):
+         return ''
+     matches = re.search(r'[ABCDE]', s)
+     if matches is None:
+         return ''
+     return matches[0]
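+ # e.g., extract_characters_regex('The best answer is (B).') -> 'B'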
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py ADDED
@@ -0,0 +1,171 @@
+ from ...smp import *
+ from ...utils import can_infer
+ try:
+     from latex2sympy2 import latex2sympy
+ except Exception as e:
+     logging.critical(f'{type(e)}: {e}')
+     logging.critical('Please install latex2sympy2 by running "pip install latex2sympy2"')
+
+ FAIL_MSG = 'Failed to obtain answer via API.'
+
+
+ def is_equal(asw: str, gt_asw: str) -> bool:
+     if not isinstance(asw, str) or not isinstance(gt_asw, str):
+         print('Warning: input is not string')
+         print(asw, gt_asw)
+     asw = str(asw).lower().strip()
+     gt_asw = str(gt_asw).lower().strip()
+     if gt_asw == asw:
+         return True
+     try:
+         a = eval(gt_asw)
+         b = eval(asw)
+         if abs(a - b) < 1e-6:
+             return True
+     except Exception:
+         pass
+     try:
+         a = latex2sympy(gt_asw)
+         b = latex2sympy(asw)
+         if abs(eval(str(a)) - eval(str(b))) < 1e-6:
+             return True
+         if abs(a - b) < 1e-6:
+             return True
+     except Exception:
+         pass
+     return False
+
+
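+ # e.g., is_equal('0.50', '1/2') -> True via the numeric branch, since
+ # abs(eval('1/2') - eval('0.50')) < 1e-6.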
+ def get_gpt4_ICE():
+     example_1 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: Which number is missing?\n
+ Model response: The number missing in the sequence is 14.\n
+ Extracted answer: 14
+ """
+
+     example_2 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: What is the fraction of females facing the camera?\n
+ Model response: The fraction of females facing the camera is 0.6,
+ which means that six out of ten females in the group are facing the camera.\n
+ Extracted answer: 0.6
+ """
+
+     example_3 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
+ Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
+ Extracted answer: 1.45
+ """
+
+     example_4 = """
+ Hint: Please answer the question and provide the final answer at the end.\n
+ Question: Between which two years does the line graph saw its maximum peak?\n
+ Model response: The line graph saw its maximum peak between 2007 and 2008.\n
+ Extracted answer: [2007, 2008]
+ """
+
+     example_5 = """
+ Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
+ Question: What fraction of the shape is blue?\n
+ Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
+ Model response: The correct answer is (B) 8/11.\n
+ Extracted answer: B
+ """
+
+     return [example_1, example_2, example_3, example_4, example_5]
+
+
+ def build_mathv_gpt4_prompt(line):
+     task_description = """
+ Please read the following example.
+ Then extract the answer from the model response and type it at the end of the prompt.\n
+ """
+     question = line['question']
+     prediction = str(line['prediction'])
+     prompt = task_description
+     examples = get_gpt4_ICE()
+     for example in examples:
+         prompt += example + '\n'
+     prompt += question + '\n'
+     prompt += 'Model response: ' + prediction + '\n'
+     prompt += 'Extracted answer:'
+     return prompt
+
+
+ def list_to_dict(lst):
+     return {chr(65 + i): val for i, val in enumerate(lst)}
+
+
+ def post_check(line, prefetch=False):
+     res = None
+     ans = line['answer']
+     response = line['prediction'] if prefetch else line['res']
+     try:
+         if len(eval(line['choices'])) > 0:
+             ans = line['answer']
+             choices = list_to_dict(eval(line['choices']))
+             res = can_infer(response, choices)
+             if prefetch:
+                 return res
+         else:
+             res = str(response)
+             ans = str(ans)
+     except ValueError:
+         pass
+
+     if is_equal(res, ans):
+         return res if prefetch else True
+     else:
+         return False
+
+
+ def MATH_V_auxeval(model, line):
+     prompt = build_mathv_gpt4_prompt(line)
+     log = ''
+     retry = 5
+     if post_check(line, prefetch=True):
+         res = post_check(line, prefetch=True)
+         return dict(log='Prefetch succeed', res=res)
+     for i in range(retry):
+         prediction = line['prediction']
+         res = model.generate(prompt, temperature=i * 0.5)
+
+         if FAIL_MSG in res:
+             log += f'Try {i}: output is {prediction}, failed to parse.\n'
+         else:
+             log += 'Succeed'
+             return dict(log=log, res=res)
+     log += 'All 5 retries failed.\n'
+     return dict(log=log, res='')
+
+
+ def MATH_V_acc(result_file):
+     data = load(result_file)
+     tot = defaultdict(lambda: 0)
+     fetch = defaultdict(lambda: 0)
+     hit = defaultdict(lambda: 0)
+     lt = len(data)
+     for i in range(lt):
+         item = data.iloc[i]
+         cate = item['category']
+         tot['Overall'] += 1
+         tot[cate] += 1
+         if item['log'] == 'Prefetch succeed':
+             fetch['Overall'] += 1
+             fetch[cate] += 1
+         if post_check(item, prefetch=False):
+             hit['Overall'] += 1
+             hit[cate] += 1
+
+     res = defaultdict(list)
+     for k in tot.keys():
+         res['Subject'].append(k)
+         res['tot'].append(tot[k])
+         res['prefetch'].append(fetch[k])
+         res['hit'].append(hit[k])
+         res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
+         res['acc'].append(hit[k] / tot[k] * 100)
+     res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
+     return res
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathverse.py ADDED
@@ -0,0 +1,193 @@
1
+ from ...smp import *
2
+ from ...utils import can_infer
3
+
4
+
5
+ FAIL_MSG = 'Failed to obtain answer via API.'
6
+
7
+
8
+ def get_gpt4_extract_ICE():
9
+ example_1 = """
10
+ 1.
11
+ Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
12
+ Extracted Answer: (-2, 1)
13
+ """ # noqa
14
+
15
+ example_2 = """
16
+ 2.
17
+ Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
18
+ Extracted Answer: D
19
+ """ # noqa
20
+
21
+ example_3 = """
22
+ 3.
23
+ Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
24
+ Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
25
+ """ # noqa
26
+
27
+ example_4 = """
28
+ 4.
29
+ Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
30
+ Extracted Answer: null
31
+ """ # noqa
32
+
33
+ example_5 = """
34
+ 5.
35
+ Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
36
+ Extracted answer: 22.3
37
+ """ # noqa
38
+
39
+ example_6 = """
40
+ 6.
41
+ Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
42
+ Extracted answer: f(x) = -x^2 - 2x + 1
43
+ """ # noqa
44
+
45
+ return [example_1, example_2, example_3, example_4, example_5, example_6]
46
+
47
+
48
+ def get_gpt4_score_ICE():
49
+ example_1 = """
50
+ [Question]: Write the set of numbers represented on the number line in interval notation.
51
+ [Standard Answer]: (-2,1]
52
+ [Model_answer] : Extracted Answer: \\((-2, 1)\\)
53
+ Judgement: 0
54
+ """ # noqa
55
+
56
+ example_2 = """
57
+ [Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
58
+ [Standard Answer]: C
59
+ [Model_answer] : B:2\u221a{{3}}
60
+ Judgement: 0
61
+ """ # noqa
62
+
63
+ example_3 = """
64
+ [Question]: Find the domain and range of the function f using interval notation.
65
+ [Standard Answer]: domain: [-4, 0) and range: (-3, 1]
66
+ [Model_answer] : Range: \\((-4, 1]\\)
67
+ Judgement: 0
68
+ """ # noqa
69
+
70
+ example_4 = """
71
+ [Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
72
+ [Standard Answer]: C
73
+ [Model_answer] : null
74
+ Judgement: 0
75
+ """ # noqa
76
+
77
+ return [example_1, example_2, example_3, example_4]
78
+
79
+
80
+ def build_mathverse_gpt4_extract_prompt(line):
81
+ task_description = """
82
+ I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.\n\n
83
+ """ # noqa
84
+ prediction = str(line['prediction'])
85
+ demo_prompt = task_description
86
+ examples = get_gpt4_extract_ICE()
87
+ for example in examples:
88
+ demo_prompt += example + '\n\n'
89
+ test_prompt = f"Model response: '{prediction}'\nExtracted Answer: "
90
+ full_prompt = f'{demo_prompt}7.\n{test_prompt}'
91
+
92
+ return full_prompt
93
+
94
+
95
+ def build_mathverse_gpt4_score_prompt(line):
96
+ task_description = """
97
+ Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
98
+ Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
99
+ If they are consistent, Judement is 1; if they are different, Judement is 0.\n\n
100
+ """ # noqa
101
+ question_for_eval = line['question_for_eval']
102
+ extract = line['extract']
103
+ answer = line['answer']
104
+ demo_prompt = task_description
105
+ examples = get_gpt4_score_ICE()
106
+ for example in examples:
107
+ demo_prompt += example + '\n\n'
108
+ test_prompt = f"""
109
+ [Question]: {question_for_eval}
110
+ [Standard Answer]: {answer}
111
+ [Model_answer] : {extract}
112
+ Judgement:"""
113
+ full_prompt = f'{demo_prompt}{test_prompt}'
114
+
115
+ return full_prompt
116
+
117
+
118
+ def post_check_score(line, prefetch=False):
119
+ ans = str(line['answer']).strip()
120
+ response = str(line['extract']).strip()
121
+
122
+ if response == ans:
123
+ return response if prefetch else True
124
+ else:
125
+ return False
126
+
127
+
128
+ def MathVerse_auxeval_extract(model, line):
129
+ prompt = build_mathverse_gpt4_extract_prompt(line)
130
+ log = ''
131
+ retry = 5
132
+ for i in range(retry):
133
+ prediction = line['prediction']
134
+ res = model.generate(prompt, temperature=i * 0.5)
135
+
136
+ if FAIL_MSG in res:
137
+ log += f'Try {i}: output is {prediction}, failed to parse.\n'
138
+ else:
139
+ log += 'Succeed'
140
+ return dict(log_extract=log, extract=res)
141
+ log += 'All 5 retries failed.\n'
142
+ return dict(log_extract=log, extract='')
143
+
144
+
145
+ def MathVerse_auxeval_score(model, line):
146
+ prompt = build_mathverse_gpt4_score_prompt(line)
147
+ log = ''
148
+ retry = 5
149
+ res = post_check_score(line, prefetch=True)
150
+ if res:
151
+ return dict(log_score='Prefetch succeed', score=True)
152
+ for i in range(retry):
153
+ prediction = line['prediction']
154
+ res = model.generate(prompt, temperature=i * 0.5)
155
+
156
+ if FAIL_MSG in res or res.strip() not in ['0', '1']:
157
+ log += f'Try {i}: output is {prediction}, res is {res}, failed to parse.\n'
158
+ else:
159
+ log += 'Succeed'
160
+ return dict(log_score=log, score=int(res) == 1)
161
+ log += 'All 5 retries failed.\n'
162
+ return dict(log_score=log, score=False)
163
+
164
+
165
+ def MathVerse_acc(result_file):
166
+ df = load(result_file)
167
+
168
+ df['metadata'] = df['metadata'].apply(lambda x: x.replace("'", '"'))
169
+ df['metadata'] = df['metadata'].apply(json.loads)
170
+ df_metadata = pd.json_normalize(df['metadata'])
171
+ df = pd.concat([df.drop('metadata', axis=1), df_metadata], axis=1)
172
+
173
+ subset = list(set(df['problem_version'])) + ['Overall']
174
+
175
+ res = defaultdict(list)
176
+ for p in subset:
177
+ if p != 'Overall':
178
+ sub = df[df['problem_version'] == p]
179
+ else:
180
+ sub = cp.deepcopy(df)
181
+ res['split'].append(p)
182
+ # Overall Acc
183
+ res['Overall'].append(np.mean(sub['score']) * 100)
184
+ # Subject
185
+ subjects = set(df['subject'])
186
+ for k in subjects:
187
+ res[k].append(np.mean(sub[sub['subject'] == k]['score']) * 100)
188
+ # Subfield
189
+ subfields = set(df['subfield'])
190
+ for k in subfields:
191
+ res[k].append(np.mean(sub[sub['subfield'] == k]['score']) * 100)
192
+
193
+ return pd.DataFrame(res)
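
Taken together, the helpers above implement a two-stage judge: the extraction prompt first distills a free-form model response into a short answer, and the scoring prompt then compares that answer with the ground truth. The sketch below shows how the two stages chain over a single record; the `judge` object (anything exposing `.generate(prompt, temperature=...)`, e.g. the result of `build_judge`) and the field values are assumptions for illustration, not part of this file.

# A minimal sketch of the two-stage judging flow (hypothetical `judge` object).
line = {
    'prediction': 'Therefore, the distance d is approximately 22.3 meters.',
    'question_for_eval': 'Find the distance d between Ned and Bart.',
    'answer': '22.3',
}

# Stage 1: extract a short answer from the raw response.
extract_out = MathVerse_auxeval_extract(judge, line)
line['extract'] = extract_out['extract']   # e.g. '22.3'

# Stage 2: judge consistency between the extracted and standard answers.
score_out = MathVerse_auxeval_score(judge, line)
print(score_out['score'])                  # True when the judge outputs '1'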
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathvista.py ADDED
@@ -0,0 +1,164 @@
1
+ from ...smp import *
2
+ from ...utils import can_infer
3
+
4
+
5
+ FAIL_MSG = 'Failed to obtain answer via API.'
6
+
7
+
8
+ def get_gpt4_ICE():
9
+ example_1 = """
10
+ Hint: Please answer the question requiring an integer answer and provide the final value,
11
+ e.g., 1, 2, 3, at the end.\n
12
+ Question: Which number is missing?\n
13
+ Model response: The number missing in the sequence is 14.\n
14
+ Extracted answer: 14
15
+ """
16
+
17
+ example_2 = """
18
+ Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value,
19
+ e.g., 1.2, 1.3, 1.4, at the end.\n
20
+ Question: What is the fraction of females facing the camera?\n
21
+ Model response: The fraction of females facing the camera is 0.6,
22
+ which means that six out of ten females in the group are facing the camera.\n
23
+ Extracted answer: 0.6
24
+ """
25
+
26
+ example_3 = """
27
+ Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value,
28
+ e.g., 1.23, 1.34, 1.45, at the end.\n
29
+ Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
30
+ Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
31
+ Extracted answer: 1.45
32
+ """
33
+
34
+ example_4 = """
35
+ Hint: Please answer the question requiring a Python list as an answer and provide the final list,
36
+ e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.\n
37
+ Question: Between which two years does the line graph saw its maximum peak?\n
38
+ Model response: The line graph saw its maximum peak between 2007 and 2008.\n
39
+ Extracted answer: [2007, 2008]
40
+ """
41
+
42
+ example_5 = """
43
+ Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
44
+ Question: What fraction of the shape is blue?\n
45
+ Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
46
+ Model response: The correct answer is (B) 8/11.\n
47
+ Extracted answer: B
48
+ """
49
+
50
+ return [example_1, example_2, example_3, example_4, example_5]
51
+
52
+
53
+ def build_mathvista_gpt4_prompt(line):
54
+ task_description = """
55
+ Please read the following example.
56
+ Then extract the answer from the model response and type it at the end of the prompt.\n
57
+ """
58
+ question = line['question']
59
+ prediction = str(line['prediction'])
60
+ prompt = task_description
61
+ examples = get_gpt4_ICE()
62
+ for example in examples:
63
+ prompt += example + '\n'
64
+ prompt += question + '\n'
65
+ prompt += 'Model response: ' + prediction + '\n'
66
+ prompt += 'Extracted answer:'
67
+ return prompt
68
+
69
+
70
+ def list_to_dict(lst):
71
+ return {chr(65 + i): val for i, val in enumerate(lst)}
72
+
73
+
74
+ def post_check(line, prefetch=False):
75
+ res = None
76
+ ans = line['answer']
77
+ response = line['prediction'] if prefetch else line['res']
78
+ try:
79
+ if line['question_type'] == 'multi_choice':
80
+ ans = line['answer_option']
81
+ choices = list_to_dict(eval(line['choices']))
82
+ res = can_infer(response, choices)
83
+ if prefetch:
84
+ return res
85
+ else:
86
+ if line['answer_type'] == 'integer':
87
+ res = int(response)
88
+ ans = int(line['answer'])
89
+ elif line['answer_type'] == 'float':
90
+ res = float(response)
91
+ ans = float(line['answer'])
92
+ else:
93
+ res = str(res)
94
+ ans = str(ans)
95
+ except ValueError:
96
+ pass
97
+
98
+ if res == ans:
99
+ return res if prefetch else True
100
+ else:
101
+ return False
102
+
103
+
104
+ def MathVista_auxeval(model, line):
105
+ prompt = build_mathvista_gpt4_prompt(line)
106
+ log = ''
107
+ retry = 5
108
+ if post_check(line, prefetch=True):
109
+ res = post_check(line, prefetch=True)
110
+ return dict(log='Prefetch succeed', res=res)
111
+ for i in range(retry):
112
+ prediction = line['prediction']
113
+ res = model.generate(prompt, temperature=i * 0.5)
114
+
115
+ if FAIL_MSG in res:
116
+ log += f'Try {i}: output is {prediction}, failed to parse.\n'
117
+ else:
118
+ log += 'Succeed'
119
+ return dict(log=log, res=res)
120
+ log += 'All 5 retries failed.\n'
121
+ return dict(log=log, res='')
122
+
123
+
124
+ def MathVista_acc(result_file):
125
+ data = load(result_file)
126
+ tot = defaultdict(lambda: 0)
127
+ fetch = defaultdict(lambda: 0)
128
+ hit = defaultdict(lambda: 0)
129
+ lt = len(data)
130
+ skill_list = []
131
+ for i in range(lt):
132
+ item = data.iloc[i]
133
+ cate = item['task']
134
+ tot['Overall'] += 1
135
+ try:
136
+ skills = eval(item['skills'])
137
+ except SyntaxError:
138
+ skills = [item['skills']]
139
+ for skill in skills:
140
+ if skill not in skill_list:
141
+ skill_list.append(skill)
142
+ tot[skill] += 1
143
+ tot[cate] += 1
144
+ if item['log'] == 'Prefetch succeed':
145
+ fetch['Overall'] += 1
146
+ fetch[cate] += 1
147
+ for skill in skills:
148
+ fetch[skill] += 1
149
+ if post_check(item, prefetch=False):
150
+ hit['Overall'] += 1
151
+ hit[cate] += 1
152
+ for skill in skills:
153
+ hit[skill] += 1
154
+
155
+ res = defaultdict(list)
156
+ for k in tot.keys():
157
+ res['Task&Skill'].append(k)
158
+ res['tot'].append(tot[k])
159
+ res['prefetch'].append(fetch[k])
160
+ res['hit'].append(hit[k])
161
+ res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
162
+ res['acc'].append(hit[k] / tot[k] * 100)
163
+ res = pd.DataFrame(res)
164
+ return res
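
Before any judge call, `MathVista_auxeval` attempts a rule-based prefetch: for multiple-choice items, `list_to_dict` maps the choice list onto option letters and `can_infer` tries to match the raw prediction against them. A small illustration; the values are made up for the sketch:

# Rule-based prefetch on a multiple-choice item; no judge model is needed here.
choices = list_to_dict(['3/11', '8/11', '6/11', '3/5'])
# -> {'A': '3/11', 'B': '8/11', 'C': '6/11', 'D': '3/5'}

line = {
    'question_type': 'multi_choice',
    'choices': "['3/11', '8/11', '6/11', '3/5']",
    'answer_option': 'B',
    'answer': '8/11',
    'prediction': 'The correct answer is (B) 8/11.',
}
print(post_check(line, prefetch=True))   # 'B' (truthy), so the GPT judge is skipped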
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mlvu.py ADDED
@@ -0,0 +1,189 @@
1
+ from ...smp import *
2
+ from .multiple_choice import extract_answer_from_item
3
+ from PIL import Image, ImageOps
4
+ import numpy as np
5
+
6
+ FAIL_MSG = 'Failed to obtain answer via API.'
7
+
8
+ system_prompt_sub_scene = """
9
+ ##TASK DESCRIPTION:
10
+ You are required to evaluate a respondent's answer based on a provided question, some scoring points, and the respondent's answer. You should provide two scores. The first is the accuracy score, which should range from 1 to 5. The second is the relevance score, which should also range from 1 to 5. Below are the criteria for each scoring category.
11
+ ##ACCURACY Scoring Criteria:
12
+ Evaluate the respondent's answer against specific scoring points as follows:
13
+ Score 1: The response completely misses the scoring point.
14
+ Score 3: The response mentions content related to the scoring point but is not entirely correct.
15
+ Score 5: The response accurately addresses the scoring point.
16
+ Calculate the average score across all scoring points to determine the final accuracy score.
17
+ ##RELEVANCE Scoring Criteria:
18
+ Assess how the respondent's answer relates to the original question:
19
+ Score 1: The response is completely off-topic from the question.
20
+ Score 2: The response is partially related to the question but contains a significant amount of irrelevant content.
21
+ Score 3: The response primarily addresses the question, but the respondent seems uncertain about their own answer.
22
+ Score 4: The response mostly addresses the question and the respondent appears confident in their answer.
23
+ Score 5: The response is fully focused on addressing the question with no irrelevant content and demonstrates complete certainty.
24
+ ----
25
+ ##INSTRUCTION:
26
+ 1. Evaluate Accuracy: First, assess and score each scoring point based on the respondent's answer. Calculate the average of these scores to establish the final accuracy score. Provide a detailed rationale before assigning your score.
27
+ 2. Evaluate RELEVANCE: Assess the relevance of the respondent’s answer to the question. Note that when evaluating relevance, the correctness of the answer is not considered; focus solely on how relevant the answer is to the question. Provide a comprehensive rationale before assigning your score.
28
+ 3. Output Scores in JSON Format: Present the scores in JSON format as follows:
29
+ {'score_accuracy': score_acc, 'score_relevance': score_rele, 'total_score': score_acc + score_rele}
30
+ """ # noqa
31
+
32
+ system_prompt_summary = """
33
+ ##TASK DESCRIPTION:
34
+ You are required to evaluate the performance of the respondent in the video summarization task based on the standard answer and the respondent's answer. You should provide two scores. The first is the COMPLETENESS score, which should range from 1 to 5. The second is the RELIABILITY score, which should also range from 1 to 5. Below are the criteria for each scoring category:
35
+ ##COMPLETENESS Scoring Criteria:
36
+ The completeness score focuses on whether the summary covers all key points and main information from the video.
37
+ Score 1: The summary hardly covers any of the main content or key points of the video.
38
+ Score 2: The summary covers some of the main content and key points but misses many.
39
+ Score 3: The summary covers most of the main content and key points.
40
+ Score 4: The summary is very comprehensive, covering most to nearly all of the main content and key points.
41
+ Score 5: The summary completely covers all the main content and key points of the video.
42
+ ##RELIABILITY Scoring Criteria:
43
+ The reliability score evaluates the correctness and clarity of the video summary. It checks for factual errors, misleading statements, and contradictions with the video content. If the respondent's answer includes details that are not present in the standard answer, as long as these details do not conflict with the correct answer and are reasonable, points should not be deducted.
44
+ Score 1: Contains multiple factual errors and contradictions; presentation is confusing.
45
+ Score 2: Includes several errors and some contradictions; needs clearer presentation.
46
+ Score 3: Generally accurate with minor errors; minimal contradictions; reasonably clear presentation.
47
+ Score 4: Very accurate with negligible inaccuracies; no contradictions; clear and fluent presentation.
48
+ Score 5: Completely accurate with no errors or contradictions; presentation is clear and easy to understand.
49
+ ----
50
+ ##INSTRUCTION:
51
+ 1. Evaluate COMPLETENESS: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
52
+ 2. Evaluate RELIABILITY: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
53
+ 3. Output Scores in JSON Format: Present the scores in JSON format as follows:
54
+ {'score_completeness': score_comp, 'score_reliability': score_reli, 'total_score': score_comp + score_reli}
55
+ """ # noqa
56
+
57
+
58
+ def check_ans_with_model(pred, gt, model, item, dataset_name='MLVU_MCQ'):
59
+ flag = False
60
+
61
+ index = gt.index("(") # noqa
62
+ index2 = gt.index(")") # noqa
63
+ gt_option = gt[index + 1: index2]
64
+
65
+ if ")" in pred:
66
+ index3 = pred.index(")")
67
+ pred = pred[index3 - 1: index3]
68
+ if pred == gt_option:
69
+ flag = True
70
+ elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
71
+ flag = True
72
+
73
+ return flag
74
+
75
+
76
+ def extract_scores_summary(text):
77
+ # Define the keys to locate in the text
78
+ keys = ["score_completeness", "score_reliability"]
79
+ scores = []
80
+
81
+ for key in keys:
82
+ # Find the index where each key starts
83
+ start_index = text.find(key)
84
+ if start_index == -1:
85
+ continue # Skip if key is not found
86
+
87
+ # Find the start of the number which is after the colon and space
88
+ start_number_index = text.find(":", start_index) + 2
89
+ end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
90
+
91
+ # Extract and convert the number to float
92
+ score = float(text[start_number_index:end_number_index])
93
+ scores.append(score)
94
+
95
+ return scores
96
+
97
+
98
+ def check_ans_with_model_summary(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
99
+ user_prompt = f"""
100
+ Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
101
+ Standard Answer: {gt}
102
+ Respondent's Answer: {pred}
103
+ """ # noqa
104
+ result = model.generate(user_prompt)
105
+ result = extract_scores_summary(result)
106
+ result = np.sum(result)
107
+ return result
108
+
109
+
110
+ def extract_scores_sub_scene(text):
111
+ # Define the keys to locate in the text
112
+ keys = ["score_accuracy", "score_relevance"]
113
+ scores = []
114
+
115
+ for key in keys:
116
+ # Find the index where each key starts
117
+ start_index = text.find(key)
118
+ if start_index == -1:
119
+ continue # Skip if key is not found
120
+
121
+ # Find the start of the number which is after the colon and space
122
+ start_number_index = text.find(":", start_index) + 2
123
+ end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
124
+
125
+ # Extract and convert the number to float
126
+ score = float(text[start_number_index:end_number_index])
127
+ scores.append(score)
128
+
129
+ return scores
130
+
131
+
132
+ def check_ans_with_model_sub_scene(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
133
+ user_prompt = f"""
134
+ Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
135
+ Question: {item['question']}
136
+ Scoring Points: {item['scoring_points']}
137
+ Respondent's Answer: {pred}
138
+ """ # noqa
139
+ result = model.generate(user_prompt)
140
+ result = extract_scores_sub_scene(result)
141
+ result = np.sum(result)
142
+ return result
143
+
144
+
145
+ def MLVU_OpenEnded_generate(model, line):
146
+ task_type = line['task_type']
147
+ if task_type == 'summary':
148
+ user_prompt = (
149
+ f"Please score the respondent's answer according to the steps in the Instructions. "
150
+ f"You must end with a JSON dict to store the scores.\n"
151
+ f"Standard Answer: {line['answer']}\n"
152
+ f"Respondent's Answer: {line['prediction']}\n"
153
+ )
154
+ elif task_type == 'sub_scene':
155
+ user_prompt = (
156
+ f"Please score the respondent's answer according to the steps in the Instructions. "
157
+ f"You must end with a JSON dict to store the scores.\n"
158
+ f"Question: {line['question']}\n"
159
+ f"Scoring Points: {line['scoring_points']}\n"
160
+ f"Respondent's Answer: {line['prediction']}\n"
161
+ )
162
+ else:
163
+ AssertionError(f'MLVU don\'t have {task_type} open ended task!')
164
+ result = model.generate(user_prompt)
165
+ return result
166
+
167
+
168
+ def MLVU_OpenEnded_extract(gpt_generate_data, org_data):
169
+ extract_func = {
170
+ 'sub_scene': extract_scores_sub_scene,
171
+ 'summary': extract_scores_summary
172
+ }
173
+ for idx, item in org_data.iterrows():
174
+ func = extract_func[item['task_type']]
175
+ text = gpt_generate_data[idx]
176
+ org_data.loc[idx, 'score'] = np.sum(func(text))
177
+
178
+ return org_data
179
+
180
+
181
+ def get_dimension_rating(data_path):
182
+ data = load(data_path)
183
+ result_dict = {}
184
+ for idx, item in data.iterrows():
185
+ if item['task_type'] not in result_dict:
186
+ result_dict[item['task_type']] = [0, 0]
187
+ result_dict[item['task_type']][0] += int(item['score'])
188
+ result_dict[item['task_type']][1] += 1
189
+ return result_dict
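
`get_dimension_rating` returns a `[score_sum, count]` pair per task type rather than a mean. A tiny helper like the following (not part of the file, shown only for illustration) turns the result into per-task averages:

def mean_rating(result_dict):
    """Convert {task_type: [score_sum, count]} into {task_type: mean_score}."""
    return {task: (total / cnt if cnt else 0.0)
            for task, (total, cnt) in result_dict.items()}

# e.g. {'summary': [42, 10], 'sub_scene': [35, 7]} -> {'summary': 4.2, 'sub_scene': 5.0}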
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmbench_video.py ADDED
@@ -0,0 +1,70 @@
1
+ from ...smp import *
2
+ import numpy as np
3
+
4
+ FAIL_MSG = 'Failed to obtain answer via API.'
5
+
6
+ system_prompt = """
7
+ As an AI assistant, your task is to evaluate a candidate answer in comparison to a given correct answer.
8
+ The question itself, the correct 'groundtruth' answer, and the candidate answer will be provided to you.
9
+ Your assessment should range from 0 to 3, \
10
+ based solely on the semantic similarity between the groundtruth and the candidate answer, \
11
+ disregarding any grammatical differences.
12
+ A rating of 0 suggests no similarity, implying the candidate answer is entirely incorrect.
13
+ A rating of 1 suggests low similarity, meaning the candidate answer is largely incorrect.
14
+ A rating of 2 suggests high similarity, meaning the candidate answer is largely correct.
15
+ Lastly, a rating of 3 indicates complete similarity, which means the candidate answer is entirely correct.
16
+ Your response should be a single integer from 0, 1, 2, or 3.
17
+ """
18
+
19
+ MMV_DIMENSIONS = {
20
+ 'CP': ['Video Topic', 'Video Emotion', 'Video Scene', 'Video Style'],
21
+ 'FP-S': ['OCR', 'Object Recognition', 'Attribute Recognition', 'Event Recognition', 'Human Motion', 'Counting'],
22
+ 'FP-C': ['Spatial Relationship', 'Human-object Interaction', 'Human Interaction'],
23
+ 'HL': ['Hallucination'],
24
+ 'LR': ['Structuralized Image-Text Understanding', 'Mathematical Calculation'],
25
+ 'AR': ['Physical Property', 'Function Reasoning', 'Identity Reasoning'],
26
+ 'RR': ['Natural Relation', 'Physical Relation', 'Social Relation'],
27
+ 'CSR': ['Common Sense Reasoning'],
28
+ 'TR': ['Counterfactual Reasoning', 'Causal Reasoning', 'Future Prediction'],
29
+ }
30
+ L3_DIMS = []
31
+ for k, v in MMV_DIMENSIONS.items():
32
+ L3_DIMS.extend(v)
33
+
34
+ MMV_DIMENSIONS['Perception'] = []
35
+ MMV_DIMENSIONS['Reasoning'] = []
36
+ MMV_DIMENSIONS['Overall'] = []
37
+ for k in ['CP', 'FP-C', 'FP-S', 'HL']:
38
+ MMV_DIMENSIONS['Perception'].extend(MMV_DIMENSIONS[k])
39
+ MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
40
+ for k in ['LR', 'AR', 'RR', 'CSR', 'TR']:
41
+ MMV_DIMENSIONS['Reasoning'].extend(MMV_DIMENSIONS[k])
42
+ MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
43
+
44
+
45
+ def get_dimension_rating(data_path):
46
+ data = load(data_path)
47
+ coarse_rating = {k: [] for k in MMV_DIMENSIONS}
48
+ fine_rating = {k: [] for k in L3_DIMS}
49
+
50
+ for i in range(len(data)):
51
+ cate = data.iloc[i]['dimensions']
52
+ cates = eval(cate)
53
+
54
+ for c in cates:
55
+ fine_rating[c].append(data.iloc[i]['score'])
56
+
57
+ for d in MMV_DIMENSIONS:
58
+ if np.any([x in MMV_DIMENSIONS[d] for x in cates]):
59
+ coarse_rating[d].append(data.iloc[i]['score'])
60
+
61
+ coarse_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in coarse_rating.items()}
62
+ coarse_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in coarse_rating.items()}
63
+ fine_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in fine_rating.items()}
64
+ fine_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in fine_rating.items()}
65
+ return dict(coarse_all=coarse_all, coarse_valid=coarse_valid, fine_all=fine_all, fine_valid=fine_valid)
66
+
67
+
68
+ def build_prompt(item):
69
+ tmpl = 'Question: {}\nGroundtruth answer: {}\nCandidate answer: {}\nYour response: '
70
+ return tmpl.format(item['question'], item['answer'], item['prediction'])
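
Note how `get_dimension_rating` treats negative scores as "judge failed": the `*_all` means clamp them to 0, while the `*_valid` means drop them entirely. A quick numeric check of the two conventions:

import numpy as np

scores = [3, 2, -1, 0]                               # -1 marks a failed judgement
all_mean = np.mean([max(x, 0) for x in scores])      # failures count as 0 -> 1.25
valid_mean = np.mean([x for x in scores if x >= 0])  # failures dropped -> ~1.67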
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py ADDED
@@ -0,0 +1,335 @@
1
+ import uuid
2
+ from functools import partial
3
+ from .image_base import ImageBaseDataset
4
+ from ..smp import *
5
+
6
+ rouge = None
7
+ nlp_en = None
8
+ nlp_zh = None
9
+ nlp = None
10
+
11
+
12
+ def initialize():
13
+ import evaluate
14
+ import spacy
15
+
16
+ global rouge, nlp_en, nlp_zh, nlp
17
+
18
+ try:
19
+ rouge = evaluate.load('rouge', experiment_id=str(uuid.uuid4()))
20
+ except Exception as e:
21
+ logging.critical(f'{type(e)}: {e}')
22
+ logging.critical('Please first `pip install rouge_score`.')
23
+
24
+ try:
25
+ nlp_en = spacy.load('en_core_web_sm')
26
+ except Exception as e:
27
+ logging.warning(f'{type(e)}: {e}')
28
+ logging.warning('Will automatically download en_core_web_sm via spacy.')
29
+ spacy.cli.download('en_core_web_sm')
30
+ nlp_en = spacy.load('en_core_web_sm')
31
+
32
+ try:
33
+ nlp_zh = spacy.load('zh_core_web_sm')
34
+ except Exception as e:
35
+ logging.warning(f'{type(e)}: {e}')
36
+ logging.warning('Will automatically download zh_core_web_sm via spacy.')
37
+ spacy.cli.download('zh_core_web_sm')
38
+ nlp_zh = spacy.load('zh_core_web_sm')
39
+
40
+ nlp = {'en': nlp_en, 'zh': nlp_zh}
41
+
42
+
43
+ def rough_filter(answer_text):
44
+ if "I can't" in answer_text:
45
+ return False
46
+ elif 'I cannot' in answer_text:
47
+ return False
48
+ elif 'sorry' in answer_text.lower():
49
+ return False
50
+ if '无法' in answer_text:
51
+ return False
52
+ elif '抱歉' in answer_text:
53
+ return False
54
+ else:
55
+ return True
56
+
57
+
58
+ def zero_template(crossed_text):
59
+ return {
60
+ 'crossed_text': crossed_text,
61
+ 'max_sim_val': 0,
62
+ 'max_sim_string': '',
63
+ 'precision': 0,
64
+ 'recall': 0,
65
+ 'f1': 0,
66
+ 'jaccard': 0,
67
+ 'rouge1': 0,
68
+ 'exact_match': 0,
69
+ }
70
+
71
+
72
+ def tokenize(text, language):
73
+ """
74
+ Tokenize the text and return the tokens.
75
+
76
+ Parameters:
77
+ text (str): The text to tokenize.
78
+ language (str): The language of the text.
79
+
80
+ Returns:
81
+ list: The list of tokens.
82
+ """
83
+ assert language in ['en', 'zh']
84
+ nlp_language = nlp[language]
85
+ processed_text = nlp_language(text)
86
+ return [token.text for token in processed_text]
87
+
88
+
89
+ def find_best_match(needle, hay, language, rouge):
90
+ """
91
+ Finds the best matching n-gram in the haystack for the given needle.
92
+
93
+ Parameters:
94
+ needle (str): The string to find.
95
+ hay (str): The text to search within.
96
+
97
+ Returns:
98
+ tuple: The highest similarity value and the best matching string.
99
+ """
100
+ assert language in ['en', 'zh']
101
+ from nltk.util import ngrams
102
+ from difflib import SequenceMatcher as SM
103
+
104
+ tokens_hay = tokenize(hay, language)
105
+ tokens_needle = tokenize(needle, language)
106
+
107
+ splitter = '' if language == 'zh' else ' '
108
+ ngrams_ = ngrams(tokens_hay, len(tokens_needle))
109
+ max_sim_val = 0
110
+ max_sim_string = ''
111
+ max_sim_ngram = []
112
+ tokens_needle_set = set(tokens_needle)
113
+ ngrams_hasjoint = [
114
+ ngram
115
+ for ngram in ngrams_
116
+ if not set(ngram).isdisjoint(tokens_needle_set)
117
+ ]
118
+
119
+ for ngram in ngrams_hasjoint:
120
+ hay_ngram = splitter.join(ngram)
121
+ similarity = SM(None, hay_ngram, needle).ratio()
122
+ if similarity > max_sim_val:
123
+ max_sim_val = similarity
124
+ max_sim_string = hay_ngram
125
+ max_sim_ngram = ngram
126
+
127
+ # Evaluate
128
+ if len(max_sim_ngram) == 0:
129
+ return {
130
+ 'crossed_text': needle,
131
+ 'max_sim_val': 0,
132
+ 'max_sim_string': '',
133
+ 'precision': 0,
134
+ 'recall': 0,
135
+ 'f1': 0,
136
+ 'jaccard': 0,
137
+ 'rouge1': 0,
138
+ 'exact_match': 0,
139
+ }
140
+ pred_set = set(max_sim_ngram)
141
+ ref_set = set(tokens_needle)
142
+ correct_tokens = pred_set.intersection(ref_set)
143
+ len_correct_tokens = len(correct_tokens)
144
+
145
+ precision = len_correct_tokens / len(pred_set)
146
+ recall = len_correct_tokens / len(ref_set)
147
+ if (precision + recall) == 0:
148
+ f1 = 0
149
+ else:
150
+ f1 = 2 * precision * recall / (precision + recall)
151
+ union = pred_set.union(ref_set)
152
+ jaccard = len_correct_tokens / len(union) if len(union) > 0 else 0
153
+ rouge_1 = rouge.compute(
154
+ predictions=[max_sim_string],
155
+ references=[needle],
156
+ tokenizer=partial(tokenize, language=language),
157
+ rouge_types=['rouge1'],
158
+ )['rouge1']
159
+ exact_match = float(list(max_sim_ngram) == list(tokens_needle))
160
+ out = {
161
+ 'crossed_text': needle,
162
+ 'max_sim_string': max_sim_string,
163
+ 'max_sim_val': max_sim_val,
164
+ 'precision': precision,
165
+ 'recall': recall,
166
+ 'f1': f1,
167
+ 'jaccard': jaccard,
168
+ 'rouge1': rouge_1,
169
+ 'exact_match': exact_match,
170
+ }
171
+ return out
172
+
173
+
174
+ def process_match_single_new(
175
+ image_id, prediction, answer, language, progress
176
+ ):
177
+ """
178
+ process the inference results for a single image and calculate the metrics
179
+
180
+ Parameters:
181
+ image_id (int): The image id (question id).
182
+ prediction (str): The prediction text.
183
+ answer (Union[str, List[str]]): The answer text, or a list of answer texts. The masked n-grams in the image.
184
+ language (str): The language of the text. Can be "en" or "zh".
185
+ rouge (rouge): The rouge metric object.
186
+ progress (multiprocessing.Queue): The progress queue.
187
+
188
+ Returns:
189
+ tuple: The image id (question_id, int) and the result per id (dict of dict of dict).
190
+ """
191
+ result_per_id = {image_id: {}}
192
+ if isinstance(answer, str):
193
+ answer = eval(answer)
194
+ assert isinstance(answer, list)
195
+ result = prediction.split('Assistant: ')[-1]
196
+ for i, crossed_text in enumerate(answer):
197
+ if rough_filter(result):
198
+ find_best_match_result = find_best_match(
199
+ crossed_text, result, language, rouge
200
+ )
201
+ if i == 0:
202
+ result_per_id[image_id] = {str(i): find_best_match_result}
203
+ else:
204
+ result_per_id[image_id][str(i)] = find_best_match_result
205
+ else:
206
+ if i == 0:
207
+ result_per_id[image_id] = {str(i): zero_template(crossed_text)}
208
+ else:
209
+ result_per_id[image_id][str(i)] = zero_template(crossed_text)
210
+ progress.put(1)
211
+ return image_id, result_per_id
212
+
213
+
214
+ class VCRDataset(ImageBaseDataset):
215
+ TYPE = 'VQA'
216
+
217
+ URL_PREFIX = 'https://huggingface.co/datasets/vcr-org'
218
+
219
+ DATASET_URL = {
220
+ 'VCR_EN_EASY_500': f'{URL_PREFIX}/VCR-wiki-en-easy-test-500/resolve/main/VCR-wiki-en-easy-test-500.tsv',
221
+ 'VCR_EN_EASY_100': f'{URL_PREFIX}/VCR-wiki-en-easy-test-100/resolve/main/VCR-wiki-en-easy-test-100.tsv',
222
+ 'VCR_EN_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-en-easy-test/resolve/main/VCR-wiki-en-easy-test.tsv',
223
+ 'VCR_EN_HARD_500': f'{URL_PREFIX}/VCR-wiki-en-hard-test-500/resolve/main/VCR-wiki-en-hard-test-500.tsv',
224
+ 'VCR_EN_HARD_100': f'{URL_PREFIX}/VCR-wiki-en-hard-test-100/resolve/main/VCR-wiki-en-hard-test-100.tsv',
225
+ 'VCR_EN_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-en-hard-test/resolve/main/VCR-wiki-en-hard-test.tsv',
226
+ 'VCR_ZH_EASY_500': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-500/resolve/main/VCR-wiki-zh-easy-test-500.tsv',
227
+ 'VCR_ZH_EASY_100': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-100/resolve/main/VCR-wiki-zh-easy-test-100.tsv',
228
+ 'VCR_ZH_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-zh-easy-test/resolve/main/VCR-wiki-zh-easy-test.tsv',
229
+ 'VCR_ZH_HARD_500': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-500/resolve/main/VCR-wiki-zh-hard-test-500.tsv',
230
+ 'VCR_ZH_HARD_100': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-100/resolve/main/VCR-wiki-zh-hard-test-100.tsv',
231
+ 'VCR_ZH_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-zh-hard-test/resolve/main/VCR-wiki-zh-hard-test.tsv',
232
+ }
233
+
234
+ DATASET_MD5 = {
235
+ 'VCR_EN_EASY_500': 'fd9258db52f8685dc710619a0ea0a261',
236
+ 'VCR_EN_EASY_100': '9df5d7266683458621ecbe122beb72f0',
237
+ 'VCR_EN_EASY_ALL': '8a9b96885f251d1c85f42f84073327f1',
238
+ 'VCR_EN_HARD_500': '0a22a85080b6a1f52b1f95e302d43df4',
239
+ 'VCR_EN_HARD_100': '1b20f5cbcbeae0b0bec77f7a36143958',
240
+ 'VCR_EN_HARD_ALL': '2d8b8b1ee0eba0e0b618fd3aa7d9710e',
241
+ 'VCR_ZH_EASY_500': 'beca5fd54176adf44cf94bd9b50cf048',
242
+ 'VCR_ZH_EASY_100': '4a86a5678a79844d6d22ab0629c51cd5',
243
+ 'VCR_ZH_EASY_ALL': '5050fe7f0027ad2068fd4c7f220edaea',
244
+ 'VCR_ZH_HARD_500': '617e3360f75c54455625cb0a8da5c1e7',
245
+ 'VCR_ZH_HARD_100': 'b0e38c85f5d5e63894a3b881c372a62b',
246
+ 'VCR_ZH_HARD_ALL': '54bbfef448206518b03127ef8b61404c',
247
+ }
248
+
249
+ def __init__(self, dataset='VCR_EN_EASY_500', skip_noimg=True):
250
+ super().__init__(dataset, skip_noimg)
251
+
252
+ initialize()
253
+ self.language = 'en' if 'EN' in dataset else 'zh'
254
+ self.difficulty = 'easy' if 'EASY' in dataset else 'hard'
255
+
256
+ # def build_prompt(self, line):
257
+ # msgs = super().build_prompt(line)
258
+ # assert msgs[-1]['type'] == 'text'
259
+ # if self.language == 'zh':
260
+ # msgs[-1]['value'] += '图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。'
261
+ # else:
262
+ # msgs[-1]['value'] += ('What is the covered texts in the image? '
263
+ # 'Please restore the covered texts without outputting the explanations.')
264
+ # return msgs
265
+
266
+ def evaluate(self, eval_file, **judge_kwargs):
267
+ import multiprocessing
268
+
269
+ vcr_score_list = {'Exact_Match': [], 'Jaccard': []}
270
+ vcr_score = {'Exact_Match': 0, 'Jaccard': 0}
271
+ logger = get_logger('Evaluation')
272
+ data = load(eval_file)
273
+
274
+ lt = len(data)
275
+ lines = [data.iloc[i] for i in range(lt)]
276
+
277
+ pool = multiprocessing.Pool()
278
+ manager = multiprocessing.Manager()
279
+ progress_queue = manager.Queue()
280
+ results = []
281
+
282
+ overall_results = {str(image_id): {} for image_id in range(len(lines))}
283
+
284
+ for instance_id, instance in enumerate(lines):
285
+ results.append(
286
+ pool.apply_async(
287
+ process_match_single_new,
288
+ args=(
289
+ str(instance_id),
290
+ instance['prediction'],
291
+ instance['answer'],
292
+ self.language,
293
+ progress_queue,
294
+ ),
295
+ )
296
+ )
297
+ pool.close()
298
+
299
+ # Display progress bar
300
+ for _ in tqdm(range(len(results))):
301
+ progress_queue.get()
302
+
303
+ pool.join()
304
+
305
+ # Merging results into overall_result
306
+ for result in results:
307
+ image_id, result_per_id = result.get()
308
+ overall_results[str(image_id)].update(result_per_id[image_id])
309
+ for blank_id_str in result_per_id[image_id].keys():
310
+ vcr_score_list['Exact_Match'].append(
311
+ result_per_id[image_id][blank_id_str]['exact_match']
312
+ )
313
+ vcr_score_list['Jaccard'].append(
314
+ result_per_id[image_id][blank_id_str]['jaccard']
315
+ )
316
+ vcr_score['Exact_Match'] = np.mean(vcr_score_list['Exact_Match'])
317
+ vcr_score['Jaccard'] = np.mean(vcr_score_list['Jaccard'])
318
+ results_out = {
319
+ k: v for i in range(len(results)) for k, v in results[i].get()[1].items()
320
+ }
321
+ results_with_metrics = {
322
+ 'Exact_Match': vcr_score['Exact_Match'],
323
+ 'Jaccard': vcr_score['Jaccard'],
324
+ 'Predictions': results_out,
325
+ }
326
+ score_pth = eval_file.replace(
327
+ '.xlsx', f'{self.language}_{self.difficulty}_score.json'
328
+ )
329
+ dump(results_with_metrics, score_pth)
330
+ logger.info(
331
+ f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}'
332
+ )
333
+ logger.info('Score: ')
334
+ for key, value in vcr_score.items():
335
+ logger.info('{}:{}'.format(key, value))
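
The core of `find_best_match` is plain set arithmetic over tokens; spaCy and ROUGE only supply the tokenization and an extra rouge1 signal. Stripped down to a whitespace tokenizer, the metric definitions reduce to this self-contained sketch:

# Token-set metrics over a best-matching n-gram vs. the reference span.
pred_set = set('sour apple candy'.split())      # tokens of the best-matching n-gram
ref_set = set('sour apple candy bar'.split())   # tokens of the crossed-out text

correct = len(pred_set & ref_set)                    # 3
precision = correct / len(pred_set)                  # 1.0
recall = correct / len(ref_set)                      # 0.75
f1 = 2 * precision * recall / (precision + recall)   # ~0.857
jaccard = correct / len(pred_set | ref_set)          # 0.75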
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py ADDED
@@ -0,0 +1,135 @@
1
+ from abc import abstractmethod
2
+ from ..smp import *
3
+
4
+
5
+ class VideoBaseDataset:
6
+
7
+ MODALITY = 'VIDEO'
8
+
9
+ def __init__(self,
10
+ dataset='MMBench-Video',
11
+ pack=False,
12
+ nframe=0,
13
+ fps=-1):
14
+ try:
15
+ import decord
16
+ except Exception as e:
17
+ logging.critical(f'{type(e)}: {e}')
18
+ logging.critical('Please install decord via `pip install decord`.')
19
+
20
+ self.dataset_name = dataset
21
+ ret = self.prepare_dataset(dataset)
22
+ assert ret is not None
23
+ lmu_root = LMUDataRoot()
24
+ self.frame_root = osp.join(lmu_root, 'images', dataset)
25
+ os.makedirs(self.frame_root, exist_ok=True)
26
+ self.frame_tmpl = 'frame-{}-of-{}.jpg'
27
+ self.frame_tmpl_fps = 'frame-{}-of-{}-{}fps.jpg'
28
+
29
+ self.data_root = ret['root']
30
+ self.data_file = ret['data_file']
31
+ self.data = load(self.data_file)
32
+
33
+ assert 'question' in self.data and 'video' in self.data
34
+ videos = list(set(self.data['video']))
35
+ videos.sort()
36
+ self.videos = videos
37
+ self.pack = pack
38
+ self.nframe = nframe
39
+ self.fps = fps
40
+ if self.fps > 0 and self.nframe > 0:
41
+ raise ValueError('fps and nframe should not be set at the same time')
42
+ if self.fps <= 0 and self.nframe <= 0:
43
+ raise ValueError('at least one of fps and nframe should be set to a valid value')
44
+
45
+ def __len__(self):
46
+ return len(self.videos) if self.pack else len(self.data)
47
+
48
+ def __getitem__(self, idx):
49
+ if self.pack:
50
+ assert idx < len(self.videos)
51
+ sub_data = self.data[self.data['video'] == self.videos[idx]]
52
+ return sub_data
53
+ else:
54
+ assert idx < len(self.data)
55
+ return dict(self.data.iloc[idx])
56
+
57
+ def frame_paths(self, video):
58
+ frame_root = osp.join(self.frame_root, video)
59
+ os.makedirs(frame_root, exist_ok=True)
60
+ return [osp.join(frame_root, self.frame_tmpl.format(i, self.nframe)) for i in range(1, self.nframe + 1)]
61
+
62
+ def frame_paths_fps(self, video, num_frames):
63
+ frame_root = osp.join(self.frame_root, video)
64
+ os.makedirs(frame_root, exist_ok=True)
65
+ return [osp.join(frame_root,
66
+ self.frame_tmpl_fps.format(i, num_frames, self.fps)) for i in range(1, num_frames + 1)]
67
+
68
+ def save_video_frames(self, video):
69
+ import decord  # local import; __init__ only checks that decord is importable
+ if self.fps > 0:
70
+ vid_path = osp.join(self.data_root, video + '.mp4')
71
+ vid = decord.VideoReader(vid_path)
72
+
73
+ # Compute the total frame count and total duration of the video
74
+ total_frames = len(vid)
75
+ video_fps = vid.get_avg_fps()
76
+ total_duration = total_frames / video_fps
77
+
78
+ # Compute the total number of frames to extract
79
+ required_frames = int(total_duration * self.fps)
80
+
81
+ # Compute the sampling interval between extracted frames
82
+ step_size = video_fps / self.fps
83
+
84
+ # Compute the indices of the frames to extract
85
+ indices = [int(i * step_size) for i in range(required_frames)]
86
+
87
+ # Extract the frames and save them to disk
88
+ frame_paths = self.frame_paths_fps(video, len(indices))
89
+ flag = np.all([osp.exists(p) for p in frame_paths])
90
+ if flag:
91
+ return frame_paths
92
+
93
+ images = [vid[i].asnumpy() for i in indices]
94
+ images = [Image.fromarray(arr) for arr in images]
95
+ for im, pth in zip(images, frame_paths):
96
+ if not osp.exists(pth):
97
+ im.save(pth)
98
+ return frame_paths
99
+
100
+ else:
101
+ frame_paths = self.frame_paths(video)
102
+ flag = np.all([osp.exists(p) for p in frame_paths])
103
+ if flag:
104
+ return frame_paths
105
+ vid_path = osp.join(self.data_root, video + '.mp4')
106
+ vid = decord.VideoReader(vid_path)
107
+ step_size = len(vid) / (self.nframe + 1)
108
+ indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
109
+ images = [vid[i].asnumpy() for i in indices]
110
+ images = [Image.fromarray(arr) for arr in images]
111
+ for im, pth in zip(images, frame_paths):
112
+ if not osp.exists(pth):
113
+ im.save(pth)
114
+ return frame_paths
115
+
116
+ # Return a list of dataset names that are supported by this class, can override
117
+ @classmethod
118
+ def supported_datasets(cls):
119
+ return ['MMBench-Video', 'Video-MME', 'MVBench', 'MVBench_MP4', 'LongVideoBench']
120
+
121
+ # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
122
+ @abstractmethod
123
+ def evaluate(self, eval_file, **judge_kwargs):
124
+ pass
125
+
126
+ @abstractmethod
127
+ def build_prompt(self, idx):
128
+ pass
129
+
130
+ @abstractmethod
131
+ def prepare_dataset(self, dataset):
132
+ # The prepare_dataset function should return a dictionary containing:
133
+ # `root` (directory that containing video files)
134
+ # `data_file` (the TSV dataset file)
135
+ pass
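
The two sampling modes in `save_video_frames` differ only in how frame indices are computed: `nframe` spreads a fixed count evenly across the clip (skipping the endpoints), while `fps` resamples at a target rate. The index arithmetic in isolation, with made-up video properties:

total_frames, video_fps = 300, 30.0   # a hypothetical 10-second clip

# nframe mode: 8 evenly spaced frames, excluding the very first and last positions.
nframe = 8
step = total_frames / (nframe + 1)
nframe_indices = [int(i * step) for i in range(1, nframe + 1)]   # [33, 66, ..., 266]

# fps mode: resample at 1 frame per second.
target_fps = 1.0
duration = total_frames / video_fps        # 10.0 seconds
required = int(duration * target_fps)      # 10 frames
step = video_fps / target_fps              # 30.0
fps_indices = [int(i * step) for i in range(required)]           # [0, 30, ..., 270]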
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_concat_dataset.py ADDED
@@ -0,0 +1,85 @@
1
+ from ..smp import *
2
+ from .video_base import VideoBaseDataset
3
+
4
+
5
+ class ConcatVideoDataset(VideoBaseDataset):
6
+ # This dataset takes multiple dataset names as input and aggregate them into a single dataset.
7
+ # Each single dataset should not have a field named `SUB_DATASET`
8
+
9
+ DATASET_SETS = {}
10
+
11
+ def __init__(self, dataset, **kwargs):
12
+ from . import build_dataset
13
+ datasets = self.DATASET_SETS[dataset]
14
+ self.dataset_map = {}
15
+ # The name of the compilation
16
+ self.dataset_name = dataset
17
+ self.datasets = datasets
18
+ self.nframe = kwargs.get('nframe', 0)
19
+ self.fps = kwargs.get('fps', -1)
20
+ for dname in datasets:
21
+ dataset = build_dataset(dname, **kwargs)
22
+ assert dataset is not None, dataset
23
+ self.dataset_map[dname] = dataset
24
+ TYPES = [x.TYPE for x in self.dataset_map.values()]
25
+ MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
26
+ # assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
27
+ assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
28
+ self.TYPE = TYPES
29
+ self.MODALITY = MODALITIES[0]
30
+ data_all = []
31
+ for dname in datasets:
32
+ data = self.dataset_map[dname].data
33
+ data['SUB_DATASET'] = [dname] * len(data)
34
+ data_all.append(data)
35
+
36
+ data = pd.concat(data_all)
37
+ data['original_index'] = data.pop('index')
38
+ data['index'] = np.arange(len(data))
39
+ self.data = data
40
+
41
+ def build_prompt(self, line, video_llm):
42
+ if isinstance(line, int):
43
+ line = self.data.iloc[line]
44
+ idx = line['original_index']
45
+ dname = line['SUB_DATASET']
46
+ org_data = self.dataset_map[dname].data
47
+ org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
48
+ return self.dataset_map[dname].build_prompt(org_line, video_llm)
49
+
50
+ def dump_image(self, line):
51
+ # Assert all images are pre-dumped
52
+ assert 'image' not in line
53
+ assert 'image_path' in line
54
+ tgt_path = toliststr(line['image_path'])
55
+ return tgt_path
56
+
57
+ @classmethod
58
+ def supported_datasets(cls):
59
+ return [] # list(cls.DATASET_SETS)
60
+
61
+ def evaluate(self, eval_file, **judge_kwargs):
62
+ suffix = eval_file.split('.')[-1]
63
+ # First, split the eval_file by dataset
64
+ data_all = load(eval_file)
65
+ for dname in self.datasets:
66
+ tgt = eval_file.replace(self.dataset_name, dname)
67
+ data_sub = data_all[data_all['SUB_DATASET'] == dname]
68
+ data_sub.pop('index')
69
+ data_sub['index'] = data_sub.pop('original_index')
70
+ data_sub.pop('SUB_DATASET')
71
+ dump(data_sub, tgt)
72
+ # Then, evaluate each dataset separately
73
+ results_all = {}
74
+ for dname in self.datasets:
75
+ tgt = eval_file.replace(self.dataset_name, dname)
76
+ res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
77
+ results_all.update(res)
78
+
79
+ result = pd.DataFrame(results_all, index=['success', 'overall'])
80
+ result = result.T
81
+ for idx, item in result.iterrows():
82
+ result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1)
83
+ score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
84
+ dump(result, score_file)
85
+ return result
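
`ConcatVideoDataset` relies on a reversible re-indexing trick: each sub-dataset's own `index` is stashed in `original_index`, a fresh global `index` is assigned, and `evaluate` later inverts the mapping so each sub-dataset receives a file that looks exactly like its own output. The round trip in miniature, on toy frames for illustration only:

import numpy as np
import pandas as pd

a = pd.DataFrame({'index': [0, 1], 'video': ['a0', 'a1'], 'SUB_DATASET': 'dsA'})
b = pd.DataFrame({'index': [0, 1], 'video': ['b0', 'b1'], 'SUB_DATASET': 'dsB'})

merged = pd.concat([a, b])
merged['original_index'] = merged.pop('index')
merged['index'] = np.arange(len(merged))             # global ids 0..3

# Inverting the mapping for one sub-dataset, as evaluate() does:
sub = merged[merged['SUB_DATASET'] == 'dsB'].copy()
sub.pop('index')
sub['index'] = sub.pop('original_index')             # back to local ids 0..1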
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/video_dataset_config.py ADDED
@@ -0,0 +1,103 @@
1
+ from vlmeval.dataset import *
2
+ from functools import partial
3
+
4
+ mmbench_video_dataset = {
5
+ 'MMBench_Video_8frame_nopack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=8, pack=False),
6
+ 'MMBench_Video_8frame_pack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=8, pack=True),
7
+ 'MMBench_Video_16frame_nopack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=16, pack=False),
8
+ 'MMBench_Video_1fps_nopack': partial(MMBenchVideo, dataset='MMBench-Video', fps=1.0, pack=False),
9
+ 'MMBench_Video_1fps_pack': partial(MMBenchVideo, dataset='MMBench-Video', fps=1.0, pack=True)
10
+ }
11
+
12
+ mvbench_dataset = {
13
+ 'MVBench_8frame': partial(MVBench, dataset='MVBench', nframe=8),
14
+ # MVBench not support fps, but MVBench_MP4 does
15
+ 'MVBench_MP4_8frame': partial(MVBench_MP4, dataset='MVBench_MP4', nframe=8),
16
+ 'MVBench_MP4_1fps': partial(MVBench_MP4, dataset='MVBench_MP4', fps=1.0),
17
+ }
18
+
19
+ videomme_dataset = {
20
+ 'Video-MME_8frame': partial(VideoMME, dataset='Video-MME', nframe=8),
21
+ 'Video-MME_8frame_subs': partial(VideoMME, dataset='Video-MME', nframe=8, use_subtitle=True),
22
+ 'Video-MME_1fps': partial(VideoMME, dataset='Video-MME', fps=1.0),
23
+ 'Video-MME_0.5fps': partial(VideoMME, dataset='Video-MME', fps=0.5),
24
+ 'Video-MME_0.5fps_subs': partial(VideoMME, dataset='Video-MME', fps=0.5, use_subtitle=True),
25
+ }
26
+
27
+ longvideobench_dataset = {
28
+ 'LongVideoBench_8frame': partial(LongVideoBench, dataset='LongVideoBench', nframe=8),
29
+ 'LongVideoBench_8frame_subs': partial(LongVideoBench, dataset='LongVideoBench', nframe=8, use_subtitle=True),
30
+ 'LongVideoBench_1fps': partial(LongVideoBench, dataset='LongVideoBench', fps=1.0),
31
+ 'LongVideoBench_0.5fps': partial(LongVideoBench, dataset='LongVideoBench', fps=0.5),
32
+ 'LongVideoBench_0.5fps_subs': partial(LongVideoBench, dataset='LongVideoBench', fps=0.5, use_subtitle=True)
33
+ }
34
+
35
+ mlvu_dataset = {
36
+ 'MLVU_8frame': partial(MLVU, dataset='MLVU', nframe=8),
37
+ 'MLVU_1fps': partial(MLVU, dataset='MLVU', fps=1.0)
38
+ }
39
+
40
+ tempcompass_dataset = {
41
+ 'TempCompass_8frame': partial(TempCompass, dataset='TempCompass', nframe=8),
42
+ 'TempCompass_1fps': partial(TempCompass, dataset='TempCompass', fps=1.0),
43
+ 'TempCompass_0.5fps': partial(TempCompass, dataset='TempCompass', fps=0.5)
44
+ }
45
+
46
+ # To reproduce the experimental results in the CG-Bench paper,
47
+ # use_subtitle, use_subtitle_time and use_frame_time all need to be set to True.
48
+ # When measuring clue-related results, if more than 32 frames are requested,
49
+ # the frame budget is capped at 32. A reproduction-style entry is sketched after the dict below.
50
+ cgbench_dataset = {
51
+ 'CGBench_MCQ_Grounding_Mini_8frame_subs_subt': partial(
52
+ CGBench_MCQ_Grounding_Mini,
53
+ dataset='CG-Bench_MCQ_Grounding_Mini',
54
+ nframe=8,
55
+ use_subtitle=True,
56
+ use_subtitle_time=True
57
+ ),
58
+ 'CGBench_OpenEnded_Mini_8frame_subs_subt_ft': partial(
59
+ CGBench_OpenEnded_Mini,
60
+ dataset='CG-Bench_OpenEnded_Mini',
61
+ nframe=8,
62
+ use_subtitle=True,
63
+ use_subtitle_time=True,
64
+ use_frame_time=True
65
+ ),
66
+ 'CGBench_MCQ_Grounding_32frame_subs': partial(
67
+ CGBench_MCQ_Grounding,
68
+ dataset='CG-Bench_MCQ_Grounding',
69
+ nframe=32,
70
+ use_subtitle=True
71
+ ),
72
+ 'CGBench_OpenEnded_8frame': partial(
73
+ CGBench_OpenEnded,
74
+ dataset='CG-Bench_OpenEnded',
75
+ nframe=8
76
+ ),
77
+ 'CGBench_MCQ_Grounding_16frame_subs_subt_ft': partial(
78
+ CGBench_MCQ_Grounding,
79
+ dataset='CG-Bench_MCQ_Grounding',
80
+ nframe=16,
81
+ use_subtitle=True,
82
+ use_subtitle_time=True,
83
+ use_frame_time=True
84
+ ),
85
+ 'CGBench_OpenEnded_16frame_subs_subt_ft': partial(
86
+ CGBench_OpenEnded,
87
+ dataset='CG-Bench_OpenEnded',
88
+ nframe=16,
89
+ use_subtitle=True,
90
+ use_subtitle_time=True,
91
+ use_frame_time=True
92
+ )
93
+ }
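
Following the note above the dict, a paper-faithful CG-Bench entry would enable all three flags while keeping the frame budget at 32 or below. The key below is illustrative, not one of the shipped names:

# Hypothetical entry reproducing the CG-Bench paper setting (<= 32 frames, all flags on).
cgbench_dataset['CGBench_MCQ_Grounding_32frame_subs_subt_ft'] = partial(
    CGBench_MCQ_Grounding,
    dataset='CG-Bench_MCQ_Grounding',
    nframe=32,
    use_subtitle=True,
    use_subtitle_time=True,
    use_frame_time=True,
)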
94
+
95
+ supported_video_datasets = {}
96
+
97
+ dataset_groups = [
98
+ mmbench_video_dataset, mvbench_dataset, videomme_dataset, longvideobench_dataset,
99
+ mlvu_dataset, tempcompass_dataset, cgbench_dataset
100
+ ]
101
+
102
+ for grp in dataset_groups:
103
+ supported_video_datasets.update(grp)
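
Each registry value is a zero-argument constructor (a `functools.partial` with the dataset options pre-bound), so instantiating a configured dataset is a single call. A usage sketch; it assumes the underlying dataset files are available locally or downloadable:

dataset = supported_video_datasets['Video-MME_8frame']()
print(len(dataset))   # number of questions
sample = dataset[0]   # dict with 'question', 'video', ... fields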
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py ADDED
@@ -0,0 +1,283 @@
1
+ from huggingface_hub import snapshot_download
2
+ from ..smp import *
3
+ from .video_base import VideoBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+
6
+ FAIL_MSG = 'Failed to obtain answer via API.'
7
+
8
+
9
+ def unwrap_hf_pkl(pth, suffix='.mp4'):
10
+ base_dir = os.path.join(pth, 'video_pkl/')
11
+ target_dir = os.path.join(pth, 'video/')
12
+ pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
13
+ pickle_files.sort()
14
+
15
+ if not os.path.exists(target_dir):
16
+ os.makedirs(target_dir, exist_ok=True)
17
+ for pickle_file in pickle_files:
18
+ with open(pickle_file, 'rb') as file:
19
+ video_data = pickle.load(file)
20
+ # For each video file in the pickle file, write its contents to a new mp4 file
21
+ for video_name, video_content in video_data.items():
22
+ output_path = os.path.join(target_dir, f'{video_name}{suffix}')
23
+ with open(output_path, 'wb') as output_file:
24
+ output_file.write(video_content)
25
+ print('The video files have been restored from the pickle files.')
26
+ else:
27
+ print('The video file already exists.')
28
+
29
+
30
+ class VideoMME(VideoBaseDataset):
31
+
32
+ MD5 = '85bdd91f9b29a99354c23b97ab7c113c'
33
+ SYS = ''
34
+
35
+ FRAMES_TMPL_NOSUB = """
36
+ These are the frames of a video. \
37
+ Select the best answer to the following multiple-choice question based on the video. \
38
+ Respond with only the letter (A, B, C, or D) of the correct option.
39
+ """
40
+
41
+ FRAMES_TMPL_SUB = """
42
+ These are the frames of a video. \
43
+ This video's subtitles are listed below:
44
+ {}
45
+ Select the best answer to the following multiple-choice question based on the video. \
46
+ Respond with only the letter (A, B, C, or D) of the correct option.
47
+ """
48
+
49
+ TYPE = 'Video-MCQ'
50
+
51
+ def __init__(self, dataset='Video-MME', use_subtitle=False, nframe=0, fps=-1):
52
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
53
+ self.use_subtitle = use_subtitle
54
+ self.dataset_name = dataset
55
+
56
+ @classmethod
57
+ def supported_datasets(cls):
58
+ return ['Video-MME']
59
+
60
+ def prepare_dataset(self, dataset_name='Video-MME', repo_id='lmms-lab/Video-MME'):
61
+
62
+ def check_integrity(pth):
63
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
64
+
65
+ if not os.path.exists(data_file):
66
+ return False
67
+
68
+ if md5(data_file) != self.MD5:
69
+ return False
70
+ data = load(data_file)
71
+ for video_pth in data['video_path']:
72
+ if not osp.exists(osp.join(pth, video_pth)):
73
+ return False
74
+ return True
75
+
76
+ cache_path = get_cache_path(repo_id)
77
+ if cache_path is not None and check_integrity(cache_path):
78
+ dataset_path = cache_path
79
+ else:
80
+
81
+ def unzip_hf_zip(pth):
82
+ import zipfile
83
+ base_dir = pth
84
+ target_dir = os.path.join(pth, 'video/')
85
+ zip_files = [
86
+ os.path.join(base_dir, file) for file in os.listdir(base_dir)
87
+ if file.endswith('.zip') and file.startswith('video')
88
+ ]
89
+ zip_files.sort()
90
+
91
+ if not os.path.exists(target_dir):
92
+ os.makedirs(target_dir, exist_ok=True)
93
+ for zip_file in zip_files:
94
+ with zipfile.ZipFile(zip_file, 'r') as zip_ref:
95
+ for member in zip_ref.namelist():
96
+ # Check if the member is a file (not a directory)
97
+ if not member.endswith('/'):
98
+ # Extract the file to the specified directory
99
+ source = zip_ref.open(member)
100
+ target = open(os.path.join(target_dir, os.path.basename(member)), 'wb')
101
+ with source, target:
102
+ target.write(source.read())
103
+ print('The video files have been restored from the zip files.')
104
+ else:
105
+ print('The video file already exists.')
106
+
107
+ subtitle_zip_file = os.path.join(base_dir, 'subtitle.zip')
108
+ subtitle_target_dir = os.path.join(base_dir, 'subtitle')
109
+
110
+ if not os.path.exists(subtitle_target_dir):
111
+ os.makedirs(subtitle_target_dir, exist_ok=True)
112
+ with zipfile.ZipFile(subtitle_zip_file, 'r') as zip_ref:
113
+ for member in zip_ref.namelist():
114
+ # Check if the member is a file (not a directory)
115
+ if not member.endswith('/'):
116
+ # Extract the file to the specified directory
117
+ source = zip_ref.open(member)
118
+ target = open(os.path.join(subtitle_target_dir, os.path.basename(member)), 'wb')
119
+ with source, target:
120
+ target.write(source.read())
121
+ print('The subtitle files have been restored from the zip file.')
122
+ else:
123
+ print('The subtitle file already exists.')
124
+
125
+ def generate_tsv(pth):
126
+
127
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
128
+ if os.path.exists(data_file) and md5(data_file) == self.MD5:
129
+ return
130
+
131
+ data_file = pd.read_parquet(os.path.join(pth, 'videomme/test-00000-of-00001.parquet'))
132
+ data_file = data_file.assign(index=range(len(data_file)))
133
+ data_file['video'] = data_file['videoID']
134
+ data_file['video_path'] = data_file['videoID'].apply(lambda x: f'./video/{x}.mp4')
135
+ data_file['subtitle_path'] = data_file['videoID'].apply(lambda x: f'./subtitle/{x}.srt')
136
+ data_file['candidates'] = data_file['options'].apply(lambda x: x.tolist())
137
+
138
+ data_file = data_file[['index', 'video', 'video_path', 'duration', 'domain', 'candidates',
139
+ 'sub_category', 'task_type', 'subtitle_path', 'question', 'answer']]
140
+
141
+ data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)
142
+
143
+ if modelscope_flag_set():
144
+ from modelscope import dataset_snapshot_download
145
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id)
146
+ else:
147
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
148
+ unzip_hf_zip(dataset_path)
149
+ generate_tsv(dataset_path)
150
+
151
+ data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
152
+
153
+ return dict(data_file=data_file, root=dataset_path)
154
+
155
+ def save_video_frames(self, video, video_llm=False):
156
+ import decord  # local import; availability was already checked in VideoBaseDataset.__init__
157
+ vid_path = osp.join(self.data_root, 'video', video + '.mp4')
158
+ vid = decord.VideoReader(vid_path)
159
+ video_info = {
160
+ 'fps': vid.get_avg_fps(),
161
+ 'n_frames': len(vid),
162
+ }
163
+ if self.nframe > 0 and self.fps < 0:
164
+ step_size = len(vid) / (self.nframe + 1)
165
+ indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
166
+ frame_paths = self.frame_paths(video)
167
+ elif self.fps > 0:
168
+ # not constrained by num_frames, get frames by fps
169
+ total_duration = video_info['n_frames'] / video_info['fps']
170
+ required_frames = int(total_duration * self.fps)
171
+ step_size = video_info['fps'] / self.fps
172
+ indices = [int(i * step_size) for i in range(required_frames)]
173
+ frame_paths = self.frame_paths_fps(video, len(indices))
174
+
175
+ flag = np.all([osp.exists(p) for p in frame_paths])
176
+
177
+ if not flag:
178
+ images = [vid[i].asnumpy() for i in indices]
179
+ images = [Image.fromarray(arr) for arr in images]
180
+ for im, pth in zip(images, frame_paths):
181
+ if not osp.exists(pth) and not video_llm:
182
+ im.save(pth)
183
+
184
+ return frame_paths, indices, video_info
185
+
186
+ def build_prompt(self, line, video_llm):
187
+ if isinstance(line, int):
188
+ assert line < len(self)
189
+ line = self.data.iloc[line]
190
+
191
+ frames, indices, video_info = self.save_video_frames(line['video'], video_llm)
192
+
193
+ if self.use_subtitle and os.path.exists(osp.join(self.data_root, line['subtitle_path'])):
194
+ import pysubs2
195
+ subs = pysubs2.load(osp.join(self.data_root, line['subtitle_path']), encoding='utf-8')
196
+ subtitles = []
197
+
198
+ for selected_frame_id in indices:
199
+ sub_text = ''
200
+ cur_time = pysubs2.make_time(fps=video_info['fps'], frames=selected_frame_id)
201
+ for sub in subs:
202
+ if sub.start < cur_time and sub.end > cur_time:
203
+ sub_text = sub.text.replace('\\N', ' ')
204
+ break
205
+ if sub_text.strip():
206
+ subtitles.append(sub_text)
207
+ subtitles = '\n'.join(subtitles)
208
+ else:
209
+ subtitles = ''
210
+
211
+ message = [dict(type='text', value=self.SYS)]
212
+ if video_llm:
213
+ message.append(dict(type='video', value=osp.join(self.data_root, 'video', line['video'] + '.mp4')))
214
+ else:
215
+ for im in frames:
216
+ message.append(dict(type='image', value=im))
217
+
218
+ text_prompt = self.FRAMES_TMPL_NOSUB if not self.use_subtitle else self.FRAMES_TMPL_SUB.format(subtitles)
219
+ message.append(dict(type='text', value=text_prompt))
220
+ line['question'] += '\n' + '\n'.join(eval(line['candidates']))
221
+ prompt = 'Question: {}\nAnswer: '.format(line['question'])
222
+ message.append(dict(type='text', value=prompt))
223
+ return message
224
+
225
+ # It returns a dictionary
226
+ @classmethod
227
+ def evaluate(self, eval_file, **judge_kwargs):
228
+ from .utils.videomme import get_dimension_rating, extract_characters_regex, extract_option
229
+
230
+ assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
231
+
232
+ tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
233
+ tgt_file = eval_file.replace('.xlsx', '_rating.json')
234
+ score_file = eval_file.replace('.xlsx', '_score.xlsx')
235
+
236
+ if not osp.exists(score_file):
237
+ model = judge_kwargs.get('model', 'exact_matching')
238
+ assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
239
+
240
+ if model == 'exact_matching':
241
+ model = None
242
+ elif gpt_key_set():
243
+ model = build_judge(**judge_kwargs)
244
+ if not model.working():
245
+ warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
246
+ warnings.warn(DEBUG_MESSAGE)
247
+ model = None
248
+ else:
249
+ warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
250
+ model = None
251
+ res = {} if not osp.exists(tmp_file) else load(tmp_file)
252
+ res = {k: v for k, v in res.items() if FAIL_MSG not in v}
253
+
254
+ data = load(eval_file)
255
+ data_un = data[~pd.isna(data['prediction'])]
256
+
257
+ for idx in data['index']:
258
+ ans = data.loc[data['index'] == idx, 'answer'].values[0]
259
+ pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])
260
+
261
+ if extract_characters_regex(pred) == '':
262
+ extract_pred = extract_option(
263
+ model,
264
+ data.loc[data['index'] == idx].to_dict(orient='records')[0],
265
+ 'Video-MME'
266
+ )
267
+ data.loc[idx, 'score'] = int(extract_pred == ans)
268
+ else:
269
+ data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)
270
+
271
+ rejected = [x for x in data['score'] if x == -1]
272
+
273
+ print(
274
+ f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
275
+ f'failed to obtain the score for another {len(rejected)} questions. '
276
+ f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
277
+ )
278
+
279
+ dump(data, score_file)
280
+
281
+ rating = get_dimension_rating(score_file)
282
+ dump(rating, tgt_file)
283
+ return rating
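For readers skimming the diff: the two branches in `save_video_frames` reduce to simple index arithmetic. Below is a minimal standalone sketch (the helper name and the toy frame counts are illustrative only, not part of the commit) that reproduces both sampling modes:

# Illustrative sketch of the two sampling modes in save_video_frames; not part of the commit.
def sample_indices(n_frames, native_fps, nframe=-1, target_fps=-1.0):
    if nframe > 0 and target_fps < 0:
        # uniform mode: nframe interior points, both endpoints excluded
        step = n_frames / (nframe + 1)
        return [int(i * step) for i in range(1, nframe + 1)]
    elif target_fps > 0:
        # fixed-rate mode: one frame every native_fps / target_fps source frames
        duration = n_frames / native_fps
        step = native_fps / target_fps
        return [int(i * step) for i in range(int(duration * target_fps))]
    raise ValueError('set either nframe or target_fps')

print(sample_indices(300, 30.0, nframe=4))        # [60, 120, 180, 240]
print(sample_indices(300, 30.0, target_fps=1.0))  # [0, 30, 60, ..., 270]

Note the asymmetry: uniform mode never samples frame 0 or the final frame, while fps mode always starts at frame 0.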
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py ADDED
@@ -0,0 +1,174 @@
+ from ast import literal_eval
+
+ from .image_base import ImageBaseDataset
+ from .utils import build_judge, DEBUG_MESSAGE
+ from ..smp import *
+ from ..utils import track_progress_rich
+
+
+ LLM_PARSE_ANSWER_PROMPT = '''
+ You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
+ Return the Answer X ONLY. e.g., Answer 1 or Answer 2.
+
+ Judgement: {judgement}
+ '''
+
+
+ PROMPT_TEMPLATE = '''\
+ You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions.
+ Please analyze the following image and question, then determine which of the two provided answers is better.
+
+ Question: {query}
+
+ Answer 1: {answer_0}
+
+ Answer 2: {answer_1}
+
+ Please evaluate both answers based on the following criteria:
+ 1. Accuracy: How well does the answer align with the visual information in the image?
+ 2. Completeness: Does the answer fully address all aspects of the question?
+ 3. Clarity: Is the answer easy to understand and well-articulated?
+ 4. Relevance: Does the answer directly relate to the question and the image?
+
+ After your evaluation, please:
+ 1. Explain your reasoning for each criterion.
+ 2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2). \
+ For example: Overall Judgment: Answer X is better.
+
+ Your response should be structured and detailed, \
+ demonstrating your understanding of both the visual and textual elements of the task.'''
+
+
+ def get_score(line, parsed_response, random_number):
+     gt_ans = line['human_ranking'].index(0 if random_number == 0 else 1) + 1
+     if 'Answer 1'.lower() in parsed_response.lower():
+         pred = 1
+     elif 'Answer 2'.lower() in parsed_response.lower():
+         pred = 2
+     else:  # failed to parse a verdict
+         pred = 'None'  # random.choice([1, 2])
+
+     if pred == gt_ans:
+         return 1.0
+     else:
+         return 0.0
+
+
+ def VLRewardBench_eval_answer(model, line):
+     response = toliststr(line['response'])
+     random_number = sum(len(res) for res in response) % 2
+
+     prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line['prediction'])
+     messages = [dict(type='text', value=prompt)]
+
+     resp = model.generate(messages)
+     score = get_score(line, resp, random_number)
+
+     if score is None:
+         return 'Unknown'
+     return score
+
+
+ class VLRewardBench(ImageBaseDataset):
+     TYPE = 'VQA'
+     DATASET_URL = {
+         'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv'
+     }
+     DATASET_MD5 = {'VL-RewardBench': '1d2676f4ab4a5f755019ec0af2b28189'}
+
+     # Given one data record, return the built prompt (a multi-modal message), can override
+     def build_prompt(self, line):
+         if isinstance(line, int):
+             line = self.data.iloc[line]
+         tgt_path = self.dump_image(line)  # save image to local
+         question = line['question']
+         msgs = []
+         if isinstance(tgt_path, list):
+             msgs.extend([dict(type='image', value=p) for p in tgt_path])
+         else:
+             msgs = [dict(type='image', value=tgt_path)]
+
+         response = toliststr(line['response'])
+         random_number = sum(len(res) for res in response) % 2
+         if random_number == 1:
+             # swap the response order; the length parity acts as a fixed pseudo-random bit per sample
+             response = response[::-1]
+         query_prompt = PROMPT_TEMPLATE.format(
+             query=question, answer_0=response[0], answer_1=response[1]
+         )
+         msgs = msgs + [dict(type='text', value=query_prompt)]
+         return msgs
+
+     # It returns a DataFrame
+     @classmethod
+     def evaluate(self, eval_file, **judge_kwargs):
+         suffix = eval_file.split('.')[-1]
+         model = judge_kwargs['model']
+         storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+         score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
+         tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+         nproc = judge_kwargs.pop('nproc', 4)
+
+         if not osp.exists(storage):
+             raw_data = VLRewardBench('VL-RewardBench').data
+             data = load(eval_file)
+             data['prediction'] = [str(x) for x in data['prediction']]
+             data['human_ranking'] = [literal_eval(x) for x in raw_data['answer']]
+
+             judge_kwargs['temperature'] = 0
+             judge_kwargs['timeout'] = 60
+             model = build_judge(max_tokens=128, **judge_kwargs)
+
+             assert model.working(), (
+                 'VLRewardBench evaluation requires a working OPENAI API\n'
+                 + DEBUG_MESSAGE
+             )
+
+             lt = len(data)
+             lines = [data.iloc[i] for i in range(lt)]
+             tups = [(model, line) for line in lines]
+             indices = [line['index'] for line in lines]
+
+             ans = load(tmp_file) if osp.exists(tmp_file) else {}
+             tups = [x for x, i in zip(tups, indices) if i not in ans]
+             indices = [i for i in indices if i not in ans]
+
+             if len(indices):
+                 new_results = track_progress_rich(
+                     VLRewardBench_eval_answer,
+                     tups,
+                     nproc=nproc,
+                     chunksize=nproc,
+                     keys=indices,
+                     save=tmp_file,
+                 )
+                 ans = load(tmp_file)
+                 for k, v in zip(indices, new_results):
+                     ans[k] = v
+
+             data['score'] = [ans[idx] for idx in data['index']]
+             # data.pop('image')
+             dump(data, storage)
+
+         data = load(storage)
+         lt = len(data)
+
+         category_scores = defaultdict(lambda: 0)
+         category_cnt = defaultdict(lambda: 0)
+         scores = defaultdict(lambda: 0)
+         for i in range(lt):
+             item = data.iloc[i]
+             category_scores[item['category']] += item['score']
+             category_cnt[item['category']] += 1
+         # calculate the average score for each category
+         for k, v in category_scores.items():
+             scores[k] = v / category_cnt[k]
+         # calculate category macro accuracy (unweighted average across categories)
+         scores['Macro Accuracy'] = sum(scores.values()) / len(scores)
+         # calculate the overall average score across all samples
+         scores['Overall Consistency'] = sum(category_scores.values()) / lt
+
+         scores = {k: [v] for k, v in scores.items()}
+         scores = pd.DataFrame(scores)
+         dump(scores, score_file)
+         return scores
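One subtlety worth calling out in the file above: the answer-order swap in `build_prompt` is deterministic, not random. The parity of the total response length decides whether the pair is shown in original or reversed order, and `get_score` recomputes the same parity to recover which displayed slot holds the human-preferred answer. A toy walkthrough (the response strings and ranking below are invented for illustration, not real dataset rows):

# Invented example values; mirrors the parity logic in build_prompt / get_score above.
responses = ['A short answer.', 'A much longer, more detailed answer.']
human_ranking = [1, 0]  # per-response rank; 0 marks the human-preferred response

swap = sum(len(r) for r in responses) % 2   # fixed per sample, so prompt and scoring agree
shown = responses[::-1] if swap == 1 else responses

# displayed slot (1 or 2) that holds the preferred response
gt_slot = human_ranking.index(0 if swap == 0 else 1) + 1
assert shown[gt_slot - 1] == responses[human_ranking.index(0)]
print(gt_slot, shown[gt_slot - 1])  # the slot holding the preferred answer, whatever the parity

Because the swap bit is a pure function of the sample, the evaluation needs no stored random seed: the judge prompt and the scorer always agree on which response sits in which slot.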
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/wildvision.py ADDED
@@ -0,0 +1,222 @@
+ import re
+ from functools import partial
+
+ from .image_base import ImageBaseDataset
+ from .utils import build_judge, DEBUG_MESSAGE
+ from ..smp import *
+ from ..utils import track_progress_rich
+
+
+ SYSTEM_PROMPT = """\
+ Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user \
+ prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate \
+ which assistant's answer is better.
+
+ Begin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any \
+ answers.
+
+ When evaluating the assistants' answers, compare both assistants' answers with your answer. \
+ You must identify and correct any mistakes or inaccurate information.
+
+ Then consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly \
+ responds to the prompt or follows the instructions. Note that when the user prompt has any ambiguity or more than one \
+ interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than \
+ providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate \
+ to what is being asked. Concise means the response is clear and not verbose or excessive.
+
+ Then consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing \
+ important information in the assistants' answers that would be beneficial to include when responding to the user \
+ prompt.
+
+ After providing your explanation, you must output only one of the following choices as your final verdict with a label:
+
+ 1. Assistant A is significantly better: [[A>>B]]
+ 2. Assistant A is slightly better: [[A>B]]
+ 3. Tie, relatively the same: [[A=B]]
+ 4. Assistant B is slightly better: [[B>A]]
+ 5. Assistant B is significantly better: [[B>>A]]
+
+ Example output: "My final verdict is tie: [[A=B]]".\
+ """
+
+
+ PROMPT_TEMPLATE = """\
+ <|User Prompt|>\n{question}
+
+ <|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>
+
+ <|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>
+ """
+
+
+ REGEX_PATTERN = re.compile(r"\[\[([AB<>=]+)\]\]")
+
+
+ def get_score(judgement, pattern=REGEX_PATTERN):
+     matches = pattern.findall(judgement)
+     matches = [m for m in matches if m != ""]
+     if len(set(matches)) == 0:
+         # no verdict label found: signal the caller to retry
+         return None, True
+     elif len(set(matches)) == 1:
+         return matches[0].strip("\n"), False
+     else:
+         # conflicting verdict labels: also retry
+         return None, True
+
+
+ def WildVision_auxeval(model, line):
+     config = dict(question=line['question'], answer_1=line['A'], answer_2=line['B'])
+     prompt = PROMPT_TEMPLATE.format(**config)
+
+     prefix = 'data:image/jpeg;base64,'
+     img = prefix + line['image']
+
+     messages = [
+         dict(type='text', value=prompt),
+         dict(type='image', value=img)
+     ]
+
+     retry = 2
+     while retry:
+         resp = model.generate(messages)
+         score, try_again = get_score(resp)
+         if not try_again:
+             break
+         retry -= 1
+
+     if score is None:
+         return 'Unknown'
+     return score
+
+
+ class WildVision(ImageBaseDataset):
+     TYPE = 'VQA'
+     DATASET_URL = {
+         'WildVision': 'https://opencompass.openxlab.space/utils/VLMEval/WildVision.tsv'
+     }
+     DATASET_MD5 = {'WildVision': 'b38f80156d49411c594772866b0d0b52'}
+
+     score_map = {
+         'A>>B': -2,
+         'A>B': -1,
+         'A=B': 0,
+         'B>A': 1,
+         'B>>A': 2
+     }
+
+     # Given one data record, return the built prompt (a multi-modal message), can override
+     def build_prompt(self, line):
+         if isinstance(line, int):
+             line = self.data.iloc[line]
+
+         if self.meta_only:
+             tgt_path = toliststr(line['image_path'])
+         else:
+             tgt_path = self.dump_image(line)
+
+         question = line['question']
+
+         msgs = []
+         if isinstance(tgt_path, list):
+             msgs.extend([dict(type='image', value=p) for p in tgt_path])
+         else:
+             msgs = [dict(type='image', value=tgt_path)]
+         # WildVision adopts text first
+         msgs = [dict(type='text', value=question)] + msgs
+         return msgs
+
+     @classmethod
+     def gen_eval_base(self, eval_file, b64_map):
+         data = load(eval_file)
+         data['B'] = data.pop('prediction')
+         data['A'] = data.pop('claude3_sonnet')
+         data['image'] = [b64_map[x] for x in data['index']]
+         return data
+         # rev = cp.deepcopy(data)
+         # rev['A'] = data['B']
+         # rev['B'] = data['A']
+         # rev['index'] = [x + '_rev' for x in data['index']]
+         # return pd.concat([data, rev], ignore_index=True)
+
+     # It returns a DataFrame
+     @classmethod
+     def evaluate(self, eval_file, **judge_kwargs):
+         # Pairwise evaluation against a fixed reference; the reversed-order second pass
+         # (the commented-out code in gen_eval_base) is currently disabled
+         suffix = eval_file.split('.')[-1]
+         model = judge_kwargs['model']
+         storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+         score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
+         tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+         nproc = judge_kwargs.pop('nproc', 4)
+
+         if not osp.exists(storage):
+             raw_data = WildVision('WildVision').data
+             b64_map = {x: y for x, y in zip(raw_data['index'], raw_data['image'])}
+             data = self.gen_eval_base(eval_file, b64_map)
+
+             judge_kwargs['system_prompt'] = SYSTEM_PROMPT
+             judge_kwargs['temperature'] = 0
+             judge_kwargs['img_detail'] = 'high'
+             judge_kwargs['timeout'] = 300
+             model = build_judge(max_tokens=4096, **judge_kwargs)
+
+             assert model.working(), ('WildVision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
+
+             lt = len(data)
+             lines = [data.iloc[i] for i in range(lt)]
+             tups = [(model, line) for line in lines]
+             indices = [line['index'] for line in lines]
+
+             ans = load(tmp_file) if osp.exists(tmp_file) else {}
+             tups = [x for x, i in zip(tups, indices) if i not in ans]
+             indices = [i for i in indices if i not in ans]
+
+             if len(indices):
+                 new_results = track_progress_rich(
+                     WildVision_auxeval,
+                     tups,
+                     nproc=nproc,
+                     chunksize=nproc,
+                     keys=indices,
+                     save=tmp_file,
+                 )
+                 ans = load(tmp_file)
+                 for k, v in zip(indices, new_results):
+                     ans[k] = v
+
+             data['score'] = [ans[idx] for idx in data['index']]
+             data.pop('image')
+             dump(data, storage)
+
+         data = load(storage)
+         lt = len(data)
+
+         scores = defaultdict(lambda: 0)
+         for i in range(lt):
+             item = data.iloc[i]
+             if item['score'] not in self.score_map:
+                 score = 0
+             else:
+                 score = self.score_map[item['score']]
+             if '_rev' in item['index']:
+                 score = -score
+             scores[score] += 1
+         name_map = {
+             2: 'Much Better',
+             1: 'Better',
+             0: 'Tie',
+             -1: 'Worse',
+             -2: 'Much Worse'
+         }
+         scores = {name_map[k]: v for k, v in scores.items()}
+         much_better = scores.get('Much Better', 0)
+         better = scores.get('Better', 0)
+         worse = scores.get('Worse', 0)
+         much_worse = scores.get('Much Worse', 0)
+         scores['Reward'] = (
+             100 * much_better + 50 * better - 50 * worse - 100 * much_worse
+         ) / lt
+         scores['Win Rate'] = (better + much_better) / lt
+         scores = {k: [v] for k, v in scores.items()}
+         scores = pd.DataFrame(scores)
+         dump(scores, score_file)
+         return scores
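The aggregation at the end of `evaluate` maps each verdict through `score_map`, where positive values favor slot B, i.e. the evaluated model, since `gen_eval_base` places the model's prediction in slot B and the claude3_sonnet reference in slot A. The counts then collapse into a reward in [-100, 100] and a win rate. A toy computation (the verdict counts below are invented for illustration):

# Invented verdict counts; mirrors the Reward / Win Rate arithmetic in evaluate above.
counts = {'Much Better': 20, 'Better': 30, 'Tie': 25, 'Worse': 15, 'Much Worse': 10}
n = sum(counts.values())  # 100 judged pairs

reward = (100 * counts['Much Better'] + 50 * counts['Better']
          - 50 * counts['Worse'] - 100 * counts['Much Worse']) / n
win_rate = (counts['Much Better'] + counts['Better']) / n
print(reward, win_rate)  # 17.5 0.5

Ties and unparseable verdicts contribute zero to the reward but still count in the denominator, so a model that mostly ties the reference lands near a reward of 0 rather than near the extremes.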