File size: 3,308 Bytes
b0c0df0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import base64
import datetime
import io
import json
import os
import string
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from loguru import logger as eval_logger
from PIL import Image

from lmms_eval.tasks.hrbench.hrbench_evals import HRBenchEval

# Load the task configuration that sits next to this module. The YAML file is
# read line by line so that entries using the custom "!function" tag can be
# dropped before parsing.
with open(Path(__file__).parent / "hrbench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

    # Parse only the lines that survived the filter above.
    config = yaml.safe_load("".join(safe_data))

# Shared evaluator, created once at import time. The API key and model name
# come from the OPENAI_API_KEY and MODEL_VERSION environment variables (with
# the placeholder/default values given below); the worker count comes from the
# "metadata" -> "max_workers" entry of the parsed YAML config.
hrbench_evaluator = HRBenchEval(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"), gpt_model=os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20"), max_workers=config["metadata"]["max_workers"])


def decode_base64_to_image(base64_string, target_size=-1):
    """Decode a base64-encoded image into an RGB PIL image.

    Args:
        base64_string: base64 text (or bytes) holding the encoded image file.
        target_size: when positive, the decoded image is shrunk in place with
            ``Image.thumbnail`` using ``(target_size, target_size)`` as the
            bounding size. Non-positive values leave the image untouched.
    Returns:
        The decoded image converted to RGB mode.
    """
    buffer = io.BytesIO(base64.b64decode(base64_string))
    decoded = Image.open(buffer).convert("RGB")
    if target_size > 0:
        bounds = (target_size, target_size)
        decoded.thumbnail(bounds)
    return decoded


def hrbench_doc_to_visual(doc):
    """Return the visual inputs of a document as a one-element list.

    Args:
        doc: a dataset instance whose "image" field holds a base64-encoded image.
    Returns:
        A list containing the single decoded RGB image (no resizing applied).
    """
    return [decode_base64_to_image(doc["image"])]


def hrbench_doc_to_options(doc):
    """Collect the answer options present in a document.

    Every uppercase ASCII letter is checked in alphabetical order; a letter is
    kept when it is a key of ``doc`` and its value is not missing according to
    ``pd.isna`` (e.g. not ``None`` / ``NaN``).

    Args:
        doc: a dataset instance with option columns named "A", "B", ...
    Returns:
        A dict mapping option letter to option content, in alphabetical order.
    """
    choices = {}
    for letter in string.ascii_uppercase:
        if letter not in doc:
            continue
        value = doc[letter]
        if pd.isna(value):
            continue
        choices[letter] = value
    return choices


def hrbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the multiple-choice prompt for a document.

    The prompt is the stripped question, one "<letter>. <content>" line per
    available option, and a closing instruction to answer with the letter.

    Args:
        doc: a dataset instance with a "question" field and option columns.
        lmms_eval_specific_kwargs: accepted for interface compatibility; it is
            not used when building the prompt.
    Returns:
        The prompt string.
    """
    stem = doc["question"].strip()
    option_lines = "".join(f"{letter}. {content}\n" for letter, content in hrbench_doc_to_options(doc).items())
    return f"{stem}\n{option_lines}Answer the option letter directly."


def hrbench_process_results(doc, results):
    """Score a single model prediction against the ground-truth answer.

    The stripped prediction, the question and its options are handed to the
    shared ``hrbench_evaluator``; the "gpt_prediction" it returns is compared
    case-insensitively with ``doc["answer"]``.

    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        A dictionary keyed by the document's category and by "average"; each
        value is a separate record holding the document index, its
        cycle_category and a gpt_score of 1 (match) or 0 (no match).
    """
    prediction = results[0].strip()
    answer = doc["answer"]
    choices = hrbench_doc_to_options(doc)
    judge_request = {"question": doc["question"], "options": choices, "prediction": prediction}
    # NOTE(review): presumably this calls a GPT judge to extract the chosen option - confirm in HRBenchEval.
    judged_answer = hrbench_evaluator.get_chat_response(judge_request)["gpt_prediction"]
    category = doc["category"]
    cycle_category = doc["cycle_category"]

    gpt_score = int(answer.lower() == judged_answer.lower())

    record = {"index": doc["index"], "cycle_category": cycle_category, "gpt_score": gpt_score}
    # The two metric entries get independent dict objects, as callers may mutate them.
    return {category: record, "average": dict(record)}


def hrbench_aggregate_results(results, args):
    """Aggregate per-document scores into one macro-averaged score.

    Scores are first grouped by "cycle_category" and averaged within each
    group; the final score is the unweighted mean of those group averages, so
    every cycle category counts equally regardless of how many documents it has.

    Args:
        results: a list of values returned by process_results; each item must
            provide "cycle_category" and "gpt_score".
        args: unused; accepted because the aggregation interface passes it.
    Returns:
        The mean of the per-cycle-category average scores, or 0.0 when
        ``results`` is empty.
    """
    scores_by_cycle = defaultdict(list)
    for result in results:
        scores_by_cycle[result["cycle_category"]].append(result["gpt_score"])

    # Without any results there is nothing to average; return 0.0 instead of
    # failing with ZeroDivisionError.
    if not scores_by_cycle:
        return 0.0

    # Each group holds at least one score, so the inner division is safe.
    per_cycle_means = [sum(scores) / len(scores) for scores in scores_by_cycle.values()]
    return sum(per_cycle_means) / len(per_cycle_means)