Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- EAGLE/lmms_eval/tasks/ferret/ferret.yaml +39 -0
- EAGLE/lmms_eval/tasks/ferret/rule.json +5 -0
- EAGLE/lmms_eval/tasks/ferret/utils.py +206 -0
- EAGLE/lmms_eval/tasks/flickr30k/flickr30k.yaml +3 -0
- EAGLE/lmms_eval/tasks/flickr30k/flickr30k_test.yaml +44 -0
- EAGLE/lmms_eval/tasks/flickr30k/utils.py +141 -0
- EAGLE/lmms_eval/tasks/hallusion_bench/evaluate_hb.py +129 -0
- EAGLE/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml +41 -0
- EAGLE/lmms_eval/tasks/hallusion_bench/utils.py +306 -0
- EAGLE/lmms_eval/tasks/iconqa/utils.py +57 -0
- EAGLE/lmms_eval/tasks/mme/mme.yaml +37 -0
- EAGLE/lmms_eval/tasks/mme/utils.py +120 -0
- EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa.yaml +4 -0
- EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml +20 -0
- EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml +23 -0
- EAGLE/lmms_eval/tasks/multidocvqa/utils.py +116 -0
- EAGLE/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml +3 -0
- EAGLE/lmms_eval/tasks/nocaps/nocaps.yaml +4 -0
- EAGLE/lmms_eval/tasks/nocaps/nocaps_test.yaml +25 -0
- EAGLE/lmms_eval/tasks/nocaps/nocaps_val.yaml +46 -0
- EAGLE/lmms_eval/tasks/nocaps/utils.py +153 -0
- EAGLE/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml +24 -0
- EAGLE/lmms_eval/tasks/ok_vqa/_generate_config.py +25 -0
- EAGLE/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml +3 -0
- EAGLE/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml +4 -0
- EAGLE/lmms_eval/tasks/ok_vqa/utils.py +70 -0
- EAGLE/lmms_eval/tasks/olympiadbench/cn_utils.py +69 -0
- EAGLE/lmms_eval/tasks/olympiadbench/en_utils.py +69 -0
- EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench.yaml +6 -0
- EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py +355 -0
- EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml +25 -0
- EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml +25 -0
- EAGLE/lmms_eval/tasks/seedbench/seedbench.yaml +28 -0
- EAGLE/lmms_eval/tasks/seedbench/seedbench_ppl.yaml +15 -0
- EAGLE/lmms_eval/tasks/seedbench/utils.py +60 -0
- EAGLE/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml +17 -0
- EAGLE/lmms_eval/tasks/textvqa/_textvqa.yaml +4 -0
- EAGLE/lmms_eval/tasks/textvqa/textvqa_test.yaml +7 -0
- EAGLE/lmms_eval/tasks/textvqa/textvqa_val.yaml +12 -0
- EAGLE/lmms_eval/tasks/textvqa/utils.py +68 -0
- EAGLE/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml +15 -0
- EAGLE/lmms_eval/tasks/vizwiz_vqa/_generate_config.py +25 -0
- EAGLE/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml +4 -0
- EAGLE/lmms_eval/tasks/vizwiz_vqa/utils.py +70 -0
- EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml +14 -0
- EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml +13 -0
- EAGLE/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml +15 -0
- EAGLE/lmms_eval/tasks/vqav2/_vqav2.yaml +4 -0
- EAGLE/lmms_eval/tasks/vqav2/utils.py +89 -0
- EAGLE/lmms_eval/tasks/vqav2/vqav2_test.yaml +8 -0
EAGLE/lmms_eval/tasks/ferret/ferret.yaml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/Ferret-Bench
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task: "ferret"
|
| 5 |
+
test_split: test
|
| 6 |
+
output_type: generate_until
|
| 7 |
+
doc_to_visual: !function utils.ferret_doc_to_visual
|
| 8 |
+
doc_to_text: !function utils.ferret_doc_to_text
|
| 9 |
+
doc_to_target: "gpt_answer"
|
| 10 |
+
generation_kwargs:
|
| 11 |
+
until:
|
| 12 |
+
- "ASSISTANT:"
|
| 13 |
+
image_aspect_ratio: original
|
| 14 |
+
max_new_tokens: 1024
|
| 15 |
+
temperature: 0
|
| 16 |
+
top_p: 0
|
| 17 |
+
num_beams: 1
|
| 18 |
+
do_sample: false
|
| 19 |
+
process_results: !function utils.ferret_process_results
|
| 20 |
+
metric_list:
|
| 21 |
+
- metric: gpt_eval_ferret_all
|
| 22 |
+
aggregation: !function utils.ferret_all_aggregation
|
| 23 |
+
higher_is_better: true
|
| 24 |
+
- metric: gpt_eval_ferret_refer_desc
|
| 25 |
+
aggregation: !function utils.ferret_refer_desc_aggregation
|
| 26 |
+
higher_is_better: true
|
| 27 |
+
- metric: gpt_eval_ferret_refer_reason
|
| 28 |
+
aggregation: !function utils.ferret_refer_reason_aggregation
|
| 29 |
+
higher_is_better: true
|
| 30 |
+
- metric: gpt_eval_ferret_ground_conv
|
| 31 |
+
aggregation: !function utils.ferret_ground_conv_aggregation
|
| 32 |
+
higher_is_better: true
|
| 33 |
+
metadata:
|
| 34 |
+
version: 0.0
|
| 35 |
+
gpt_eval_model_name: "gpt-4-0314"
|
| 36 |
+
model_specific_prompt_kwargs:
|
| 37 |
+
default:
|
| 38 |
+
pre_prompt: ""
|
| 39 |
+
post_prompt: ""
|
EAGLE/lmms_eval/tasks/ferret/rule.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"refer_desc": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
|
| 3 |
+
"refer_reason": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
|
| 4 |
+
"ground_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question that requires model to predict the coordinates of relevant object. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the predicted coordinates, helpfulness, relevance, accuracy, level of details of their responses. Specifically, pay your attention to the precision of the coordinates and whether it matches the object. Small deviation (<20% of ground-truth box width or height) of coordinates is allowed and shouldn't be punished. More than that, the degree of deviation should be reflected in scoring too. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}
|
| 5 |
+
}
|
EAGLE/lmms_eval/tasks/ferret/utils.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import requests
|
| 5 |
+
import numpy as np
|
| 6 |
+
import openai
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
import time
|
| 9 |
+
import yaml
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from copy import deepcopy
|
| 12 |
+
|
| 13 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 14 |
+
NUM_SECONDS_TO_SLEEP = 0.5
|
| 15 |
+
|
| 16 |
+
FERRET_W_METRICS = ["gpt_eval_ferret_refer_desc", "gpt_eval_ferret_refer_reason", "gpt_eval_ferret_ground_conv"]
|
| 17 |
+
|
| 18 |
+
rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))
|
| 19 |
+
|
| 20 |
+
with open(Path(__file__).parent / "ferret.yaml", "r") as f:
|
| 21 |
+
raw_data = f.readlines()
|
| 22 |
+
safe_data = []
|
| 23 |
+
for i, line in enumerate(raw_data):
|
| 24 |
+
# remove function definition since yaml load cannot handle it
|
| 25 |
+
if "!function" not in line:
|
| 26 |
+
safe_data.append(line)
|
| 27 |
+
|
| 28 |
+
config = yaml.safe_load("".join(safe_data))
|
| 29 |
+
|
| 30 |
+
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
|
| 31 |
+
|
| 32 |
+
API_TYPE = os.getenv("API_TYPE", "openai")
|
| 33 |
+
|
| 34 |
+
if API_TYPE == "openai":
|
| 35 |
+
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
|
| 36 |
+
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
|
| 37 |
+
headers = {
|
| 38 |
+
"Authorization": f"Bearer {API_KEY}",
|
| 39 |
+
"Content-Type": "application/json",
|
| 40 |
+
}
|
| 41 |
+
elif API_TYPE == "azure":
|
| 42 |
+
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
|
| 43 |
+
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
|
| 44 |
+
headers = {
|
| 45 |
+
"api-key": API_KEY,
|
| 46 |
+
"Content-Type": "application/json",
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_eval(content: str, max_tokens: int, retries: int = 3):
    """Send an evaluation prompt to the GPT judge and return its review.

    Args:
        content: Fully formatted user prompt (context, question, both answers, rule).
        max_tokens: Generation budget for the judge model.
        retries: Number of attempts before giving up.

    Returns:
        Tuple of (review text, judge model name); ("", "") after all attempts fail.
    """
    global headers

    messages = [
        {
            "role": "system",
            "content": "You are a helpful and precise assistant for checking the quality of the answer.",
        },
        {"role": "user", "content": content},
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload)
            response.raise_for_status()
            response_data = response.json()

            # Bind the judge's reply to its own name instead of shadowing the
            # `content` parameter (the original overwrote its own input).
            review = response_data["choices"][0]["message"]["content"].strip()
            if review != "":
                return review, response_data["model"]
            # An empty reply is treated as a transient failure and retried;
            # the original `break` silently aborted the remaining attempts.
            raise ValueError("judge returned an empty completion")

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:  # If this was the last attempt, log and fall through to the failure return
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
    return "", ""
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def parse_score(review):
    """Parse the judge's first line into [assistant1_score, assistant2_score].

    The judge is instructed to emit two numbers on the first line separated by
    a space, but commas sometimes sneak in. Returns [-1, -1] when the line
    cannot be parsed.
    """
    try:
        score_pair = review.split("\n")[0]
        score_pair = score_pair.replace(",", " ")
        # split() with no argument collapses runs of whitespace, so a reply
        # like "7, 8" (comma -> double space) still yields exactly two tokens;
        # the original split(" ") produced three and fell into the error path.
        sp = score_pair.split()
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            eval_logger.debug(f"Can not split: {review}. Returning [-1, -1]")
            return [-1, -1]
    except Exception as e:
        eval_logger.debug(f"Error: {e}. Returning [-1, -1]")
        return [-1, -1]
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def ferret_doc_to_visual(doc):
    """Return the sample's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def ferret_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Wrap the dataset question with model-specific pre/post prompt strings."""
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    prefix = model_specific_prompt_kwargs.get("pre_prompt", "")
    suffix = model_specific_prompt_kwargs.get("post_prompt", "")
    return f"{prefix}{doc['question']}{suffix}"
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def ferret_process_results(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary mapping each ferret GPT-eval metric name to a review record
    """
    # Pre-populate every field the review record needs BEFORE the try block, so
    # the except path can never hit a NameError when an early statement fails.
    question = doc.get("question", "")
    ans1 = doc.get("gpt_answer", "")
    ans2 = result[0] if result else ""
    context = doc.get("context", [])
    context = "\n".join(context) if isinstance(context, list) else context
    category = doc.get("category", "")
    try:
        rule = rule_dict.get(category, {})
        prompt = rule.get("prompt", "")
        role = rule.get("role", "user")
        content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n"
        review, model_name = get_eval(content, 1024)
        scores = parse_score(review)
    except Exception as e:
        eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
        review = "Failed to Get a Proper Review."
        model_name = "Failed Request"
        scores = [-1, -1]

    metric = f"gpt_eval_ferret_{doc.get('category', 'all')}"
    category_review_dict = {
        "question": question,
        "ans1": ans1,
        "ans2": ans2,
        "context": context,
        "category": category,
        "review": review,
        "scores": scores,
        "eval_model": model_name,
    }

    # Sentinel scores [-999, -999] mark metrics the sample does not belong to;
    # ferret_aggregation skips them.
    non_category_review_dict = deepcopy(category_review_dict)
    non_category_review_dict["scores"] = [-999, -999]

    data_dict = {}
    for m in FERRET_W_METRICS:
        if m == metric:
            data_dict[m] = category_review_dict
        else:
            data_dict[m] = non_category_review_dict
    # Every sample contributes its real scores to the overall metric.
    data_dict["gpt_eval_ferret_all"] = category_review_dict

    return data_dict
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def ferret_refer_desc_aggregation(results):
    """Aggregate GPT-eval scores for the "refer_desc" category."""
    return ferret_aggregation(results, "refer_desc")


def ferret_refer_reason_aggregation(results):
    """Aggregate GPT-eval scores for the "refer_reason" category."""
    return ferret_aggregation(results, "refer_reason")


def ferret_ground_conv_aggregation(results):
    """Aggregate GPT-eval scores for the "ground_conv" category."""
    return ferret_aggregation(results, "ground_conv")


def ferret_all_aggregation(results):
    """Aggregate GPT-eval scores across all ferret categories."""
    return ferret_aggregation(results, "all")
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def ferret_aggregation(results, category):
    """Average the (gpt, model) score pairs and report model/gpt as a percentage."""
    try:
        # Drop placeholder entries ([-999, -999]) emitted for non-matching categories.
        kept = [r["scores"] for r in results if -999 not in r["scores"]]

        means = np.asarray(kept).mean(0).tolist()
        means = [round(v, 3) for v in means]
        # Ratio of the model's mean score (index 1) to the GPT reference (index 0).
        return round(means[1] / means[0] * 100, 1)
    except Exception as e:
        eval_logger.info(f"Error in ferret_aggregation: {e}, and in category: {category}")
        return None
|
EAGLE/lmms_eval/tasks/flickr30k/flickr30k.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: flickr30k
|
| 2 |
+
task:
|
| 3 |
+
- flickr30k_test
|
EAGLE/lmms_eval/tasks/flickr30k/flickr30k_test.yaml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/flickr30k
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task : "flickr30k_test"
|
| 5 |
+
test_split: test
|
| 6 |
+
output_type: generate_until
|
| 7 |
+
doc_to_visual: !function utils.flickr_doc_to_visual
|
| 8 |
+
doc_to_text: !function utils.flickr_doc_to_text
|
| 9 |
+
doc_to_target: "answer"
|
| 10 |
+
generation_kwargs:
|
| 11 |
+
max_new_tokens: 64
|
| 12 |
+
temperature: 0
|
| 13 |
+
top_p: 0
|
| 14 |
+
num_beams: 1
|
| 15 |
+
do_sample: false
|
| 16 |
+
process_results: !function utils.flickr_process_result
|
| 17 |
+
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
|
| 18 |
+
metric_list:
|
| 19 |
+
- metric: flickr_Bleu_4
|
| 20 |
+
aggregation : !function utils.flickr_bleu4
|
| 21 |
+
higher_is_better : true
|
| 22 |
+
- metric: flickr_Bleu_3
|
| 23 |
+
aggregation : !function utils.flickr_bleu3
|
| 24 |
+
higher_is_better : true
|
| 25 |
+
- metric: flickr_Bleu_2
|
| 26 |
+
aggregation : !function utils.flickr_bleu2
|
| 27 |
+
higher_is_better : true
|
| 28 |
+
- metric: flickr_Bleu_1
|
| 29 |
+
aggregation : !function utils.flickr_bleu1
|
| 30 |
+
higher_is_better : true
|
| 31 |
+
- metric: flickr_METEOR
|
| 32 |
+
aggregation : !function utils.flickr_meteor
|
| 33 |
+
higher_is_better : true
|
| 34 |
+
- metric: flickr_ROUGE_L
|
| 35 |
+
aggregation : !function utils.flickr_rougel
|
| 36 |
+
higher_is_better : true
|
| 37 |
+
- metric: flickr_CIDEr
|
| 38 |
+
aggregation : !function utils.flickr_cider
|
| 39 |
+
higher_is_better : true
|
| 40 |
+
#- metric: flickr_SPICE
|
| 41 |
+
# aggregation : !function utils.flickr_spice
|
| 42 |
+
# higher_is_better : true
|
| 43 |
+
metadata:
|
| 44 |
+
- version: 0.0
|
EAGLE/lmms_eval/tasks/flickr30k/utils.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
|
| 4 |
+
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
|
| 5 |
+
from pycocotools.coco import COCO
|
| 6 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 7 |
+
import datetime
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
# Shared logger for the lmms-eval harness.
eval_logger = logging.getLogger("lmms-eval")

# Directory containing this task's files.
dir_name = os.path.dirname(os.path.abspath(__file__))

# COCO-caption metrics computed for flickr30k; SPICE is intentionally left out here.
FLICKR_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def flickr_doc_to_visual(doc):
    """Return the sample's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def flickr_doc_to_text(doc):
    """Return the fixed captioning prompt (the document content is unused)."""
    # question = "Please carefully observe the image and come up with a caption for the image"
    return "Provide a one-sentence caption for the provided image."
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def flickr_process_result(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary with key: metric name, value: metric value
    """
    prediction = result[0] if len(result) > 0 else ""
    img_id = int(doc["img_id"])

    # All metrics share one record; each aggregation function picks out its own score.
    record = {"answer": doc["caption"], "pred": prediction, "image_id": img_id}
    return {f"flickr_{name}": record for name in FLICKR_METRICS}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def flickr_aggregation_result(results, metric, args):
    """Score all collected predictions with one COCO-caption metric.

    Builds an in-memory COCO-style dataset from the (pred, answers) records,
    tokenizes with PTBTokenizer, runs the pycocoevalcap scorer named by
    `metric`, writes the raw predictions to a submission file, and returns the
    scalar score.
    """
    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # In order to make the coco eval tools to successfully create index
    # We need at least two dict in the dataset
    # 'annotation' and 'images'
    # 'annotation' exactly reproduce the original annotation
    # 'images' however only need the image id which is contained in the file name
    dataset = {"annotations": [], "images": []}
    idx = 0
    for result in results:
        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
        for a in result["answer"]:
            dataset["annotations"].append({"image_id": int(result["image_id"]), "caption": a, "id": idx})
            idx += 1
        dataset["images"].append({"id": int(result["image_id"])})

    coco = COCO()
    # Manually create index here
    coco.dataset = dataset
    coco.createIndex()

    flickr_result = coco.loadRes(stored_results)
    flickr_eval = COCOEvalCap(coco, flickr_result)

    imgIds = flickr_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = flickr_eval.coco.imgToAnns[imgId]
        res[imgId] = flickr_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    # When metric is one of the Bleu, score will be a list
    if type(score) == list:
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    path = generate_submission_file(f"flickr30k_captions_val2014_alg_results_{metric}.json", args)

    eval_logger.info("Storing prediction that can be submitted to the server ...")
    with open(path, "w") as f:
        json.dump(stored_results, f, indent=4)

    return score
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def flickr_bleu4(results, args):
    """Aggregate BLEU-4 over all flickr30k predictions."""
    return flickr_aggregation_result(results, "Bleu_4", args)


def flickr_bleu3(results, args):
    """Aggregate BLEU-3 over all flickr30k predictions."""
    return flickr_aggregation_result(results, "Bleu_3", args)


def flickr_bleu2(results, args):
    """Aggregate BLEU-2 over all flickr30k predictions."""
    return flickr_aggregation_result(results, "Bleu_2", args)


def flickr_bleu1(results, args):
    """Aggregate BLEU-1 over all flickr30k predictions."""
    return flickr_aggregation_result(results, "Bleu_1", args)


def flickr_meteor(results, args):
    """Aggregate METEOR over all flickr30k predictions."""
    return flickr_aggregation_result(results, "METEOR", args)


def flickr_rougel(results, args):
    """Aggregate ROUGE-L over all flickr30k predictions."""
    return flickr_aggregation_result(results, "ROUGE_L", args)


def flickr_cider(results, args):
    """Aggregate CIDEr over all flickr30k predictions."""
    return flickr_aggregation_result(results, "CIDEr", args)


def flickr_spice(results, args):
    """Aggregate SPICE over all flickr30k predictions (not wired into the yaml)."""
    return flickr_aggregation_result(results, "SPICE", args)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def flickr_test_process_result(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary with key: metric name (in this case flickr_passthrough), value: metric value
    """
    # The question id in our dataset is the image file itself
    return {"flickr_passthrough": {"pred": result, "image_id": doc["img_id"]}}
|
EAGLE/lmms_eval/tasks/hallusion_bench/evaluate_hb.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
|
| 6 |
+
from lmms_eval.tasks.hallusion_bench.utils import evaluate_by_chatgpt, check_same_by_chatgpt, assign_correctness, get_eval_all, get_eval_fig, get_eval_pair_all
|
| 7 |
+
|
| 8 |
+
# Directory containing this task's files.
cur_dir = os.path.dirname(os.path.abspath(__file__))
# Key under which the model's raw prediction is stored on each sample.
output_entry = "model_prediction"
# Key under which the GPT judge's correctness verdict is stored.
correctness_entry = "gpt4v_output_gpt_check"

# Reported metrics: per-answer (aAcc), per-figure (fAcc) and per-question (qAcc) accuracy.
metric = ["aAcc", "fAcc", "qAcc"]

eval_logger = logging.getLogger("lmms-eval")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def hb_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Wrap the dataset question with model-specific pre/post prompt strings."""
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    prefix = model_specific_prompt_kwargs.get("pre_prompt", "")
    suffix = model_specific_prompt_kwargs.get("post_prompt", "")
    question = doc["question"]
    return f"{prefix}{question}{suffix}"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def hb_doc_to_visual(doc):
    """Return the sample's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def hb_process_results(doc, result):
    """Attach the prediction to the sample and fan it out to every metric key."""
    record = doc
    # doc.pop("image")
    record["model_prediction"] = result[0]
    # NOTE: the same record object is shared by all metric keys.
    return {name: record for name in metric}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def hb_aggregation_result(results, metric, args):
    """Run the GPT-judge evaluation over all samples and compute one metric.

    Splits samples into the "VD" and "VS" categories, has ChatGPT grade each
    prediction (judge outputs cached as JSON under <output_path>/gpt_response),
    then computes aAcc (per answer), fAcc (per figure) or qAcc (per question).
    """
    data_vd = []
    data_vs = []
    for data in tqdm(results, desc="Split vd and vs"):
        if data["category"] == "VD":
            data_vd.append(data)
        if data["category"] == "VS":
            data_vs.append(data)
    eval_logger.info("Do gpt eval vd ...")
    path = os.path.join(args.output_path, "gpt_response")
    os.makedirs(path, exist_ok=True)
    save_json_path_vd = f"{path}/hallusion_output_vd_model.json"
    save_json_path_vs = f"{path}/hallusion_output_vs_model.json"
    data_vd = evaluate_by_chatgpt(data_vd, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, save_json_path=save_json_path_vd)
    # data_vd = check_same_by_chatgpt(data_vd, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vd)
    data_vd = assign_correctness(data_vd, correctness_entry=correctness_entry)
    eval_logger.info("Do gpt eval vs")
    data_vs = evaluate_by_chatgpt(data_vs, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, save_json_path=save_json_path_vs)
    # data_vs = check_same_by_chatgpt(data_vs, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vs)
    data_vs = assign_correctness(data_vs, correctness_entry=correctness_entry)
    results = data_vs + data_vd

    if metric == "aAcc":
        all_data = get_eval_all(results, model_correctness_entry=correctness_entry)
        return round(100 * all_data["correct"] / all_data["total"], 4)
    elif metric == "fAcc":
        fig_all = get_eval_fig(results)
        return round(100 * fig_all["correct"] / fig_all["total"], 4)
    elif metric == "qAcc":
        all_data = get_eval_pair_all(results, model_correctness_entry=correctness_entry)
        return round(100 * all_data["correct"] / all_data["total"], 4)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def hb_aggregation_result_qAcc(results, args):
    """Question-pair accuracy (qAcc) graded by GPT."""
    metric = "qAcc"
    return hb_aggregation_result(results, metric, args)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def hb_aggregation_result_fAcc(results, args):
    """Per-figure accuracy (fAcc) graded by GPT."""
    metric = "fAcc"
    return hb_aggregation_result(results, metric, args)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def hb_aggregation_result_aAcc(results, args):
    """Per-question accuracy (aAcc) graded by GPT."""
    metric = "aAcc"
    return hb_aggregation_result(results, metric, args)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _hb_grouped_min_accuracy(results, group_field):
    """Group results by (category, subcategory, set_id, <group_field>) and
    return the fraction of groups whose every answer matches the ground truth."""
    groups = {}
    for r in results:
        key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r[group_field])])
        groups.setdefault(key, []).append(r["answer"] == r["gt_answer"])
    return sum(min(v) for v in groups.values()) / len(groups)


def hb_aggregation_result_intern(results, metric):
    """Compute a HallusionBench metric with a local string-match grader
    (no GPT call): a prediction counts as "yes" iff it contains "yes".

    Side effect: writes the normalized answer back to each result as
    result["answer"] ("1" for yes, "0" otherwise).

    Metrics:
      - "aAcc": per-question accuracy
      - "qAcc": per question pair (all variants sharing question_id correct)
      - "fAcc": per figure (all questions sharing figure_id correct)
    Returns None for an unrecognized metric.
    """
    scores = []
    for result in results:
        ans = "1" if "yes" in result["model_prediction"].lower() else "0"
        scores.append(ans == result["gt_answer"])
        result["answer"] = ans

    if metric == "aAcc":
        return sum(scores) / len(scores)
    elif metric == "qAcc":
        # Fix: grouping previously used a bare `except:` (which also swallows
        # KeyboardInterrupt/SystemExit) and duplicated the fAcc code verbatim.
        return _hb_grouped_min_accuracy(results, "question_id")
    elif metric == "fAcc":
        return _hb_grouped_min_accuracy(results, "figure_id")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def hb_aggregation_result_qAcc_intern(results):
    """Question-pair accuracy using the local string-match grader."""
    eval_logger.info("Calculating qAcc ...")
    metric = "qAcc"
    return hb_aggregation_result_intern(results, metric)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def hb_aggregation_result_fAcc_intern(results):
    """Per-figure accuracy using the local string-match grader."""
    eval_logger.info("Calculating fAcc ...")
    metric = "fAcc"
    return hb_aggregation_result_intern(results, metric)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def hb_aggregation_result_aAcc_intern(results):
    """Per-question accuracy using the local string-match grader."""
    eval_logger.info("Calculating aAcc ...")
    metric = "aAcc"
    return hb_aggregation_result_intern(results, metric)
|
EAGLE/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/HallusionBench
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task: "hallusion_bench_image"
|
| 5 |
+
test_split: image
|
| 6 |
+
output_type: generate_until
|
| 7 |
+
doc_to_visual: !function evaluate_hb.hb_doc_to_visual
|
| 8 |
+
doc_to_text: !function evaluate_hb.hb_doc_to_text
|
| 9 |
+
doc_to_target: "gt_answer_details"
|
| 10 |
+
process_results: !function evaluate_hb.hb_process_results
|
| 11 |
+
model_specific_prompt_kwargs:
|
| 12 |
+
default:
|
| 13 |
+
pre_prompt: ""
|
| 14 |
+
post_prompt: ""
|
| 15 |
+
generation_kwargs:
|
| 16 |
+
max_new_tokens: 128
|
| 17 |
+
temperature: 0
|
| 18 |
+
top_p: 0
|
| 19 |
+
num_beams: 1
|
| 20 |
+
do_sample: false
|
| 21 |
+
metric_list:
|
| 22 |
+
- metric: aAcc
|
| 23 |
+
aggregation: !function evaluate_hb.hb_aggregation_result_aAcc
|
| 24 |
+
higher_is_better: true
|
| 25 |
+
- metric: qAcc
|
| 26 |
+
aggregation: !function evaluate_hb.hb_aggregation_result_qAcc
|
| 27 |
+
higher_is_better: true
|
| 28 |
+
- metric: fAcc
|
| 29 |
+
aggregation: !function evaluate_hb.hb_aggregation_result_fAcc
|
| 30 |
+
higher_is_better: true
|
| 31 |
+
# - metric: aAcc
|
| 32 |
+
# aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern
|
| 33 |
+
# higher_is_better: true
|
| 34 |
+
# - metric: qAcc
|
| 35 |
+
# aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern
|
| 36 |
+
# higher_is_better: true
|
| 37 |
+
# - metric: fAcc
|
| 38 |
+
# aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern
|
| 39 |
+
# higher_is_better: true
|
| 40 |
+
metadata:
|
| 41 |
+
- version: 0.0
|
EAGLE/lmms_eval/tasks/hallusion_bench/utils.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
import numpy as np
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
import openai
|
| 8 |
+
import threading
|
| 9 |
+
import requests
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
API_TYPE = os.getenv("API_TYPE", "openai")
|
| 13 |
+
|
| 14 |
+
if API_TYPE == "openai":
|
| 15 |
+
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
|
| 16 |
+
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
|
| 17 |
+
headers = {
|
| 18 |
+
"Authorization": f"Bearer {API_KEY}",
|
| 19 |
+
"Content-Type": "application/json",
|
| 20 |
+
}
|
| 21 |
+
elif API_TYPE == "azure":
|
| 22 |
+
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
|
| 23 |
+
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
|
| 24 |
+
headers = {
|
| 25 |
+
"api-key": API_KEY,
|
| 26 |
+
"Content-Type": "application/json",
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def evaluate_by_chatgpt(data, output_entry, correctness_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    """Grade each sample's prediction against its reference answer with GPT.

    For every sample, asks GPT whether sample[output_entry] agrees with
    sample["gt_answer_details"], and stores the grade in
    sample[correctness_entry] as "1" (correct), "0" (incorrect) or "2"
    (unclear).  The full prompt + GPT reply is kept in sample["gpt_answer"].

    When load_json is True and save_json_path exists, previously graded
    samples are loaded and grading resumes at index len(output) — the output
    file is rewritten after every sample so interrupted runs lose nothing.
    Each request is retried up to `retries` times with a 5s backoff.
    """
    if load_json and os.path.exists(save_json_path):
        with open(save_json_path, "r") as f:
            output = json.load(f)
    else:
        output = []
    # Resume: skip the first len(output) samples, which were already graded.
    for sample in tqdm(data[len(output) :], desc="Eval by GPT"):
        prompt = "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. "
        prompt += 'If the prediction answer does not conflict with the reference answer, please generate “correct”. If the prediction answer conflict with the reference answer, please generate “incorrect”. If the prediction answer is unclear about the answer, please generate "unclear". \n\n Question:'
        prompt += sample["question"]
        prompt += "\nReference answer: "
        prompt += sample["gt_answer_details"]
        prompt += "\nPrediction answer:"
        prompt += sample[output_entry]
        prompt += "\nOutput:"

        # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
        for attempt in range(retries):
            try:
                messages = [{"role": "user", "content": prompt}]
                payload = {
                    "messages": messages,
                    "max_tokens": 16,
                }
                # set model when using openai api_key. Azure api_key does not need model since the endpoint fixed the model.
                if API_TYPE == "openai":
                    payload["model"] = gpt_model
                response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
                response.raise_for_status()
                response = response.json()
                break
            except Exception as e:
                eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                    time.sleep(5)
                else:  # If this was the last attempt, log and return empty
                    eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")
        # NOTE(review): if every retry failed, `response` is either unbound
        # (first sample) or stale from the previous iteration; the except
        # below converts the unbound case to an "unclear" grade, but a stale
        # response would be re-graded silently — consider resetting it.
        try:
            output_text = response["choices"][0]["message"]["content"]
        except Exception as e:
            eval_logger.info(f"Get error {str(e)} when extracting response")
            output_text = "unclear"

        # "incorrect" must be tested first: "correct" is a substring of it.
        if "incorrect" in output_text.lower():
            gpt_correctness = "0"

        elif "correct" in output_text.lower():
            gpt_correctness = "1"
        else:
            gpt_correctness = "2"

        sample[correctness_entry] = gpt_correctness
        sample["gpt_answer"] = prompt + output_text

        output.append(sample)

        # Persist after every sample so a crash can resume mid-run.
        with open(save_json_path, "w") as f:
            json.dump(output, f, indent=4)

    return output
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def check_same_by_chatgpt(data, output_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    """Ask GPT whether each response is logically consistent with the
    response given for the figure-less (figure_id == "0") variant of the
    same question, storing "1" (same) or "0" (different) in sample["same"].

    Samples that already carry a "same" key are skipped, so reruns resume.
    The whole list is rewritten to save_json_path after each sample.
    """
    orig_response = {}

    # Index the baseline (no-figure) response for every question pair.
    for r in data:
        if str(r["figure_id"]) == "0":
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
            orig_response[key] = r[output_entry]

    for sample in tqdm(data, desc="Check same by GPT"):
        if "same" not in sample.keys():
            key = "_".join([sample["category"], sample["subcategory"], str(sample["set_id"]), str(sample["question_id"])])
            response2 = orig_response[key]

            prompt = "Imagine you are an intelligent teacher. Thoroughly read the two responses to two different questions. Assess the consistency of the information provided within those two responses. "
            prompt += "You do not know the specific questions, but you can asssess the consistency among the two responses by checking for logical conflicts if both responses are correct. "
            prompt += 'If response1 does not conflict with response2, please generate “same”. Otherwise, generate "different". \n\n response1:'
            prompt += sample[output_entry]
            prompt += "\nresponse2: "
            prompt += response2
            prompt += "\nOutput:"

            # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
            for attempt in range(retries):
                try:
                    messages = [{"role": "user", "content": prompt}]
                    payload = {
                        "messages": messages,
                        "max_tokens": 16,
                    }
                    # Fix: this function used to rebuild Azure-style
                    # {"api-key": ...} headers locally, shadowing the
                    # correctly configured module-level `headers` and
                    # breaking OpenAI authentication.  It also always sent
                    # "model", which Azure endpoints ignore/reject.  Now
                    # consistent with evaluate_by_chatgpt, incl. a timeout.
                    if API_TYPE == "openai":
                        payload["model"] = gpt_model
                    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
                    response.raise_for_status()
                    response = response.json()

                    break
                except Exception as e:
                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                        time.sleep(5)
                    else:  # If this was the last attempt, log and return empty
                        eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")

            try:
                output_text = response["choices"][0]["message"]["content"]
            except Exception as e:
                eval_logger.info(f"Get error {str(e)} when extracting response")
                output_text = "different"

            gpt_same = "0"

            if "same" in output_text.lower():
                gpt_same = "1"

            elif "different" in output_text.lower():
                gpt_same = "0"

            sample["same"] = gpt_same

            # Persist after every sample so a crash can resume mid-run.
            with open(save_json_path, "w") as f:
                json.dump(data, f, indent=4)

    return data
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def assign_correctness(data_arr, correctness_entry):
    """Set r["correct"] (0/1) on every record from its GPT grade, in place.

    Grades are 0 = incorrect, 1 = correct, 2 = unclear.  For VS questions
    without a visual supplement (figure_id == 0) an "unclear" answer also
    counts as correct, since the model cannot know without the figure.
    Returns the same list for chaining.
    """
    for record in data_arr:
        grade = int(record[correctness_entry])
        assert grade in (0, 1, 2)
        no_visual_supplement = record["category"] == "VS" and int(record["figure_id"]) == 0
        if no_visual_supplement:
            record["correct"] = 1 if grade in (1, 2) else 0
        else:
            record["correct"] = 1 if grade == 1 else 0
    return data_arr
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def get_eval_fig(data):  # per figure
    """Per-figure consistency statistics.

    Groups results by (category, subcategory, set_id, figure_id) — skipping
    VS questions with no figure (figure_id == "0") — and counts figures
    where every question is correct ("correct"), none is ("wrong"), or a mix
    ("inconsistent").  "score" is the mean per-figure accuracy.
    """
    eval_fig_dict = dict()

    for r in data:
        if r["category"] == "VS" and str(r["figure_id"]) == "0":  # no figure
            continue
        name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["figure_id"])])
        correct, total = eval_fig_dict.get(name, (0, 0))
        eval_fig_dict[name] = (correct + r["correct"], total + 1)

    eval_fig_stat = {}
    eval_fig_stat["note"] = "all accuracy per image (consistency test)"
    eval_fig_stat["total"] = len(eval_fig_dict.keys())
    eval_fig_stat["correct"] = 0
    eval_fig_stat["wrong"] = 0
    eval_fig_stat["inconsistent"] = 0
    eval_fig_stat["score"] = 0

    for correct, total in eval_fig_dict.values():
        if correct == total:
            eval_fig_stat["correct"] += 1
        elif correct == 0:
            eval_fig_stat["wrong"] += 1
        else:
            eval_fig_stat["inconsistent"] += 1
        eval_fig_stat["score"] += correct / total

    # Fix: previously divided unconditionally, raising ZeroDivisionError
    # when the input contained no figures.
    if eval_fig_stat["total"]:
        eval_fig_stat["score"] = eval_fig_stat["score"] / eval_fig_stat["total"]
    return eval_fig_stat
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def get_eval_all(data, model_correctness_entry):  # per question
    """Per-question statistics.

    Returns a dict with "total"/"correct"/"wrong" question counts plus
    failure-type tallies "LH", "VI" and "Mix" (presumably Language
    Hallucination / Visual Illusion / mixed, per the HallusionBench
    taxonomy — TODO confirm).  Grades in r[model_correctness_entry] are
    "0" incorrect, "1" correct, "2" unclear; r["correct"] is 0/1.
    Asserts that each (category, subcategory, set_id, figure_id,
    question_id) tuple appears at most once.
    """
    eval_all_dict = dict()
    eval_all_stat = {}
    eval_all_stat["LH"] = 0
    eval_all_stat["VI"] = 0
    eval_all_stat["Mix"] = 0

    for r in data:
        name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["figure_id"]), str(r["question_id"])])
        assert name not in eval_all_dict

        eval_all_dict[name] = r["correct"]

        # Classify the failure type from category / figure presence / grade.
        if str(r["category"]) == "VD":  # VD
            if str(r["figure_id"]) == "0":
                if str(r[model_correctness_entry]) == "0" or str(r[model_correctness_entry]) == "2":
                    eval_all_stat["VI"] += 1
            else:
                if str(r[model_correctness_entry]) == "0":
                    eval_all_stat["Mix"] += 1
                elif str(r[model_correctness_entry]) == "2":
                    eval_all_stat["VI"] += 1
        else:  # VS
            if str(r["visual_input"]) == "0":  # no visual
                if str(r[model_correctness_entry]) == "0":
                    eval_all_stat["LH"] += 1
            else:  # original visual or modified visual (visual_input == 1 or 2)
                if str(r[model_correctness_entry]) == "0":
                    eval_all_stat["Mix"] += 1
                elif str(r[model_correctness_entry]) == "2":
                    eval_all_stat["VI"] += 1

    eval_all_stat["note"] = "all accuracy per question"
    eval_all_stat["total"] = len(eval_all_dict.keys())
    eval_all_stat["correct"] = np.count_nonzero(list(eval_all_dict.values()))
    eval_all_stat["wrong"] = eval_all_stat["total"] - eval_all_stat["correct"]

    return eval_all_stat
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def get_eval_pair_all(data, model_correctness_entry):  # per question pair
    """Per-question-pair statistics: a pair counts as correct only when
    every variant sharing (category, subcategory, set_id, question_id) is
    correct.

    Returns a dict with "total" (pairs), "total_q" (questions),
    "correct"/"wrong" pair counts, plus LH/VI/Mix and *_cg fields that are
    currently always 0 — the analysis code that would fill them is
    commented out below.
    """
    # NOTE(review): orig_correctness and the *_counter variables are
    # computed but never used by the live code path; they feed the
    # commented-out analysis block kept below.
    orig_correctness = dict()
    counter = 0
    lh_counter = 0
    vi_counter = 0
    both_counter = 0

    # Remember the grade of the figure-less (figure_id == "0") variant of
    # each question pair.
    for r in data:
        if str(r["figure_id"]) == "0":
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
            orig_correctness[key] = r[model_correctness_entry]

    get_eval_pair_dict = dict()

    # Accumulate (num_correct, num_questions) per question pair.
    for r in data:
        name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
        if name in get_eval_pair_dict:
            c, t = get_eval_pair_dict[name]
            get_eval_pair_dict[name] = (c + r["correct"], t + 1)
        else:
            get_eval_pair_dict[name] = (r["correct"], 1)
        counter += 1

    eval_all_pair_stat = {}
    eval_all_pair_stat["note"] = "all accuracy per question pair"
    eval_all_pair_stat["total"] = len(get_eval_pair_dict.keys())
    eval_all_pair_stat["total_q"] = counter
    eval_all_pair_stat["correct"] = 0
    eval_all_pair_stat["wrong"] = 0
    eval_all_pair_stat["LH"] = 0
    eval_all_pair_stat["VI"] = 0
    eval_all_pair_stat["Mix"] = 0

    eval_all_pair_stat["LH_cg"] = lh_counter
    eval_all_pair_stat["VI_cg"] = vi_counter
    eval_all_pair_stat["Mix_cg"] = both_counter

    # for v in get_eval_pair_dict.values():
    #     if v[0] == v[1]:
    #         eval_all_pair_stat["correct"] += 1
    #     else:
    #         eval_all_pair_stat["wrong"] += 1

    # for v in get_analysis_pair_dict.values():
    #     if v[0] > 0 and v[1] > 0:
    #         eval_all_pair_stat["Mix"] += 1
    #     elif v[0] > 0:
    #         eval_all_pair_stat["LH"] += 1
    #     elif v[1] > 0:
    #         eval_all_pair_stat["VI"] += 1

    # A pair is correct only when all of its questions are correct.
    for k in get_eval_pair_dict.keys():
        v = get_eval_pair_dict[k]
        if v[0] == v[1]:
            eval_all_pair_stat["correct"] += 1
        else:
            eval_all_pair_stat["wrong"] += 1

    return eval_all_pair_stat
|
EAGLE/lmms_eval/tasks/iconqa/utils.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def options_to_str(options_prompt):
    """Format a list of choices as lettered lines, e.g. "A. cat\\nB. dog"."""
    lettered = (f"{chr(ord('A') + idx)}. {text}" for idx, text in enumerate(options_prompt))
    return "\n".join(lettered).rstrip("\n")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def doc_to_visual(doc):
    """Gather the query image plus any of the five candidate choice images,
    each converted to RGB, in document order."""
    image_list = []
    if "query_image" in doc:
        image_list.append(doc["query_image"].convert("RGB"))
    for idx in range(5):
        key = f"choice_image_{idx}"
        if key in doc and doc[key] is not None:
            image_list.append(doc[key].convert("RGB"))
    assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA"
    return image_list
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def doc_to_text(doc, model_specific_prompt_kwargs):
    """Build the ICON-QA prompt for one document.

    Question types:
      - "choose_img":  multiple choice between the two candidate images
      - "choose_txt":  multiple choice over the comma-separated doc["choices"]
      - "fill_in_blank": free-form answer

    Raises:
        ValueError: for an unrecognized ques_type.  (Previously such a
        document crashed with an opaque NameError on an unbound variable.)
    """
    question = doc["question"]
    ques_type = doc["ques_type"]
    prefix = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}"

    if ques_type == "choose_img":
        options_str = options_to_str(["The first image.", "The second image."])
        return f"{prefix}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"
    if ques_type == "choose_txt":
        options_str = options_to_str(doc["choices"].split(","))
        return f"{prefix}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"
    if ques_type == "fill_in_blank":
        return f"{prefix}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}"

    raise ValueError(f"Unknown ques_type: {ques_type!r}")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_process_results(doc, results):
|
| 54 |
+
pred = results[0]
|
| 55 |
+
questionId = doc["question_id"]
|
| 56 |
+
answer = doc["answer"]
|
| 57 |
+
return {"anls": {"questionId": int(questionId), "answer": answer, "pred_answer": pred}}
|
EAGLE/lmms_eval/tasks/mme/mme.yaml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/MME
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task: "mme"
|
| 5 |
+
test_split: test
|
| 6 |
+
output_type: generate_until
|
| 7 |
+
doc_to_visual: !function utils.mme_doc_to_visual
|
| 8 |
+
doc_to_text: !function utils.mme_doc_to_text
|
| 9 |
+
doc_to_target: "answer"
|
| 10 |
+
generation_kwargs:
|
| 11 |
+
max_new_tokens: 16
|
| 12 |
+
temperature: 0
|
| 13 |
+
top_p: 0
|
| 14 |
+
num_beams: 1
|
| 15 |
+
do_sample: false
|
| 16 |
+
# The return value of process_results will be used by metrics
|
| 17 |
+
process_results: !function utils.mme_process_results
|
| 18 |
+
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
|
| 19 |
+
metric_list:
|
| 20 |
+
- metric: mme_percetion_score
|
| 21 |
+
aggregation: !function utils.mme_aggregate_results
|
| 22 |
+
higher_is_better: true
|
| 23 |
+
- metric: mme_cognition_score
|
| 24 |
+
aggregation: !function utils.mme_aggregate_results
|
| 25 |
+
higher_is_better: true
|
| 26 |
+
model_specific_prompt_kwargs:
|
| 27 |
+
default:
|
| 28 |
+
pre_prompt: ""
|
| 29 |
+
post_prompt: "\nAnswer the question using a single word or phrase."
|
| 30 |
+
qwen_vl:
|
| 31 |
+
pre_prompt: ""
|
| 32 |
+
post_prompt: " Answer:"
|
| 33 |
+
otterhd:
|
| 34 |
+
pre_prompt: ""
|
| 35 |
+
post_prompt: " Answer:"
|
| 36 |
+
metadata:
|
| 37 |
+
- version: 0.0
|
EAGLE/lmms_eval/tasks/mme/utils.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import defaultdict
|
| 2 |
+
import os
|
| 3 |
+
import datetime
|
| 4 |
+
import json
|
| 5 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 10 |
+
|
| 11 |
+
dir_name = os.path.dirname(os.path.abspath(__file__))
|
| 12 |
+
|
| 13 |
+
eval_type_dict = {
|
| 14 |
+
"Perception": [
|
| 15 |
+
"existence",
|
| 16 |
+
"count",
|
| 17 |
+
"position",
|
| 18 |
+
"color",
|
| 19 |
+
"posters",
|
| 20 |
+
"celebrity",
|
| 21 |
+
"scene",
|
| 22 |
+
"landmark",
|
| 23 |
+
"artwork",
|
| 24 |
+
"OCR",
|
| 25 |
+
],
|
| 26 |
+
"Cognition": [
|
| 27 |
+
"commonsense_reasoning",
|
| 28 |
+
"numerical_calculation",
|
| 29 |
+
"text_translation",
|
| 30 |
+
"code_reasoning",
|
| 31 |
+
],
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Suffix the MME dataset appends to every question; stripped when a model
# supplies its own pre/post prompt so the instruction is not duplicated.
replace_prompt = " Please answer yes or no."


def mme_doc_to_visual(doc):
    """Return the document's image as a one-element list of RGB images."""
    return [doc["image"].convert("RGB")]


def mme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the MME prompt, applying optional model-specific pre/post prompts.

    Fix: the declared default is None, but the original body indexed into it
    unconditionally and raised TypeError when called without kwargs; None now
    behaves like an empty mapping.
    """
    question = doc["question"].strip()
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    if "pre_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["pre_prompt"] != "":
        question = question.replace(replace_prompt, "")
        question = f"{model_specific_prompt_kwargs['pre_prompt']}{question}"
    if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"] != "":
        question = question.replace(replace_prompt, "")
        question = f"{question}{model_specific_prompt_kwargs['post_prompt']}"
    return question
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def parse_pred_ans(pred_ans):
    """Normalize a free-form prediction to "yes", "no", or "other".

    Brought from Otter Eval.  Exact matches win; otherwise only the first
    four characters are searched ("yes" before "no").
    """
    normalized = pred_ans.lower().strip().replace(".", "")
    if normalized in ("yes", "no"):
        return normalized
    prefix = normalized[:4]
    if "yes" in prefix:
        return "yes"
    if "no" in prefix:
        return "no"
    return "other"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def mme_process_results(doc, results):
    """Score one MME document.

    Args:
        doc: one row of the MME dataset.
        results: [prediction string].
    Returns:
        {"mme_percetion_score" | "mme_cognition_score":
            {"question_id", "category", "score"}}.
    The key name routes the result to the matching aggregation function, and
    the question id/category let the aggregator re-pair the two questions
    per image.  ("percetion" is a historical typo kept for compatibility.)
    """
    prediction = parse_pred_ans(results[0])
    gold = doc["answer"].lower().strip().replace(".", "")
    assert gold in ["yes", "no"]
    assert prediction in ["yes", "no", "other"]
    score = 1.0 if prediction == gold else 0.0
    category = doc["category"]
    if category in eval_type_dict["Perception"]:
        key_name = "mme_percetion_score"
    else:
        key_name = "mme_cognition_score"
    return {key_name: {"question_id": doc["question_id"], "category": category, "score": score}}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def mme_aggregate_results(results):
    """Aggregate MME per-question scores into one number.

    Each question_id within a category must appear exactly twice (MME asks
    two questions per image).  A category's score is the mean over pairs of
    acc (+100 bonus when both answers are right), and the final value is the
    sum of the per-category scores.
    """
    category2score = defaultdict(dict)
    for entry in results:
        per_question = category2score[entry["category"]]
        per_question.setdefault(entry["question_id"], []).append(entry["score"])

    category2avg_score = {}
    for category, question2scores in category2score.items():
        running_total = 0
        for scores in question2scores.values():
            assert len(scores) == 2
            acc = sum(scores) / len(scores) * 100.0
            acc_plus = (sum(scores) == 2) * 100.0
            running_total += acc + acc_plus
        category2avg_score[category] = running_total / len(question2scores)

    for category, avg_score in category2avg_score.items():
        eval_logger.info(f"{category}: {avg_score:.2f}")
    return sum(category2avg_score.values())
|
EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: multidocvqa
|
| 2 |
+
task:
|
| 3 |
+
- multidocvqa_val
|
| 4 |
+
- multidocvqa_test
|
EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/MP-DocVQA
|
| 2 |
+
task: "multidocvqa_test"
|
| 3 |
+
test_split: test
|
| 4 |
+
output_type: generate_until
|
| 5 |
+
doc_to_visual: !function utils.multidocvqa_doc_to_visual
|
| 6 |
+
doc_to_text: !function utils.multidocvqa_doc_to_text
|
| 7 |
+
doc_to_target: "answers"
|
| 8 |
+
generation_kwargs:
|
| 9 |
+
max_new_tokens: 32
|
| 10 |
+
temperature: 0
|
| 11 |
+
do_sample: False
|
| 12 |
+
process_results: !function utils.multidocvqa_process_test_results_for_submission
|
| 13 |
+
metric_list:
|
| 14 |
+
- metric: submission
|
| 15 |
+
aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission
|
| 16 |
+
model_specific_prompt_kwargs:
|
| 17 |
+
default:
|
| 18 |
+
pre_prompt: ""
|
| 19 |
+
post_prompt: "\nAnswer the question using a single word or phrase."
|
| 20 |
+
|
EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/MP-DocVQA
|
| 2 |
+
task: "multidocvqa_val"
|
| 3 |
+
test_split: val
|
| 4 |
+
output_type: generate_until
|
| 5 |
+
doc_to_visual: !function utils.multidocvqa_doc_to_visual
|
| 6 |
+
doc_to_text: !function utils.multidocvqa_doc_to_text
|
| 7 |
+
doc_to_target: "answers"
|
| 8 |
+
generation_kwargs:
|
| 9 |
+
max_new_tokens: 32
|
| 10 |
+
temperature: 0
|
| 11 |
+
do_sample: False
|
| 12 |
+
process_results: !function utils.multidocvqa_process_results
|
| 13 |
+
metric_list:
|
| 14 |
+
- metric: anls
|
| 15 |
+
aggregation: !function utils.multidocvqa_aggregate_results_anls
|
| 16 |
+
higher_is_better: true
|
| 17 |
+
- metric: accuracy
|
| 18 |
+
aggregation: !function utils.multidocvqa_aggregate_results_accuracy
|
| 19 |
+
higher_is_better: true
|
| 20 |
+
model_specific_prompt_kwargs:
|
| 21 |
+
default:
|
| 22 |
+
pre_prompt: ""
|
| 23 |
+
post_prompt: "\nAnswer the question using a single word or phrase."
|
EAGLE/lmms_eval/tasks/multidocvqa/utils.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import ast
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
from lmms_eval.api.metrics import levenshtein_distance
|
| 7 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 8 |
+
|
| 9 |
+
lmms_logger = logging.getLogger("lmms-eval")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def multidocvqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the full prompt for one MP-DocVQA sample.

    Args:
        doc: dataset row; must contain a "question" field.
        model_specific_prompt_kwargs: optional dict with "pre_prompt" and
            "post_prompt" strings wrapped around the question. Missing keys
            (or a None dict) default to "" — consistent with the ok_vqa
            helper in this repository, and avoids a KeyError/TypeError.

    Returns:
        The concatenated prompt string.
    """
    question = doc["question"]
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "")
    post_prompt = model_specific_prompt_kwargs.get("post_prompt", "")
    return f"{pre_prompt}{question}{post_prompt}"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def multidocvqa_doc_to_visual(doc):
    """Collect the sample's non-empty page images (slots 1..20) as RGB."""
    pages = (doc[f"image_{idx}"] for idx in range(1, 21))
    return [page.convert("RGB") for page in pages if page is not None]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def multidocvqa_process_results(doc, results):
    """Package one prediction with its gold answers for both metrics.

    The "answers" field is stored as a stringified Python list, hence the
    literal_eval. The same record is handed to both the ANLS and the
    accuracy aggregators.
    """
    gold_answers = ast.literal_eval(doc["answers"])
    record = {
        "questionId": int(doc["questionId"]),
        "answer": gold_answers,
        "pred_answer": results[0],
    }
    return {"anls": dict(record), "accuracy": dict(record)}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def multidocvqa_aggregate_results_anls(results):
    """Aggregate per-sample records into a single mean ANLS score.

    Args:
        results: list of dicts carrying "answer" (list of gold strings)
            and "pred_answer" (model output string).

    Returns:
        Mean ANLS over all samples, or 0.0 for an empty result list
        (previously this raised ZeroDivisionError).
    """
    if not results:
        return 0.0
    # Extract the two columns directly instead of rebuilding a dict keyed
    # by the union of all record keys; .get keeps the old None behaviour
    # for malformed records.
    answers = [record.get("answer") for record in results]
    preds = [record.get("pred_answer") for record in results]
    evaluator = Evaluator(case_sensitive=False)
    metric = evaluator.get_metrics(answers, preds)
    return sum(metric["anls"]) / len(metric["anls"])
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def multidocvqa_aggregate_results_accuracy(results):
    """Aggregate per-sample records into a single mean exact-match accuracy.

    Args:
        results: list of dicts carrying "answer" (list of gold strings)
            and "pred_answer" (model output string).

    Returns:
        Mean accuracy over all samples, or 0.0 for an empty result list
        (previously this raised ZeroDivisionError).
    """
    if not results:
        return 0.0
    # Extract the two columns directly; mirrors the ANLS aggregator.
    answers = [record.get("answer") for record in results]
    preds = [record.get("pred_answer") for record in results]
    evaluator = Evaluator(case_sensitive=False)
    metric = evaluator.get_metrics(answers, preds)
    return sum(metric["accuracy"]) / len(metric["accuracy"])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def multidocvqa_process_test_results_for_submission(doc, results):
    """Format a single test-split prediction for the MP-DocVQA submission file."""
    submission_entry = {
        "questionId": int(doc["questionId"]),
        "answer": results[0],
        # answer_page is not predicted by this pipeline.
        "answer_page": None,
    }
    return {"submission": submission_entry}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def multidocvqa_test_aggregate_results_for_submission(results, args):
    """Write all test-split predictions to the JSON submission file."""
    output_path = generate_submission_file("multidocvqa_test_for_submission.json", args)
    with open(output_path, "w") as fout:
        json.dump(results, fout)
    lmms_logger.info(f"Results saved to {output_path}.")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
##################
|
| 62 |
+
# Helper functions
|
| 63 |
+
##################
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class Evaluator:
    """Scores DocVQA-style answers with exact-match accuracy and ANLS."""

    def __init__(self, case_sensitive=False):
        # When False, both gold and predicted answers are lowercased first.
        self.case_sensitive = case_sensitive
        self.get_edit_distance = levenshtein_distance
        # Similarities below this threshold contribute 0 to ANLS.
        self.anls_threshold = 0.5

    def get_metrics(self, gt_answers, preds):
        """Score every prediction against its list of gold answers.

        Returns:
            {"accuracy": [...], "anls": [...]} — one entry per prediction.
        """
        accuracies = []
        anls_scores = []
        for idx in range(len(preds)):
            gold = [self._preprocess_str(g) for g in gt_answers[idx]]
            pred = self._preprocess_str(preds[idx])
            accuracies.append(self._calculate_accuracy(gold, pred))
            anls_scores.append(self._calculate_anls(gold, pred))
        return {"accuracy": accuracies, "anls": anls_scores}

    def _preprocess_str(self, string):
        # Normalize case (optionally) and strip surrounding whitespace.
        if not self.case_sensitive:
            string = string.lower()
        return string.strip()

    def _calculate_accuracy(self, gt, pred):
        # A literal "none" prediction never counts as correct.
        if pred == "none":
            return 0
        return 1 if pred in gt else 0

    def _calculate_anls(self, gt, pred):
        # Empty or "none" predictions score zero.
        if len(pred) == 0:
            return 0
        if pred == "none":
            return 0
        similarities = [
            1 - self.get_edit_distance(gold, pred) / max(len(gold), len(pred))
            for gold in gt
        ]
        best = max(similarities)
        # Below-threshold matches are clamped to 0 (standard ANLS rule).
        return best if best >= self.anls_threshold else 0
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
    # Ad-hoc smoke test: run the ANLS aggregation on two toy samples.
    print("-----------------")
    multidocvqa_aggregate_results_anls([{"questionId": 1, "answer": ["answer"], "pred_answer": "pred_answer"}, {"questionId": 2, "answer": ["nswer"], "pred_answer": "nswer"}])
|
EAGLE/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_specific_prompt_kwargs:
|
| 2 |
+
default:
|
| 3 |
+
prompt: "Provide a one-sentence caption for the provided image."
|
EAGLE/lmms_eval/tasks/nocaps/nocaps.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group : nocaps
|
| 2 |
+
task:
|
| 3 |
+
- nocaps_test
|
| 4 |
+
- nocaps_val
|
EAGLE/lmms_eval/tasks/nocaps/nocaps_test.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/NoCaps
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task : "nocaps_test"
|
| 5 |
+
group : "nocaps_caption"
|
| 6 |
+
test_split: test
|
| 7 |
+
output_type: generate_until
|
| 8 |
+
doc_to_visual: !function utils.nocaps_doc_to_visual
|
| 9 |
+
doc_to_text: !function utils.nocaps_doc_to_text
|
| 10 |
+
doc_to_target: "annotations_captions"
|
| 11 |
+
generation_kwargs:
|
| 12 |
+
max_new_tokens: 64
|
| 13 |
+
temperature: 0
|
| 14 |
+
top_p: 0
|
| 15 |
+
num_beams: 1
|
| 16 |
+
do_sample: false
|
| 17 |
+
process_results: !function utils.nocaps_test_process_result
|
| 18 |
+
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
|
| 19 |
+
metric_list:
|
| 20 |
+
- metric: nocaps_passthrough
|
| 21 |
+
aggregation : !function utils.nocaps_test_aggregation_result
|
| 22 |
+
higher_is_better : true
|
| 23 |
+
metadata:
|
| 24 |
+
- version: 0.0
|
| 25 |
+
include: _default_template_nocaps_yaml
|
EAGLE/lmms_eval/tasks/nocaps/nocaps_val.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/NoCaps
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task: "nocaps_val"
|
| 5 |
+
group : "nocaps_caption"
|
| 6 |
+
test_split: validation
|
| 7 |
+
output_type: generate_until
|
| 8 |
+
doc_to_visual: !function utils.nocaps_doc_to_visual
|
| 9 |
+
doc_to_text: !function utils.nocaps_doc_to_text
|
| 10 |
+
doc_to_target: "annotations_captions"
|
| 11 |
+
generation_kwargs:
|
| 12 |
+
max_new_tokens: 64
|
| 13 |
+
temperature: 0
|
| 14 |
+
top_p: 0
|
| 15 |
+
num_beams: 1
|
| 16 |
+
do_sample: false
|
| 17 |
+
process_results: !function utils.nocaps_process_result
|
| 18 |
+
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
|
| 19 |
+
metric_list:
|
| 20 |
+
- metric: nocaps_Bleu_4
|
| 21 |
+
aggregation : !function utils.nocaps_bleu4
|
| 22 |
+
higher_is_better : true
|
| 23 |
+
- metric: nocaps_Bleu_3
|
| 24 |
+
aggregation : !function utils.nocaps_bleu3
|
| 25 |
+
higher_is_better : true
|
| 26 |
+
- metric: nocaps_Bleu_2
|
| 27 |
+
aggregation : !function utils.nocaps_bleu2
|
| 28 |
+
higher_is_better : true
|
| 29 |
+
- metric: nocaps_Bleu_1
|
| 30 |
+
aggregation : !function utils.nocaps_bleu1
|
| 31 |
+
higher_is_better : true
|
| 32 |
+
- metric: nocaps_METEOR
|
| 33 |
+
aggregation : !function utils.nocaps_meteor
|
| 34 |
+
higher_is_better : true
|
| 35 |
+
- metric: nocaps_ROUGE_L
|
| 36 |
+
aggregation : !function utils.nocaps_rougel
|
| 37 |
+
higher_is_better : true
|
| 38 |
+
- metric: nocaps_CIDEr
|
| 39 |
+
aggregation : !function utils.nocaps_cider
|
| 40 |
+
higher_is_better : true
|
| 41 |
+
#- metric: nocaps_SPICE
|
| 42 |
+
# aggregation : !function utils.nocaps_spice
|
| 43 |
+
# higher_is_better : true
|
| 44 |
+
metadata:
|
| 45 |
+
- version: 0.0
|
| 46 |
+
include: _default_template_nocaps_yaml
|
EAGLE/lmms_eval/tasks/nocaps/utils.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
|
| 4 |
+
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
|
| 5 |
+
from pycocotools.coco import COCO
|
| 6 |
+
|
| 7 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 12 |
+
|
| 13 |
+
dir_name = os.path.dirname(os.path.abspath(__file__))
|
| 14 |
+
|
| 15 |
+
NOCAPS_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"] # , "SPICE"]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def nocaps_doc_to_visual(doc):
    """Return the sample's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def nocaps_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Return the fixed captioning prompt configured for the model.

    The document itself carries no question text; the prompt comes entirely
    from the task/model configuration.
    """
    return model_specific_prompt_kwargs["prompt"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def nocaps_process_result(doc, result):
    """Pair the predicted caption with its references for every NoCaps metric.

    Args:
        doc: one instance of the eval dataset.
        result: [pred] — single predicted caption.

    Returns:
        Dict mapping each "nocaps_<metric>" key to the shared sample record.
    """
    record = {
        "answer": doc["annotations_captions"],
        "pred": result[0],
        # The question id in our dataset is the image file itself.
        "image_id": doc["image_id"],
    }
    return {f"nocaps_{name}": record for name in NOCAPS_METRICS}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def nocaps_aggregation_result(results, metric, args=None):
    """Score all predictions with one COCO-caption metric and dump them.

    Args:
        results: per-sample dicts with "image_id", "pred" and "answer" keys.
        metric: one of "Bleu_1".."Bleu_4", "METEOR", "ROUGE_L", "CIDEr", "SPICE".
        args: lmms-eval CLI args, forwarded to generate_submission_file.

    Returns:
        The scalar score for the requested metric.
    """
    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # The COCO eval tools need both an 'annotations' and an 'images' section
    # to build their index; reconstruct a minimal dataset from the results.
    dataset = {"annotations": [], "images": []}
    ann_id = 0
    for result in results:
        image_id = int(result["image_id"])
        stored_results.append({"image_id": image_id, "caption": result["pred"]})
        for caption in result["answer"]:
            dataset["annotations"].append({"image_id": image_id, "caption": caption, "id": ann_id})
            ann_id += 1
        # Bug fix: cast to int so the image ids match the annotation entries
        # (previously the raw value was stored, breaking the index join when
        # image_id arrives as a string).
        dataset["images"].append({"id": image_id})

    coco = COCO()
    # Manually attach the dataset and build the index; no file was loaded.
    coco.dataset = dataset
    coco.createIndex()

    nocaps_result = coco.loadRes(stored_results)
    nocaps_eval = COCOEvalCap(coco, nocaps_result)

    img_ids = nocaps_eval.params["image_id"]
    gts = {img_id: nocaps_eval.coco.imgToAnns[img_id] for img_id in img_ids}
    res = {img_id: nocaps_eval.cocoRes.imgToAnns[img_id] for img_id in img_ids}

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")
    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    # Bleu returns a list of scores for n-gram orders 1..4; pick the one asked for.
    if isinstance(score, list):
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    path = generate_submission_file(f"nocaps_val_{metric}_scores.json", args)
    eval_logger.info("Storing prediction that can be submitted to the server ...")
    with open(path, "w") as f:
        json.dump(stored_results, f, indent=4)
    eval_logger.info(f"Your result has been saved to {path}.")

    return score
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# Thin per-metric entry points referenced by the YAML `aggregation` fields;
# each forwards to nocaps_aggregation_result with its metric name.


def nocaps_bleu4(results, args=None):
    return nocaps_aggregation_result(results, "Bleu_4", args)


def nocaps_bleu3(results, args=None):
    return nocaps_aggregation_result(results, "Bleu_3", args)


def nocaps_bleu2(results, args=None):
    return nocaps_aggregation_result(results, "Bleu_2", args)


def nocaps_bleu1(results, args=None):
    return nocaps_aggregation_result(results, "Bleu_1", args)


def nocaps_meteor(results, args=None):
    return nocaps_aggregation_result(results, "METEOR", args)


def nocaps_rougel(results, args=None):
    return nocaps_aggregation_result(results, "ROUGE_L", args)


def nocaps_cider(results, args=None):
    return nocaps_aggregation_result(results, "CIDEr", args)


def nocaps_spice(results, args=None):
    # Currently commented out in the task YAMLs (SPICE is slow to run).
    return nocaps_aggregation_result(results, "SPICE", args)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def nocaps_test_process_result(doc, result):
    """Wrap one test-split prediction for pass-through aggregation.

    Args:
        doc: one instance of the eval dataset.
        result: [pred] — single predicted caption.

    Returns:
        {"nocaps_passthrough": {"pred": ..., "image_id": ...}}
    """
    payload = {"pred": result[0], "image_id": doc["image_id"]}
    return {"nocaps_passthrough": payload}
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def nocaps_test_aggregation_result(results, args=None):
    """Dump test-split captions in the NoCaps server submission format."""
    stored_results = [
        {"image_id": int(entry["image_id"]), "caption": entry["pred"]}
        for entry in results
    ]

    path = generate_submission_file("nocaps_captions_nocaps_test_alg_results.json", args)
    eval_logger.info("Storing prediction that can be submitted to the server ...")
    with open(path, "w") as f:
        json.dump(stored_results, f, indent=4)

    eval_logger.info(f"Your test result has been stored in {path}. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
|
EAGLE/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/OK-VQA
|
| 2 |
+
output_type: generate_until
|
| 3 |
+
doc_to_visual: !function utils.ok_vqa_doc_to_visual
|
| 4 |
+
doc_to_text: !function utils.ok_vqa_doc_to_text
|
| 5 |
+
doc_to_target: "answer"
|
| 6 |
+
generation_kwargs:
|
| 7 |
+
until:
|
| 8 |
+
- "ASSISTANT:"
|
| 9 |
+
metric_list:
|
| 10 |
+
- metric: exact_match
|
| 11 |
+
aggregation: mean
|
| 12 |
+
higher_is_better: true
|
| 13 |
+
ignore_case: true
|
| 14 |
+
ignore_punctuation: true
|
| 15 |
+
- metric: submission
|
| 16 |
+
aggregation: !function utils.ok_vqa_aggreate_submissions
|
| 17 |
+
higher_is_better: true
|
| 18 |
+
process_results: !function utils.ok_vqa_process_results
|
| 19 |
+
model_specific_prompt_kwargs:
|
| 20 |
+
default:
|
| 21 |
+
pre_prompt: ""
|
| 22 |
+
post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
|
| 23 |
+
metadata:
|
| 24 |
+
- version: 0.0
|
EAGLE/lmms_eval/tasks/ok_vqa/_generate_config.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import yaml
|
| 3 |
+
|
| 4 |
+
splits = ["val2014"]
|
| 5 |
+
tasks = ["vqa"]
|
| 6 |
+
|
| 7 |
+
if __name__ == "__main__":
    # Emit one task YAML per (task, split) pair, plus a group YAML that
    # lists every non-train split under the "ok_vqa" group.
    dump_tasks = []
    for task in tasks:
        for split in splits:
            yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
            if split == "train":
                # Train splits are standalone tasks, not part of the group.
                yaml_dict.pop("group")
            else:
                dump_tasks.append(f"ok_vqa_{split}")

            save_path = f"./ok_vqa_{split}.yaml"
            print(f"Saving to {save_path}")
            with open(save_path, "w") as f:
                yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)

    group_dict = {"group": "ok_vqa", "task": dump_tasks}

    with open("./_ok_vqa.yaml", "w") as f:
        yaml.dump(group_dict, f, default_flow_style=False, indent=4)
|
EAGLE/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: ok_vqa
|
| 2 |
+
task:
|
| 3 |
+
- ok_vqa_val2014
|
EAGLE/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: ok_vqa
|
| 2 |
+
task: ok_vqa_val2014
|
| 3 |
+
test_split: val2014
|
| 4 |
+
include: _default_template_vqa_yaml
|
EAGLE/lmms_eval/tasks/ok_vqa/utils.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import yaml
|
| 5 |
+
import pathlib
|
| 6 |
+
import logging
|
| 7 |
+
import datetime
|
| 8 |
+
import statistics
|
| 9 |
+
|
| 10 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 11 |
+
from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
|
| 12 |
+
|
| 13 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def ok_vqa_doc_to_visual(doc):
    """Return the sample's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def ok_vqa_process_results(doc, result):
    """Score one OK-VQA prediction with the VQA consensus-accuracy metric.

    Each ground-truth answer is held out in turn; the prediction's accuracy
    against the remaining answers is min(1, matches / 3), and the reported
    score is the mean over all hold-outs. Gold answers are normalized in
    place with the same EvalAI processor used on the prediction.
    """
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
    accuracy = 0

    if "answers" in doc and doc["answers"] is not None:
        # Normalize every gold answer in place first, so each hold-out
        # comparison sees the processed form.
        for i, raw in enumerate(doc["answers"]):
            doc["answers"][i] = eval_ai_processor(raw)

        per_holdout = []
        for i in range(len(doc["answers"])):
            others = [doc["answers"][j] for j in range(len(doc["answers"])) if j != i]
            matches = [ans for ans in others if ans == resAns]
            per_holdout.append(min(1, float(len(matches)) / 3))

        accuracy = statistics.mean(per_holdout) if per_holdout else 0

    return {
        "exact_match": accuracy,
        "submission": {
            "image": f"{doc['question_id']}.jpg",
            "answer": resAns,
        },
    }
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def ok_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Wrap the question with optional pre/post prompt strings.

    Missing or None kwargs default both wrappers to the empty string.
    """
    kwargs = model_specific_prompt_kwargs if model_specific_prompt_kwargs is not None else {}
    prefix = kwargs.get("pre_prompt", "")
    suffix = kwargs.get("post_prompt", "")
    return f"{prefix}{doc['question']}{suffix}"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def ok_vqa_aggreate_submissions(results, args):
    """Write all predictions to a timestamped OK-VQA submission JSON.

    NOTE(review): the public name contains a typo ("aggreate") but is
    referenced from the task YAML, so it cannot be renamed here.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    path = generate_submission_file(f"ok_vqa-test-submission-{timestamp}.json", args)
    with open(path, "w") as f:
        json.dump(results, f)
    print(f"Submission file saved to {path}")
|
EAGLE/lmms_eval/tasks/olympiadbench/cn_utils.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import datetime
|
| 4 |
+
from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
|
| 5 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 9 |
+
dir_name = os.path.dirname(os.path.abspath(__file__))
|
| 10 |
+
|
| 11 |
+
olympiadbench_evaluator = OlympiadBenchEvaluator()
|
| 12 |
+
|
| 13 |
+
def olympiadbench_doc_to_visual(doc):
    """Convert every attached problem image to RGB."""
    converted = []
    for img in doc["images"]:
        converted.append(img.convert("RGB"))
    return converted
|
| 15 |
+
|
| 16 |
+
def olympiadbench_doc_to_text(doc):
    """Assemble the Chinese-track prompt for one OlympiadBench item.

    The prompt is: subject header, the question, the answer-type hint
    (single vs. multiple answers), then boxed-answer instructions.
    """
    multiple_answers = doc["is_multiple_answer"]
    if multiple_answers is None:
        multiple_answers = False
    answer_type = doc["answer_type"]
    if answer_type == "Need_human_evaluate":
        # Human-judged items are treated as proof based.
        answer_type = "proof based"

    parts = [f"以下是中国{doc['subfield']}竞赛中的解答题。\n", doc["question"], "\n"]
    if multiple_answers:
        parts.append(f"题目有多个答案,答案类型均为{answer_type}。\n")
    else:
        parts.append(f"答案类型为{answer_type}。\n")
    parts.append("请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以")
    if multiple_answers:
        parts.append('"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n')
    else:
        parts.append('"所以最终答案是\\boxed{答案}。"\n')
    return "".join(parts)
|
| 41 |
+
|
| 42 |
+
def olympiadbench_process_results(doc, results):
    """Score one Chinese-track prediction, or pass proofs through for submission."""
    precision = doc["error"]
    if precision is None:
        precision = 0
    prediction = results[0].strip()

    # Theorem-proving items ("TP" in source) are judged by humans; submit raw text.
    if "TP" in doc["source"]:
        return {"submission": prediction}

    # Keep only the text after the final-answer marker, then strip quoting,
    # whitespace, and trailing punctuation before judging.
    prediction = prediction.split("所以最终答案是")[-1]
    prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
    verdict = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
    return {"exact_match": int(verdict)}
|
| 61 |
+
|
| 62 |
+
def olympiadbench_aggregate_results(results, args):
    """Dump Chinese-track proof submissions to a timestamped JSON file."""
    timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    path = generate_submission_file(f"olympiadbench-test-cn-submission-{timestamp}.json", args)
    with open(path, "w") as f:
        # ensure_ascii=False keeps the Chinese text readable in the file.
        json.dump(results, f, ensure_ascii=False)
    print(f"Submission file saved to {path}")
|
| 69 |
+
|
EAGLE/lmms_eval/tasks/olympiadbench/en_utils.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import datetime
|
| 4 |
+
from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
|
| 5 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 9 |
+
dir_name = os.path.dirname(os.path.abspath(__file__))
|
| 10 |
+
|
| 11 |
+
olympiadbench_evaluator = OlympiadBenchEvaluator()
|
| 12 |
+
|
| 13 |
+
def olympiadbench_doc_to_visual(doc):
    """Convert every attached problem image to RGB."""
    converted = []
    for img in doc["images"]:
        converted.append(img.convert("RGB"))
    return converted
|
| 15 |
+
|
| 16 |
+
def olympiadbench_doc_to_text(doc):
    """Assemble the English-track prompt for one OlympiadBench item.

    The prompt is: subject header, the question, the answer-type hint
    (single vs. multiple answers), then boxed-answer instructions.
    """
    multiple_answers = doc["is_multiple_answer"]
    if multiple_answers is None:
        multiple_answers = False
    answer_type = doc["answer_type"]
    if answer_type == "Need_human_evaluate":
        # Human-judged items are treated as proof based.
        answer_type = "proof based"

    parts = [f"The following is a question from an International {doc['subfield']} competition.\n", doc["question"], "\n"]
    if multiple_answers:
        parts.append(f"The question has multiple answers, each of them should be {answer_type}.\n")
    else:
        parts.append(f"The answer of the question should be {answer_type}.\n")
    parts.append("Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with ")
    if multiple_answers:
        parts.append('So the final answer is \\boxed{multiple answers connected with commas}.\n')
    else:
        parts.append('"So the final answer is \\boxed{answer}."\n')
    return "".join(parts)
|
| 41 |
+
|
| 42 |
+
def olympiadbench_process_results(doc, results):
    """Score one English-track prediction, or pass proofs through for submission."""
    precision = doc["error"]
    if precision is None:
        precision = 0
    prediction = results[0].strip()

    # Theorem-proving items ("TP" in source) are judged by humans; submit raw text.
    if "TP" in doc["source"]:
        return {"submission": prediction}

    # Keep only the text after the final-answer marker, then strip quoting,
    # whitespace, and trailing punctuation before judging.
    prediction = prediction.split("final answer is")[-1]
    prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
    verdict = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
    return {"exact_match": int(verdict)}
|
| 61 |
+
|
| 62 |
+
def olympiadbench_aggregate_results(results, args):
    """Dump English-track proof submissions to a timestamped JSON file."""
    timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    path = generate_submission_file(f"olympiadbench-test-en-submission-{timestamp}.json", args)
    with open(path, "w") as f:
        # ensure_ascii=False preserves any non-ASCII math text verbatim.
        json.dump(results, f, ensure_ascii=False)
    print(f"Submission file saved to {path}")
|
| 69 |
+
|
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: olympiadbench
|
| 2 |
+
task:
|
| 3 |
+
- olympiadbench_test_en
|
| 4 |
+
- olympiadbench_test_cn
|
| 5 |
+
metadata:
|
| 6 |
+
- version: 0.0
|
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import sympy as sp
|
| 3 |
+
from sympy import simplify, Eq, sympify, Pow
|
| 4 |
+
from sympy.parsing.latex import parse_latex
|
| 5 |
+
import math
|
| 6 |
+
|
| 7 |
+
# how to use
|
| 8 |
+
# scorer = OlympiadBenchEvaluator()
|
| 9 |
+
# exp1 = "10^{10^{10^{10}}}"
|
| 10 |
+
# exp2 = "10^{10}"
|
| 11 |
+
# precision = 1e-4
|
| 12 |
+
# res = scorer.judge(exp1, exp2, precision)
|
| 13 |
+
|
| 14 |
+
class OlympiadBenchEvaluator:
    """Judge whether a predicted answer matches a ground-truth answer.

    Both answers are LaTeX strings.  Matching falls back through: exact
    string equality, interval equality, plain numeric equality within a
    tolerance, symbolic (sympy) expression equivalence, and equation
    equivalence.

    Usage:
        scorer = OlympiadBenchEvaluator()
        res = scorer.judge("10^{10^{10^{10}}}", "10^{10}", 1e-4)
    """

    def __init__(self):
        # Map of special symbols to their replacements
        self.special_signal_map = {
            "\\left": "",
            "\\right": "",
            "∶": ":",
            ",": ",",
            "$": "",
            "\\approx": "=",
            "\\simeq": "=",
            "\\sim": "=",
            "^\\prime": "'",
            "^{\\prime}": "'",
            "^\\circ": "",
            "%": "",
        }
        self.pi = parse_latex("\\pi")
        self.precision = 1e-8  # Default tolerance for numeric comparison

    def split_by_comma(self, expr: str):
        """Split *expr* on commas that are not nested inside () or []."""
        in_bracket_num = 0  # current bracket nesting depth
        splitted_expr = []
        start_idx = 0
        for i, char in enumerate(expr):
            if char in ["(", "["]:
                in_bracket_num += 1
            elif char in [")", "]"]:
                in_bracket_num -= 1
            elif char == "," and in_bracket_num == 0:
                splitted_expr.append(expr[start_idx:i].strip())
                start_idx = i + 1

        if start_idx < len(expr):
            splitted_expr.append(expr[start_idx:].strip())

        return splitted_expr

    def trans_plus_minus_sign(self, expr_list: list):
        """Expand every expression containing "\\pm" into its + and - variants."""
        new_expr_list = []
        for expr in expr_list:
            if "\\pm" in expr:
                new_expr_list.append(expr.replace("\\pm", "+"))
                new_expr_list.append(expr.replace("\\pm", "-"))
            else:
                new_expr_list.append(expr)

        return new_expr_list

    def judge(self, expression1, expression2, precision=1e-8):
        """Return True if the two (comma-separated) answers match.

        *expression1* is treated as the ground truth.  *precision* may be
        a single float or a list with one tolerance per answer part.
        """
        precision = precision if isinstance(precision, list) else [precision]

        try:
            expression1, expression2 = self.preprocess(expression1, expression2)
        except Exception:  # narrowed from a bare except (kept best-effort: any parse failure means "no match")
            return False
        if expression1 == expression2:
            # Exactly equal
            return True

        # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
        expression1 = re.sub(r"[\u4e00-\u9fff]+", "", expression1)
        expression2 = re.sub(r"[\u4e00-\u9fff]+", "", expression2)

        expression1 = self.split_by_comma(expression1)
        expression2 = self.split_by_comma(expression2)

        temp_list1 = self.trans_plus_minus_sign(expression1)
        temp_list2 = self.trans_plus_minus_sign(expression2)

        # Broadcast a single tolerance over all answer parts
        if len(precision) <= 1:
            precision = precision * len(temp_list1)

        if len(temp_list1) != len(temp_list2):
            return False

        # Greedily pair off equal elements from both lists
        idx = -1
        while len(temp_list1) != 0:
            idx = (idx + 1) % len(temp_list1)

            item1 = temp_list1[idx]
            self.precision = precision[idx]

            for item2 in temp_list2:
                if self.is_equal(item1, item2):
                    temp_list1.remove(item1)
                    temp_list2.remove(item2)
                    precision.remove(self.precision)
                    break
            else:
                # If no match was found, return False
                return False

        # If all elements are matched, return True
        return True

    def is_interval(self, expr):
        """Heuristically check whether *expr* looks like an interval."""
        return expr.startswith(("(", "[")) and expr.endswith((")", "]"))

    def sympy_sub_pi(self, expression_sympy):
        """Replace the symbol for pi in a sympy expression with its numerical value."""
        return expression_sympy.subs(self.pi, math.pi)

    def is_equal(self, expression1, expression2):
        """Check one pair of expressions for equality (ground truth first)."""
        if expression1 == expression2 and expression1 != "" and expression2 != "":
            # Equivalent natively
            return True

        # First check if both are intervals
        if self.is_interval(expression1) and self.is_interval(expression2):
            try:
                if self.interval_equal(expression1, expression2):
                    return True
            except Exception:  # narrowed from a bare except
                return False

        # Then check for numerical equality
        try:
            if self.numerical_equal(expression1, expression2):
                return True
        except Exception:
            pass

        # Then check if expressions are mathematically equal (skipped when both sides carry '=')
        try:
            if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
                return True
        except Exception:
            pass

        # Lastly, check for equation equality
        try:
            if self.equation_equal(expression1, expression2):
                return True
        except Exception:
            pass

        return False

    def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
        """Check two numeric strings within self.precision.

        When *include_percentage* is True, the ground truth is also tried
        divided and multiplied by 100 to cover percentage answers.
        """
        reference = float(expression1)
        prediction = float(expression2)

        if include_percentage:
            gt_result = [reference / 100, reference, reference * 100]
        else:
            gt_result = [reference]

        for item in gt_result:
            # 1.01 factor gives a little slack on the configured tolerance
            if abs(item - prediction) <= self.precision * 1.01:
                return True
        return False

    def expression_equal(self, exp1, exp2):
        """Check mathematical equivalence of two (non-equation) expressions via sympy."""

        def extract_expression(expression):
            # Keep only the right-hand side if an '=' is present
            if "=" in expression:
                expression = expression.split("=")[1]
            return expression.strip()

        exp1 = extract_expression(exp1)
        exp2 = extract_expression(exp2)

        expr1_sym = sympify(parse_latex(exp1))
        expr2_sym = sympify(parse_latex(exp2))

        if expr1_sym == expr2_sym:
            return True
        else:
            expr1_sym = self.sympy_sub_pi(expr1_sym)
            expr2_sym = self.sympy_sub_pi(expr2_sym)

            # A symbolic and a purely numeric expression can never match
            if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
                return False
            elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
                try:
                    if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
                        print(f"These two numbers cannot be calculated by the current computer for: \"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
                        return False

                    if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
                        return True
                    else:
                        return False
                except Exception:  # narrowed from a bare except
                    return False
            else:
                try:
                    simplified_expr = simplify(expr1_sym - expr2_sym)

                    num_value = simplified_expr.evalf()

                    # Fixed absolute tolerance for symbolic differences
                    return abs(num_value) < 1e-3
                except Exception:
                    return False

    def equation_equal(self, expression1, expression2):
        """Check if two equations are equivalent.

        Each equation is brought to the form lhs - rhs; the equations are
        considered equal when one side is a nonzero integer multiple of
        the other.
        """

        def simplify_equation(latex_eq):
            lhs, rhs = latex_eq.split("=")

            lhs_expr = parse_latex(lhs)
            rhs_expr = parse_latex(rhs)

            equation = Eq(lhs_expr, rhs_expr)

            simplified_eq = simplify(equation.lhs - equation.rhs)

            return simplified_eq

        expr1_sym = simplify_equation(expression1)
        expr2_sym = simplify_equation(expression2)

        division_result_1 = simplify(expr1_sym / expr2_sym)
        division_result_2 = simplify(expr2_sym / expr1_sym)

        if (division_result_1.is_Integer and division_result_1 != 0) or (division_result_2.is_Integer and division_result_2 != 0):
            return True
        else:
            return False

    def interval_equal(self, expression1, expression2):
        """Check if two intervals (possibly unions via \\cup) are equivalent."""

        def compare_two_interval(inter1, inter2):
            # Bracket types (open/closed endpoints) must match exactly
            if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
                return False

            inter1 = inter1.strip("[]()")
            inter2 = inter2.strip("[]()")

            items_1 = inter1.split(",")
            items_2 = inter2.split(",")

            for item_1, item_2 in zip(items_1, items_2):
                if not self.expression_equal(item_1, item_2):
                    return False
            return True

        interval1 = expression1
        interval2 = expression2

        if interval1 == interval2:
            return True
        else:
            inter_list1 = interval1.split("\\cup")
            inter_list2 = interval2.split("\\cup")

            if len(inter_list1) != len(inter_list2):
                return False
            else:
                for inter1, inter2 in zip(inter_list1, inter_list2):
                    if not compare_two_interval(inter1, inter2):
                        return False
                return True

    def preprocess(self, expression1, expression2):
        """Extract \\boxed answers (or $...$ fallback) and normalize special symbols."""

        def extract_boxed_content(latex_str):
            # Collect the contents of every \boxed{...}, joined by commas
            boxed_matches = re.finditer(r"\\boxed{", latex_str)
            results = ""

            for match in boxed_matches:
                start_index = match.end()
                end_index = start_index
                stack = 1  # brace nesting depth inside \boxed{...}

                while stack > 0 and end_index < len(latex_str):
                    if latex_str[end_index] == "{":
                        stack += 1
                    elif latex_str[end_index] == "}":
                        stack -= 1
                    end_index += 1

                if stack == 0:
                    content = latex_str[start_index:end_index - 1]
                    results += content + ","
                else:
                    raise ValueError("Mismatched braces in LaTeX string.")

            if results == "":
                # Fall back to $...$ groups on the last line, else the raw string
                last_line_ans = latex_str.strip().split("\n")[-1]
                dollar_pattern = r"\$(.*?)\$"
                answers = re.findall(dollar_pattern, last_line_ans)

                if answers:
                    for ans in answers:
                        results += ans + ","
                else:
                    results = latex_str

            return results

        def special_symbol_replace(expression):
            # Fixed typo in the helper name (was "sepcial_symbol_replace")
            if "\\in " in expression:
                expression = expression.split("\\in ")[1]

            for signal in self.special_signal_map:
                expression = expression.replace(signal, self.special_signal_map[signal])

            expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。")

            pattern = r"\\(?:mathrm|mathbf)\{~?([^}]*)\}"
            expression = re.sub(pattern, r"\1", expression)

            return expression

        exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
        exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2)

        return exp1, exp2

    def can_compute_power(self, expr):
        """Return False for numeric powers too large to evaluate safely."""
        if isinstance(expr, Pow):
            base, exp = expr.as_base_exp()
            if base.is_number and exp.is_number:
                MAX_EXP = 1000  # Adjust based on computing environment
                if abs(exp.evalf()) > MAX_EXP:
                    return False
                else:
                    return True
            else:
                # Symbolic base or exponent: treat as not computable here
                return False
        else:
            return True  # Not a power expression, can compute
|
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/OlympiadBench
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task : "olympiadbench_test_cn"
|
| 5 |
+
test_split: test_cn
|
| 6 |
+
output_type: generate_until
|
| 7 |
+
doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual
|
| 8 |
+
doc_to_text: !function cn_utils.olympiadbench_doc_to_text
|
| 9 |
+
doc_to_target: "answer"
|
| 10 |
+
generation_kwargs:
|
| 11 |
+
until:
|
| 12 |
+
- "ASSISTANT:"
|
| 13 |
+
max_new_tokens: 1024
|
| 14 |
+
temperature: 0
|
| 15 |
+
top_p: 0
|
| 16 |
+
num_beams: 1
|
| 17 |
+
do_sample: false
|
| 18 |
+
process_results: !function cn_utils.olympiadbench_process_results
|
| 19 |
+
metric_list:
|
| 20 |
+
- metric: submission
|
| 21 |
+
aggregation: !function cn_utils.olympiadbench_aggregate_results
|
| 22 |
+
higher_is_better: true
|
| 23 |
+
- metric: exact_match
|
| 24 |
+
aggregation: mean
|
| 25 |
+
higher_is_better: true
|
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/OlympiadBench
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task : "olympiadbench_test_en"
|
| 5 |
+
test_split: test_en
|
| 6 |
+
output_type: generate_until
|
| 7 |
+
doc_to_visual: !function en_utils.olympiadbench_doc_to_visual
|
| 8 |
+
doc_to_text: !function en_utils.olympiadbench_doc_to_text
|
| 9 |
+
doc_to_target: "answer"
|
| 10 |
+
generation_kwargs:
|
| 11 |
+
until:
|
| 12 |
+
- "ASSISTANT:"
|
| 13 |
+
max_new_tokens: 1024
|
| 14 |
+
temperature: 0
|
| 15 |
+
top_p: 0
|
| 16 |
+
num_beams: 1
|
| 17 |
+
do_sample: false
|
| 18 |
+
process_results: !function en_utils.olympiadbench_process_results
|
| 19 |
+
metric_list:
|
| 20 |
+
- metric: submission
|
| 21 |
+
aggregation: !function en_utils.olympiadbench_aggregate_results
|
| 22 |
+
higher_is_better: true
|
| 23 |
+
- metric: exact_match
|
| 24 |
+
aggregation: mean
|
| 25 |
+
higher_is_better: true
|
EAGLE/lmms_eval/tasks/seedbench/seedbench.yaml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/SEED-Bench
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task: "seedbench"
|
| 5 |
+
test_split: test
|
| 6 |
+
output_type: generate_until
|
| 7 |
+
doc_to_visual: !function utils.seed_doc_to_visual
|
| 8 |
+
doc_to_text: !function utils.seed_doc_to_text
|
| 9 |
+
doc_to_target: "answer"
|
| 10 |
+
generation_kwargs:
|
| 11 |
+
until:
|
| 12 |
+
- "ASSISTANT:"
|
| 13 |
+
image_aspect_ratio: original
|
| 14 |
+
# The return value of process_results will be used by metrics
|
| 15 |
+
process_results: !function utils.seed_process_result
|
| 16 |
+
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
|
| 17 |
+
metric_list:
|
| 18 |
+
- metric: seed_image
|
| 19 |
+
aggregation: !function utils.seed_aggregation_result
|
| 20 |
+
higher_is_better: true
|
| 21 |
+
- metric: seed_video
|
| 22 |
+
aggregation: !function utils.seed_aggregation_result
|
| 23 |
+
higher_is_better: true
|
| 24 |
+
- metric: seed_all
|
| 25 |
+
aggregation: !function utils.seed_aggregation_result
|
| 26 |
+
higher_is_better: true
|
| 27 |
+
metadata:
|
| 28 |
+
- version: 0.0
|
EAGLE/lmms_eval/tasks/seedbench/seedbench_ppl.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/SEED-Bench
|
| 2 |
+
dataset_kwargs:
|
| 3 |
+
token: True
|
| 4 |
+
task: "seedbench_ppl"
|
| 5 |
+
test_split: test
|
| 6 |
+
output_type: multiple_choice
|
| 7 |
+
doc_to_visual: !function utils.seed_doc_to_visual
|
| 8 |
+
doc_to_text: !function utils.seed_doc_to_text_mc
|
| 9 |
+
doc_to_choice : !function utils.seed_doc_to_choice
|
| 10 |
+
doc_to_target: !function utils.seed_doc_to_mc_target
|
| 11 |
+
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
|
| 12 |
+
metric_list:
|
| 13 |
+
- metric: acc
|
| 14 |
+
metadata:
|
| 15 |
+
- version: 0.0
|
EAGLE/lmms_eval/tasks/seedbench/utils.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def seed_doc_to_visual(doc):
    """Return every image attached to the document, converted to RGB."""
    converted = []
    for frame in doc["image"]:
        converted.append(frame.convert("RGB"))
    return converted
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def seed_doc_to_text(doc):
    """Format a SEED-Bench question and its four options as a letter-answer prompt."""
    lines = [
        doc["question"],
        f"A. {doc['choice_a']}",
        f"B. {doc['choice_b']}",
        f"C. {doc['choice_c']}",
        f"D. {doc['choice_d']}",
        "Answer with the option's letter from the given choices directly.",
    ]
    return "\n".join(lines)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def seed_process_result(doc, result):
    """Reduce the model output to one option letter and pair it with the ground truth.

    Returns one record keyed by the document's data type plus a duplicate
    under "seed_all" so both per-type and overall metrics can aggregate it.
    """
    prediction = result[0].strip()
    if len(prediction) > 1:
        # Keep only the leading character (the option letter).
        prediction = prediction[0]
    record = {"pred": prediction, "answer": doc["answer"], "question_id": doc["question_id"]}
    return {f"seed_{doc['data_type']}": record, "seed_all": dict(record)}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def seed_aggregation_result(results):
    """Return accuracy: the fraction of results whose prediction equals the answer.

    Fixed: the original divided by the result count unconditionally and
    raised ZeroDivisionError on an empty list; an empty list now scores 0.
    """
    if not results:
        return 0
    total_correct = sum(1 for result in results if result["pred"] == result["answer"])
    return total_correct / len(results)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def seed_aggregation_result_all(results):
    """Aggregate overall accuracy and dump per-question predictions to a submission file."""
    score = seed_aggregation_result(results)
    submission = [
        {"question_id": entry["question_id"], "prediction": entry["pred"]}
        for entry in results
    ]
    with open("./seed_submission.json", "w") as f:
        json.dump(submission, f, indent=4)
    print("Storing files for seed_submission ...")
    return score
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def seed_doc_to_text_mc(doc):
    """Prompt form used for perplexity-based multiple choice scoring."""
    return "{} Answer :".format(doc["question"])
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def seed_doc_to_choice(doc):
    """Return the four answer options in A-D order."""
    return [doc[key] for key in ("choice_a", "choice_b", "choice_c", "choice_d")]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def seed_doc_to_mc_target(doc):
    """Return the choice text selected by the answer letter (A-D)."""
    letter_to_key = dict(zip("ABCD", ("choice_a", "choice_b", "choice_c", "choice_d")))
    return doc[letter_to_key[doc["answer"]]]
|
EAGLE/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/textvqa
|
| 2 |
+
output_type: generate_until
|
| 3 |
+
doc_to_visual: !function utils.textvqa_doc_to_visual
|
| 4 |
+
doc_to_text: !function utils.textvqa_doc_to_text
|
| 5 |
+
doc_to_target: "answer"
|
| 6 |
+
generation_kwargs:
|
| 7 |
+
until:
|
| 8 |
+
- "ASSISTANT:"
|
| 9 |
+
process_results: !function utils.textvqa_process_results
|
| 10 |
+
model_specific_prompt_kwargs:
|
| 11 |
+
default:
|
| 12 |
+
pre_prompt: ""
|
| 13 |
+
post_prompt: "\nAnswer the question using a single word or phrase."
|
| 14 |
+
ocr: true
|
| 15 |
+
qwen_vl:
|
| 16 |
+
pre_prompt: ""
|
| 17 |
+
post_prompt: " Answer:"
|
EAGLE/lmms_eval/tasks/textvqa/_textvqa.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: textvqa
|
| 2 |
+
task:
|
| 3 |
+
- textvqa_val
|
| 4 |
+
- textvqa_test
|
EAGLE/lmms_eval/tasks/textvqa/textvqa_test.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: textvqa_test
|
| 2 |
+
test_split: test
|
| 3 |
+
metric_list:
|
| 4 |
+
- metric: submission
|
| 5 |
+
aggregation: !function utils.textvqa_aggreate_submissions
|
| 6 |
+
higher_is_better: true
|
| 7 |
+
include: _default_template_textvqa_yaml
|
EAGLE/lmms_eval/tasks/textvqa/textvqa_val.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: textvqa_val
|
| 2 |
+
test_split: validation
|
| 3 |
+
metric_list:
|
| 4 |
+
- metric: exact_match
|
| 5 |
+
aggregation: mean
|
| 6 |
+
higher_is_better: true
|
| 7 |
+
ignore_case: true
|
| 8 |
+
ignore_punctuation: true
|
| 9 |
+
- metric: submission
|
| 10 |
+
aggregation: !function utils.textvqa_aggreate_submissions
|
| 11 |
+
higher_is_better: true
|
| 12 |
+
include: _default_template_textvqa_yaml
|
EAGLE/lmms_eval/tasks/textvqa/utils.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import yaml
|
| 5 |
+
import pathlib
|
| 6 |
+
import logging
|
| 7 |
+
import datetime
|
| 8 |
+
import statistics
|
| 9 |
+
|
| 10 |
+
from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
|
| 11 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 12 |
+
|
| 13 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def textvqa_doc_to_visual(doc):
    """Return the document's single image as a one-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def textvqa_process_results(doc, result):
    """Score one TextVQA prediction with the VQA consensus metric.

    The soft accuracy for each ground-truth annotator is
    min(1, matches-among-other-annotators / 3), averaged leave-one-out.
    Returns that accuracy plus a submission record for the test server.
    """
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
    accuracy = 0

    if "answers" in doc and doc["answers"] is not None:
        gtAcc = []

        # Normalize all ground-truth answers with the same processor as the prediction.
        for i in range(len(doc["answers"])):
            doc["answers"][i] = eval_ai_processor(doc["answers"][i])

        for i in range(len(doc["answers"])):
            otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j]
            matchingAns = [item for item in otherGTAns if item == resAns]
            acc = min(1, float(len(matchingAns)) / 3)
            gtAcc.append(acc)
        # Fixed: statistics.mean raises StatisticsError on an empty list
        # (empty doc["answers"]); mirror the vizwiz_vqa sibling's guard.
        accuracy = statistics.mean(gtAcc) if gtAcc else 0

    return {
        "exact_match": accuracy,
        "submission": {
            "question_id": doc["question_id"],
            "answer": resAns,
        },
    }
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def textvqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the TextVQA prompt: pre_prompt + capitalized question (+ OCR tokens) + post_prompt.

    Fixed: the default was misspelled ``post_post = ""`` while the return
    f-string reads ``post_prompt``, so any call without a "post_prompt"
    kwarg (including kwargs=None) raised NameError.
    """
    pre_prompt = ""
    post_prompt = ""
    ocr_ref = ""
    if model_specific_prompt_kwargs:
        if "pre_prompt" in model_specific_prompt_kwargs:
            pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
        if "post_prompt" in model_specific_prompt_kwargs:
            post_prompt = model_specific_prompt_kwargs["post_prompt"]
        if "ocr" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["ocr"]:
            ocr_ref = f"\nReference OCR token: {', '.join(doc['ocr_tokens'])}"
    return f"{pre_prompt}{doc['question'].capitalize()}{ocr_ref}{post_prompt}"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def textvqa_aggreate_submissions(results, args):
    """Write the accumulated TextVQA predictions to a timestamped submission file."""
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    path = generate_submission_file(f"textvqa_submission_{timestamp}.json", args)
    with open(path, "w") as f:
        json.dump(results, f)
    eval_logger.info(f"Submission file saved to {path}")
|
EAGLE/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: lmms-lab/VizWiz-VQA
|
| 2 |
+
output_type: generate_until
|
| 3 |
+
doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual
|
| 4 |
+
doc_to_text: !function utils.vizwiz_vqa_doc_to_text
|
| 5 |
+
doc_to_target: "answer"
|
| 6 |
+
generation_kwargs:
|
| 7 |
+
until:
|
| 8 |
+
- "ASSISTANT:"
|
| 9 |
+
metadata:
|
| 10 |
+
- version: 0.0
|
| 11 |
+
model_specific_prompt_kwargs:
|
| 12 |
+
default:
|
| 13 |
+
pre_prompt: ""
|
| 14 |
+
post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
|
| 15 |
+
process_results: !function utils.vizwiz_vqa_process_results
|
EAGLE/lmms_eval/tasks/vizwiz_vqa/_generate_config.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import yaml

# Splits and tasks for which per-split VizWiz task configs are generated.
splits = ["val", "test"]
tasks = ["vqa"]

if __name__ == "__main__":
    dump_tasks = []
    for task in tasks:
        for split in splits:
            config = {
                "group": f"vizwiz_{task}",
                "task": f"vizwiz_{task}_{split}",
                "include": f"_default_template_{task}_yaml",
                "test_split": split,
            }
            if split == "train":
                # Train splits are written without a group and excluded from aggregation.
                config.pop("group")
            else:
                dump_tasks.append(f"vizwiz_{task}_{split}")

            save_path = f"./vizwiz_{task}_{split}.yaml"
            print(f"Saving to {save_path}")
            with open(save_path, "w") as f:
                yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    # Umbrella group config listing every generated task.
    with open("./_vizwiz_vqa.yaml", "w") as f:
        yaml.dump({"group": "vizwiz_vqa", "task": dump_tasks}, f, default_flow_style=False, indent=4)
|
EAGLE/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: vizwiz_vqa
|
| 2 |
+
task:
|
| 3 |
+
- vizwiz_vqa_val
|
| 4 |
+
- vizwiz_vqa_test
|
EAGLE/lmms_eval/tasks/vizwiz_vqa/utils.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import yaml
|
| 5 |
+
import pathlib
|
| 6 |
+
import logging
|
| 7 |
+
import datetime
|
| 8 |
+
import statistics
|
| 9 |
+
|
| 10 |
+
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
|
| 11 |
+
from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
|
| 12 |
+
|
| 13 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def vizwiz_vqa_doc_to_visual(doc):
    """Return the document's single image as a one-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def vizwiz_vqa_process_results(doc, result):
    """Score one VizWiz-VQA prediction with the VQA consensus metric.

    Each annotator's soft accuracy is min(1, matches-among-other-annotators / 3),
    averaged leave-one-out.  Also emits a submission record keyed by image name.
    """
    processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = processor(result[0])
    accuracy = 0

    if "answers" in doc and doc["answers"] is not None:
        answers = doc["answers"]
        # Normalize every ground-truth answer in place with the same processor.
        for i in range(len(answers)):
            answers[i] = processor(answers[i])

        gtAcc = []
        for i in range(len(answers)):
            otherGTAns = [answers[j] for j in range(len(answers)) if i != j]
            matchingAns = [item for item in otherGTAns if item == resAns]
            gtAcc.append(min(1, float(len(matchingAns)) / 3))
        if gtAcc:
            accuracy = statistics.mean(gtAcc)
        else:
            accuracy = 0

    return {
        "exact_match": accuracy,
        "submission": {
            "image": f"{doc['question_id']}.jpg",
            "answer": resAns,
        },
    }
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def vizwiz_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the VizWiz prompt: pre_prompt + capitalized question + post_prompt."""
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "")
    post_prompt = model_specific_prompt_kwargs.get("post_prompt", "")
    return f"{pre_prompt}{doc['question'].capitalize()}{post_prompt}"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def vizwiz_vqa_aggreate_submissions(results, args):
    """Write the accumulated VizWiz-VQA predictions to a timestamped submission file.

    Changed: report the saved path through the module's eval_logger (as the
    textvqa sibling does) instead of a bare print, so the message follows
    the harness's logging configuration.
    """
    now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    submission_file_name = f"vizwiz_vqa-test-submission-{now_date_time}.json"
    path = generate_submission_file(submission_file_name, args)
    with open(path, "w") as f:
        json.dump(results, f)
    eval_logger.info(f"Submission file saved to {path}")
|
EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: vizwiz_vqa
|
| 2 |
+
task: vizwiz_vqa_test
|
| 3 |
+
test_split: test
|
| 4 |
+
include: _default_template_vqa_yaml
|
| 5 |
+
process_results: !function utils.vizwiz_vqa_process_results
|
| 6 |
+
metric_list:
|
| 7 |
+
# - metric: exact_match
|
| 8 |
+
# aggregation: mean
|
| 9 |
+
# higher_is_better: true
|
| 10 |
+
# ignore_case: true
|
| 11 |
+
# ignore_punctuation: true
|
| 12 |
+
- metric: submission
|
| 13 |
+
aggregation: !function utils.vizwiz_vqa_aggreate_submissions
|
| 14 |
+
higher_is_better: true
|
EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VizWiz-VQA validation split: answers are available, so the task is
# scored locally with the exact_match metric.
group: vizwiz_vqa
task: vizwiz_vqa_val
test_split: val
include: _default_template_vqa_yaml
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
  # - metric: submission
  #   aggregation: !function utils.vizwiz_vqa_aggreate_submissions
  #   higher_is_better: true
|
EAGLE/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Shared template for the VQAv2 tasks; included by vqav2_val / vqav2_test.
dataset_path: lmms-lab/VQAv2
dataset_kwargs:
  token: True
output_type: generate_until
doc_to_visual: !function utils.vqav2_doc_to_visual
doc_to_text: !function utils.vqav2_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  # Short answers only — VQAv2 responses are a single word or phrase.
  max_new_tokens: 16
metadata:
  - version: 0.0
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer the question using a single word or phrase."
|
EAGLE/lmms_eval/tasks/vqav2/_vqav2.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Group definition aggregating the VQAv2 validation and test tasks.
group: vqav2
task:
  - vqav2_val
  - vqav2_test
|
EAGLE/lmms_eval/tasks/vqav2/utils.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import datetime
|
| 6 |
+
import statistics
|
| 7 |
+
|
| 8 |
+
import lmms_eval.tasks._task_utils.file_utils as file_utils
|
| 9 |
+
|
| 10 |
+
from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
eval_logger = logging.getLogger("lmms-eval")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def vqav2_doc_to_visual(doc):
    """Return the document's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def vqav2_process_results(doc, result):
    """Score one VQAv2 prediction with the official VQA accuracy metric.

    Args:
        doc: Dataset row; must have "question_id" and may have "answers",
            a list of dicts each carrying an "answer" string.
        result: Single-element list holding the model's raw answer string.

    Returns:
        Dict with "exact_match" (float in [0, 1]) and a "submission" entry
        ({"question_id", "answer"}) for test-server export.

    Note: the ground-truth entries in ``doc["answers"]`` are normalized
    in place, mirroring the official evaluation script.
    """
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
    accuracy = 0

    if "answers" in doc and doc["answers"] is not None:
        # Normalize whitespace in the ground-truth answers (in place).
        for ansDic in doc["answers"]:
            ansDic["answer"] = ansDic["answer"].replace("\n", " ")
            ansDic["answer"] = ansDic["answer"].replace("\t", " ")
            ansDic["answer"] = ansDic["answer"].strip()
        gtAcc = []
        gtAnswers = [ans["answer"] for ans in doc["answers"]]

        # Official VQA eval applies punctuation/digit-article normalization
        # only when the annotators disagree.
        if len(set(gtAnswers)) > 1:
            for ansDic in doc["answers"]:
                ansDic["answer"] = eval_ai_processor.process_punctuation(ansDic["answer"])
                ansDic["answer"] = eval_ai_processor.process_digit_article(ansDic["answer"])
            resAns = eval_ai_processor.process_punctuation(resAns)
            resAns = eval_ai_processor.process_digit_article(resAns)

        # Leave-one-annotator-out accuracy: min(#matches / 3, 1),
        # averaged over all annotators.
        for gtAnsDatum in doc["answers"]:
            otherGTAns = [item for item in doc["answers"] if item != gtAnsDatum]
            matchingAns = [item for item in otherGTAns if item["answer"] == resAns]
            acc = min(1, float(len(matchingAns)) / 3)
            gtAcc.append(acc)
        # Guard: statistics.mean([]) raises StatisticsError when the
        # answer list is present but empty.
        accuracy = statistics.mean(gtAcc) if gtAcc else 0

    return {
        "exact_match": accuracy,
        "submission": {
            "question_id": doc["question_id"],
            "answer": resAns,
        },
    }
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def vqav2_process_results_test(doc, result):
    """Test-split wrapper: keep only the submission entry (no ground truth)."""
    processed = vqav2_process_results(doc, result)
    return {"submission": processed["submission"]}
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def vqav2_process_results_val(doc, result):
    """Validation-split wrapper: keep only the exact_match score."""
    processed = vqav2_process_results(doc, result)
    return {"exact_match": processed["exact_match"]}
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def vqav2_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the text prompt for one VQAv2 document.

    The raw question is wrapped with optional model-specific
    ``pre_prompt`` / ``post_prompt`` strings (both default to "").
    """
    kwargs = model_specific_prompt_kwargs or {}
    prefix = kwargs.get("pre_prompt", "")
    suffix = kwargs.get("post_prompt", "")
    return f"{prefix}{doc['question']}{suffix}"
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def vqav2_aggreate_submissions(results, args):
    """Write all VQAv2 submission entries to a timestamped JSON file.

    Used as the yaml ``aggregation`` hook for the test split; the file
    can be uploaded to the EvalAI test server.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    file_name = f"vqav2-test-submission-{timestamp}.json"
    path = file_utils.generate_submission_file(file_name, args)
    with open(path, "w") as handle:
        json.dump(results, handle)
    eval_logger.info(f"Submission file saved to {path}")
|
EAGLE/lmms_eval/tasks/vqav2/vqav2_test.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: "vqav2_test"
|
| 2 |
+
include: _default_template_vqav2_yaml
|
| 3 |
+
test_split: test
|
| 4 |
+
metric_list:
|
| 5 |
+
- metric: submission
|
| 6 |
+
aggregation: !function utils.vqav2_aggreate_submissions
|
| 7 |
+
higher_is_better: true
|
| 8 |
+
process_results: !function utils.vqav2_process_results_test
|