tuandunghcmut commited on
Commit
e196b86
·
verified ·
1 Parent(s): b1661d3

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. EAGLE/lmms_eval/tasks/ferret/ferret.yaml +39 -0
  2. EAGLE/lmms_eval/tasks/ferret/rule.json +5 -0
  3. EAGLE/lmms_eval/tasks/ferret/utils.py +206 -0
  4. EAGLE/lmms_eval/tasks/flickr30k/flickr30k.yaml +3 -0
  5. EAGLE/lmms_eval/tasks/flickr30k/flickr30k_test.yaml +44 -0
  6. EAGLE/lmms_eval/tasks/flickr30k/utils.py +141 -0
  7. EAGLE/lmms_eval/tasks/hallusion_bench/evaluate_hb.py +129 -0
  8. EAGLE/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml +41 -0
  9. EAGLE/lmms_eval/tasks/hallusion_bench/utils.py +306 -0
  10. EAGLE/lmms_eval/tasks/iconqa/utils.py +57 -0
  11. EAGLE/lmms_eval/tasks/mme/mme.yaml +37 -0
  12. EAGLE/lmms_eval/tasks/mme/utils.py +120 -0
  13. EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa.yaml +4 -0
  14. EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml +20 -0
  15. EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml +23 -0
  16. EAGLE/lmms_eval/tasks/multidocvqa/utils.py +116 -0
  17. EAGLE/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml +3 -0
  18. EAGLE/lmms_eval/tasks/nocaps/nocaps.yaml +4 -0
  19. EAGLE/lmms_eval/tasks/nocaps/nocaps_test.yaml +25 -0
  20. EAGLE/lmms_eval/tasks/nocaps/nocaps_val.yaml +46 -0
  21. EAGLE/lmms_eval/tasks/nocaps/utils.py +153 -0
  22. EAGLE/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml +24 -0
  23. EAGLE/lmms_eval/tasks/ok_vqa/_generate_config.py +25 -0
  24. EAGLE/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml +3 -0
  25. EAGLE/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml +4 -0
  26. EAGLE/lmms_eval/tasks/ok_vqa/utils.py +70 -0
  27. EAGLE/lmms_eval/tasks/olympiadbench/cn_utils.py +69 -0
  28. EAGLE/lmms_eval/tasks/olympiadbench/en_utils.py +69 -0
  29. EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench.yaml +6 -0
  30. EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py +355 -0
  31. EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml +25 -0
  32. EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml +25 -0
  33. EAGLE/lmms_eval/tasks/seedbench/seedbench.yaml +28 -0
  34. EAGLE/lmms_eval/tasks/seedbench/seedbench_ppl.yaml +15 -0
  35. EAGLE/lmms_eval/tasks/seedbench/utils.py +60 -0
  36. EAGLE/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml +17 -0
  37. EAGLE/lmms_eval/tasks/textvqa/_textvqa.yaml +4 -0
  38. EAGLE/lmms_eval/tasks/textvqa/textvqa_test.yaml +7 -0
  39. EAGLE/lmms_eval/tasks/textvqa/textvqa_val.yaml +12 -0
  40. EAGLE/lmms_eval/tasks/textvqa/utils.py +68 -0
  41. EAGLE/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml +15 -0
  42. EAGLE/lmms_eval/tasks/vizwiz_vqa/_generate_config.py +25 -0
  43. EAGLE/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml +4 -0
  44. EAGLE/lmms_eval/tasks/vizwiz_vqa/utils.py +70 -0
  45. EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml +14 -0
  46. EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml +13 -0
  47. EAGLE/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml +15 -0
  48. EAGLE/lmms_eval/tasks/vqav2/_vqav2.yaml +4 -0
  49. EAGLE/lmms_eval/tasks/vqav2/utils.py +89 -0
  50. EAGLE/lmms_eval/tasks/vqav2/vqav2_test.yaml +8 -0
EAGLE/lmms_eval/tasks/ferret/ferret.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/Ferret-Bench
2
+ dataset_kwargs:
3
+ token: True
4
+ task: "ferret"
5
+ test_split: test
6
+ output_type: generate_until
7
+ doc_to_visual: !function utils.ferret_doc_to_visual
8
+ doc_to_text: !function utils.ferret_doc_to_text
9
+ doc_to_target: "gpt_answer"
10
+ generation_kwargs:
11
+ until:
12
+ - "ASSISTANT:"
13
+ image_aspect_ratio: original
14
+ max_new_tokens: 1024
15
+ temperature: 0
16
+ top_p: 0
17
+ num_beams: 1
18
+ do_sample: false
19
+ process_results: !function utils.ferret_process_results
20
+ metric_list:
21
+ - metric: gpt_eval_ferret_all
22
+ aggregation: !function utils.ferret_all_aggregation
23
+ higher_is_better: true
24
+ - metric: gpt_eval_ferret_refer_desc
25
+ aggregation: !function utils.ferret_refer_desc_aggregation
26
+ higher_is_better: true
27
+ - metric: gpt_eval_ferret_refer_reason
28
+ aggregation: !function utils.ferret_refer_reason_aggregation
29
+ higher_is_better: true
30
+ - metric: gpt_eval_ferret_ground_conv
31
+ aggregation: !function utils.ferret_ground_conv_aggregation
32
+ higher_is_better: true
33
+ metadata:
34
+ version: 0.0
35
+ gpt_eval_model_name: "gpt-4-0314"
36
+ model_specific_prompt_kwargs:
37
+ default:
38
+ pre_prompt: ""
39
+ post_prompt: ""
EAGLE/lmms_eval/tasks/ferret/rule.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "refer_desc": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
3
+ "refer_reason": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
4
+ "ground_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question that requires model to predict the coordinates of relevant object. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the predicted coordinates, helpfulness, relevance, accuracy, level of details of their responses. Specifically, pay your attention to the precision of the coordinates and whether it matches the object. Small deviation (<20% of ground-truth box width or height) of coordinates is allowed and shouldn't be punished. More than that, the degree of deviation should be reflected in scoring too. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}
5
+ }
EAGLE/lmms_eval/tasks/ferret/utils.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import requests
5
+ import numpy as np
6
+ import openai
7
+ from openai import OpenAI
8
+ import time
9
+ import yaml
10
+ from pathlib import Path
11
+ from copy import deepcopy
12
+
13
+ eval_logger = logging.getLogger("lmms-eval")
14
+ NUM_SECONDS_TO_SLEEP = 0.5
15
+
16
+ FERRET_W_METRICS = ["gpt_eval_ferret_refer_desc", "gpt_eval_ferret_refer_reason", "gpt_eval_ferret_ground_conv"]
17
+
18
+ rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))
19
+
20
+ with open(Path(__file__).parent / "ferret.yaml", "r") as f:
21
+ raw_data = f.readlines()
22
+ safe_data = []
23
+ for i, line in enumerate(raw_data):
24
+ # remove function definition since yaml load cannot handle it
25
+ if "!function" not in line:
26
+ safe_data.append(line)
27
+
28
+ config = yaml.safe_load("".join(safe_data))
29
+
30
+ GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
31
+
32
+ API_TYPE = os.getenv("API_TYPE", "openai")
33
+
34
+ if API_TYPE == "openai":
35
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
36
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
37
+ headers = {
38
+ "Authorization": f"Bearer {API_KEY}",
39
+ "Content-Type": "application/json",
40
+ }
41
+ elif API_TYPE == "azure":
42
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
43
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
44
+ headers = {
45
+ "api-key": API_KEY,
46
+ "Content-Type": "application/json",
47
+ }
48
+
49
+
50
def get_eval(content: str, max_tokens: int, retries: int = 3):
    """Send one chat-completion request to the judge model and return its reply.

    Args:
        content: Fully formatted user message (context, question, both
            answers, and the grading rubric).
        max_tokens: Completion token budget for the judge.
        retries: Number of attempts before giving up.

    Returns:
        Tuple of (review_text, judge_model_name); ("", "") when every attempt
        fails or the judge returns an empty completion.
    """
    # `headers` is the module-level auth-header dict selected by API_TYPE.
    global headers

    messages = [
        {
            "role": "system",
            "content": "You are a helpful and precise assistant for checking the quality of the answer.",
        },
        {"role": "user", "content": content},
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload)
            response.raise_for_status()
            response_data = response.json()

            # NOTE: rebinds the `content` parameter to the judge's reply text.
            content = response_data["choices"][0]["message"]["content"].strip()
            if content != "":
                return content, response_data["model"]
            # Empty completion: stop retrying and fall through to ("", "").
            break  # If successful, break out of the loop

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:  # If this was the last attempt, log and return empty
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
                return "", ""
    return "", ""
87
+
88
+
89
def parse_score(review):
    """Parse the two judge scores from the first line of a GPT review.

    The judge is instructed to emit "score1 score2" on the first line.

    Args:
        review: Full review text returned by the judge model.

    Returns:
        [score1, score2] as floats, or [-1, -1] when the line cannot be
        parsed into exactly two numbers.
    """
    try:
        score_pair = review.split("\n")[0]
        score_pair = score_pair.replace(",", " ")
        # split() with no argument collapses runs of whitespace, so a reply
        # like "7, 8" (comma + space) still yields exactly two tokens;
        # split(" ") would produce an empty middle token and be rejected.
        sp = score_pair.split()
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            eval_logger.debug(f"Can not split: {review}. Returning [-1, -1]")
            return [-1, -1]
    except Exception as e:
        eval_logger.debug(f"Error: {e}. Returning [-1, -1]")
        return [-1, -1]
102
+
103
+
104
def ferret_doc_to_visual(doc):
    """Return the sample's image, converted to RGB, as a one-element list."""
    image = doc["image"]
    return [image.convert("RGB")]
106
+
107
+
108
def ferret_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the prompt: optional model-specific pre/post text around the question."""
    kwargs = model_specific_prompt_kwargs if model_specific_prompt_kwargs is not None else {}
    prefix = kwargs.get("pre_prompt", "")
    suffix = kwargs.get("post_prompt", "")
    return f"{prefix}{doc['question']}{suffix}"
115
+
116
+
117
def ferret_process_results(doc, result):
    """Score one Ferret-Bench sample by asking the judge model to compare answers.

    Args:
        doc: a instance of the eval dataset (question, gpt_answer, context,
            category, ...).
        result: [pred] -- the model's generated answer.

    Returns:
        A dict keyed by every gpt_eval_ferret_* metric name.  The entry for
        the sample's own category (and for "all") carries the real review and
        scores; the other categories receive a copy with sentinel scores
        [-999, -999] so per-category aggregation can skip them.
    """
    try:
        question = doc.get("question", "")
        ans1 = doc.get("gpt_answer", "")
        ans2 = result[0] if result else ""
        context = doc.get("context", [])
        context = "\n".join(context) if isinstance(context, list) else context
        category = doc.get("category", "")
        # rule.json maps category -> {role, prompt}: the judge role label and
        # the grading rubric injected into the [System] section below.
        rule = rule_dict.get(category, {})
        prompt = rule.get("prompt", "")
        role = rule.get("role", "user")
        content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n"
        review, model_name = get_eval(content, 1024)
        scores = parse_score(review)
    except Exception as e:
        # NOTE(review): if an exception fired before the locals above were
        # bound, building the dict below would raise NameError -- unlikely in
        # practice since get_eval/parse_score swallow their own errors.
        eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
        review = "Failed to Get a Proper Review."
        model_name = "Failed Request"
        scores = [-1, -1]

    metric = f"gpt_eval_ferret_{doc.get('category', 'all')}"
    category_review_dict = {
        "question": question,
        "ans1": ans1,
        "ans2": ans2,
        "context": context,
        "category": category,
        "review": review,
        "scores": scores,
        "eval_model": model_name,
    }

    # Sentinel copy for the categories this sample does NOT belong to;
    # ferret_aggregation skips any entry whose scores contain -999.
    non_category_review_dict = deepcopy(category_review_dict)
    non_category_review_dict["scores"] = [-999, -999]

    data_dict = {}
    for m in FERRET_W_METRICS:
        if m == metric:
            data_dict[m] = category_review_dict
        else:
            data_dict[m] = non_category_review_dict
    data_dict["gpt_eval_ferret_all"] = category_review_dict

    # return {"gpt_eval_ferret_all": review_dict}
    return data_dict
169
+
170
+
171
def ferret_refer_desc_aggregation(results):
    """Aggregate judge scores for the refer_desc (referring description) category."""
    return ferret_aggregation(results, "refer_desc")


def ferret_refer_reason_aggregation(results):
    """Aggregate judge scores for the refer_reason (referring reasoning) category."""
    return ferret_aggregation(results, "refer_reason")


def ferret_ground_conv_aggregation(results):
    """Aggregate judge scores for the ground_conv (grounded conversation) category."""
    return ferret_aggregation(results, "ground_conv")


def ferret_all_aggregation(results):
    """Aggregate judge scores over all categories."""
    return ferret_aggregation(results, "all")
185
+
186
+
187
def ferret_aggregation(results, category):
    """Aggregate paired judge scores for one Ferret-Bench category.

    Each result carries "scores" == [gpt4_score, model_score]; entries with
    the sentinel [-999, -999] (placeholders for other categories) are
    skipped.

    Args:
        results: list of per-sample review dicts from ferret_process_results.
        category: category name, used only for error logging.

    Returns:
        The model score as a percentage of the GPT-4 score, rounded to one
        decimal place, or None when there is nothing valid to aggregate or
        aggregation fails (e.g. a zero GPT-4 mean).
    """
    try:
        scores = [r["scores"] for r in results if -999 not in r["scores"]]
        # Guard the empty case explicitly: np.mean of an empty array would
        # otherwise produce nan (with a RuntimeWarning) instead of a clear
        # failure signal.
        if not scores:
            eval_logger.info(f"No valid scores to aggregate in category: {category}")
            return None
        stats = np.asarray(scores).mean(0).tolist()
        stats = [round(x, 3) for x in stats]
        # stats[0]: mean GPT-4 score, stats[1]: mean model score.
        return round(stats[1] / stats[0] * 100, 1)
    except Exception as e:
        eval_logger.info(f"Error in ferret_aggregation: {e}, and in category: {category}")
        return None
EAGLE/lmms_eval/tasks/flickr30k/flickr30k.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ group: flickr30k
2
+ task:
3
+ - flickr30k_test
EAGLE/lmms_eval/tasks/flickr30k/flickr30k_test.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/flickr30k
2
+ dataset_kwargs:
3
+ token: True
4
+ task : "flickr30k_test"
5
+ test_split: test
6
+ output_type: generate_until
7
+ doc_to_visual: !function utils.flickr_doc_to_visual
8
+ doc_to_text: !function utils.flickr_doc_to_text
9
+ doc_to_target: "answer"
10
+ generation_kwargs:
11
+ max_new_tokens: 64
12
+ temperature: 0
13
+ top_p: 0
14
+ num_beams: 1
15
+ do_sample: false
16
+ process_results: !function utils.flickr_process_result
17
+ # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
18
+ metric_list:
19
+ - metric: flickr_Bleu_4
20
+ aggregation : !function utils.flickr_bleu4
21
+ higher_is_better : true
22
+ - metric: flickr_Bleu_3
23
+ aggregation : !function utils.flickr_bleu3
24
+ higher_is_better : true
25
+ - metric: flickr_Bleu_2
26
+ aggregation : !function utils.flickr_bleu2
27
+ higher_is_better : true
28
+ - metric: flickr_Bleu_1
29
+ aggregation : !function utils.flickr_bleu1
30
+ higher_is_better : true
31
+ - metric: flickr_METEOR
32
+ aggregation : !function utils.flickr_meteor
33
+ higher_is_better : true
34
+ - metric: flickr_ROUGE_L
35
+ aggregation : !function utils.flickr_rougel
36
+ higher_is_better : true
37
+ - metric: flickr_CIDEr
38
+ aggregation : !function utils.flickr_cider
39
+ higher_is_better : true
40
+ #- metric: flickr_SPICE
41
+ # aggregation : !function utils.flickr_spice
42
+ # higher_is_better : true
43
+ metadata:
44
+ - version: 0.0
EAGLE/lmms_eval/tasks/flickr30k/utils.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
4
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
5
+ from pycocotools.coco import COCO
6
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
7
+ import datetime
8
+
9
+ import logging
10
+
11
+ eval_logger = logging.getLogger("lmms-eval")
12
+
13
+ dir_name = os.path.dirname(os.path.abspath(__file__))
14
+
15
+ FLICKR_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"] # , "SPICE"]
16
+
17
+
18
def flickr_doc_to_visual(doc):
    """Return the sample's image, converted to RGB, as a one-element list."""
    image = doc["image"]
    return [image.convert("RGB")]
20
+
21
+
22
def flickr_doc_to_text(doc):
    """Return the fixed captioning prompt.

    The doc argument is unused; every sample receives the same instruction.
    """
    # Plain string literal: the original used an f-string with no
    # placeholders, which is a no-op and flagged by linters (F541).
    return "Provide a one-sentence caption for the provided image."
25
+
26
+
27
def flickr_process_result(doc, result):
    """Fan one caption prediction out to every flickr metric key.

    Args:
        doc: a instance of the eval dataset (reference captions + img_id).
        result: [pred] -- the generated caption.

    Returns:
        {"flickr_<metric>": entry} for each metric in FLICKR_METRICS; all
        keys share the same entry dict of {answer, pred, image_id}.
    """
    pred = result[0] if result else ""
    entry = {
        "answer": doc["caption"],
        "pred": pred,
        "image_id": int(doc["img_id"]),
    }
    return {f"flickr_{name}": entry for name in FLICKR_METRICS}
41
+
42
+
43
def flickr_aggregation_result(results, metric, args):
    """Compute one COCO-caption metric over all flickr30k predictions.

    Args:
        results: list of {"answer", "pred", "image_id"} dicts from
            flickr_process_result.
        metric: one of "Bleu_1".."Bleu_4", "METEOR", "ROUGE_L", "CIDEr",
            "SPICE".
        args: lmms-eval CLI args; used to locate the submission-file
            directory.

    Returns:
        The scalar score for `metric`.  Also writes the predictions to a
        COCO-submission-style JSON file as a side effect.
    """
    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # In order to make the coco eval tools to successfully create index
    # We need at least two dict in the dataset
    # 'annotation' and 'images'
    # 'annotation' exactly reproduce the original annotation
    # 'images' however only need the image id which is contained in the file name
    dataset = {"annotations": [], "images": []}
    idx = 0
    for result in results:
        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
        # Each sample has multiple reference captions; every one becomes a
        # separate annotation with a globally unique id.
        for a in result["answer"]:
            dataset["annotations"].append({"image_id": int(result["image_id"]), "caption": a, "id": idx})
            idx += 1
        dataset["images"].append({"id": int(result["image_id"])})

    coco = COCO()
    # Manually create index here
    coco.dataset = dataset
    coco.createIndex()

    flickr_result = coco.loadRes(stored_results)
    flickr_eval = COCOEvalCap(coco, flickr_result)

    imgIds = flickr_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = flickr_eval.coco.imgToAnns[imgId]
        res[imgId] = flickr_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    # When metric is one of the Bleu, score will be a list
    # (Bleu(4) returns [bleu1..bleu4]); pick the requested n-gram order.
    if type(score) == list:
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    path = generate_submission_file(f"flickr30k_captions_val2014_alg_results_{metric}.json", args)

    eval_logger.info("Storing prediction that can be submitted to the server ...")
    with open(path, "w") as f:
        json.dump(stored_results, f, indent=4)

    return score
97
+
98
+
99
def flickr_bleu4(results, args):
    """Aggregate BLEU-4 over all flickr30k results."""
    return flickr_aggregation_result(results, "Bleu_4", args)


def flickr_bleu3(results, args):
    """Aggregate BLEU-3 over all flickr30k results."""
    return flickr_aggregation_result(results, "Bleu_3", args)


def flickr_bleu2(results, args):
    """Aggregate BLEU-2 over all flickr30k results."""
    return flickr_aggregation_result(results, "Bleu_2", args)


def flickr_bleu1(results, args):
    """Aggregate BLEU-1 over all flickr30k results."""
    return flickr_aggregation_result(results, "Bleu_1", args)


def flickr_meteor(results, args):
    """Aggregate METEOR over all flickr30k results."""
    return flickr_aggregation_result(results, "METEOR", args)


def flickr_rougel(results, args):
    """Aggregate ROUGE-L over all flickr30k results."""
    return flickr_aggregation_result(results, "ROUGE_L", args)


def flickr_cider(results, args):
    """Aggregate CIDEr over all flickr30k results."""
    return flickr_aggregation_result(results, "CIDEr", args)


def flickr_spice(results, args):
    """Aggregate SPICE over all flickr30k results (currently disabled in the yaml)."""
    return flickr_aggregation_result(results, "SPICE", args)
129
+
130
+
131
def flickr_test_process_result(doc, result):
    """Pass a test-split prediction through unchanged for later submission.

    Args:
        doc: a instance of the eval dataset.
        result: [pred].

    Returns:
        {"flickr_passthrough": {"pred": ..., "image_id": ...}} where the
        image id is the dataset's img_id field (the image file itself).
    """
    return {"flickr_passthrough": {"pred": result, "image_id": doc["img_id"]}}
EAGLE/lmms_eval/tasks/hallusion_bench/evaluate_hb.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import logging
4
+ from tqdm import tqdm
5
+
6
+ from lmms_eval.tasks.hallusion_bench.utils import evaluate_by_chatgpt, check_same_by_chatgpt, assign_correctness, get_eval_all, get_eval_fig, get_eval_pair_all
7
+
8
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
9
+ output_entry = "model_prediction"
10
+ correctness_entry = "gpt4v_output_gpt_check"
11
+
12
+ metric = ["aAcc", "fAcc", "qAcc"]
13
+
14
+ eval_logger = logging.getLogger("lmms-eval")
15
+
16
+
17
def hb_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Wrap the sample's question with optional model-specific pre/post prompts."""
    kwargs = model_specific_prompt_kwargs if model_specific_prompt_kwargs is not None else {}
    prefix = kwargs.get("pre_prompt", "")
    suffix = kwargs.get("post_prompt", "")
    return f"{prefix}{doc['question']}{suffix}"
23
+
24
+
25
def hb_doc_to_visual(doc):
    """Return the sample's image, converted to RGB, as a one-element list."""
    image = doc["image"]
    return [image.convert("RGB")]
27
+
28
+
29
def hb_process_results(doc, result):
    """Attach the model prediction to the sample and emit it under every metric key.

    All metric keys (aAcc/fAcc/qAcc) share the same annotated sample dict;
    the actual scoring happens later in the aggregation functions.
    """
    prediction = result[0]
    sample = doc  # annotated in place
    sample["model_prediction"] = prediction
    return {metric_name: sample for metric_name in metric}
34
+
35
+
36
def hb_aggregation_result(results, metric, args):
    """GPT-judged HallusionBench accuracy for one metric.

    Splits the samples into Visual Dependent (VD) and Visual Supplement (VS),
    has GPT grade each model prediction (responses cached under
    args.output_path/gpt_response so reruns resume), then scores via the
    hallusion_bench.utils helpers.

    Args:
        results: annotated samples from hb_process_results.
        metric: "aAcc", "fAcc" or "qAcc" (per-answer, per-figure and
            per-question-group accuracy respectively -- see get_eval_all /
            get_eval_fig / get_eval_pair_all).
        args: lmms-eval CLI args; args.output_path hosts the GPT cache.

    Returns:
        Percentage rounded to 4 decimals; implicitly None for an unknown
        metric.
    """
    data_vd = []
    data_vs = []
    for data in tqdm(results, desc="Split vd and vs"):
        if data["category"] == "VD":
            data_vd.append(data)
        if data["category"] == "VS":
            data_vs.append(data)
    eval_logger.info("Do gpt eval vd ...")
    path = os.path.join(args.output_path, "gpt_response")
    os.makedirs(path, exist_ok=True)
    save_json_path_vd = f"{path}/hallusion_output_vd_model.json"
    save_json_path_vs = f"{path}/hallusion_output_vs_model.json"
    data_vd = evaluate_by_chatgpt(data_vd, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, save_json_path=save_json_path_vd)
    # data_vd = check_same_by_chatgpt(data_vd, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vd)
    data_vd = assign_correctness(data_vd, correctness_entry=correctness_entry)
    eval_logger.info("Do gpt eval vs")
    data_vs = evaluate_by_chatgpt(data_vs, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, save_json_path=save_json_path_vs)
    # data_vs = check_same_by_chatgpt(data_vs, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vs)
    data_vs = assign_correctness(data_vs, correctness_entry=correctness_entry)
    results = data_vs + data_vd

    if metric == "aAcc":
        all_data = get_eval_all(results, model_correctness_entry=correctness_entry)
        return round(100 * all_data["correct"] / all_data["total"], 4)
    elif metric == "fAcc":
        fig_all = get_eval_fig(results)
        return round(100 * fig_all["correct"] / fig_all["total"], 4)
    elif metric == "qAcc":
        all_data = get_eval_pair_all(results, model_correctness_entry=correctness_entry)
        return round(100 * all_data["correct"] / all_data["total"], 4)
67
+
68
+
69
def hb_aggregation_result_qAcc(results, args):
    """GPT-judged all-or-nothing accuracy per question group."""
    return hb_aggregation_result(results, "qAcc", args)


def hb_aggregation_result_fAcc(results, args):
    """GPT-judged all-or-nothing accuracy per figure."""
    return hb_aggregation_result(results, "fAcc", args)


def hb_aggregation_result_aAcc(results, args):
    """GPT-judged per-answer accuracy."""
    return hb_aggregation_result(results, "aAcc", args)
79
+
80
+
81
def hb_aggregation_result_intern(results, metric):
    """Heuristic (non-GPT) HallusionBench accuracy.

    A prediction is binarized to "yes" ("1") when it contains the substring
    "yes" case-insensitively, else "no" ("0"), and counts as correct when it
    matches gt_answer.  Each result is mutated in place: the binarized
    prediction is stored under "answer".

    Args:
        results: annotated samples from hb_process_results.
        metric:
            "aAcc" - per-answer accuracy,
            "qAcc" - all-or-nothing accuracy per question group
                     (category/subcategory/set_id/question_id),
            "fAcc" - all-or-nothing accuracy per figure group
                     (category/subcategory/set_id/figure_id).

    Returns:
        Accuracy in [0, 1]; implicitly None for an unknown metric (matches
        the original behavior).
    """
    scores = []
    for result in results:
        ans = "1" if "yes" in result["model_prediction"].lower() else "0"
        scores.append(ans == result["gt_answer"])
        result["answer"] = ans

    if metric == "aAcc":
        return sum(scores) / len(scores)
    elif metric == "qAcc":
        return _hb_grouped_accuracy(results, "question_id")
    elif metric == "fAcc":
        return _hb_grouped_accuracy(results, "figure_id")


def _hb_grouped_accuracy(results, id_field):
    """All-or-nothing accuracy: a group is correct only if every answer in it is.

    Groups by category/subcategory/set_id plus the given id field; replaces
    the original duplicated try/except-KeyError grouping with setdefault.
    """
    groups = {}
    for r in results:
        key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r[id_field])])
        groups.setdefault(key, []).append(r["answer"] == r["gt_answer"])
    per_group = [min(v) for v in groups.values()]
    return sum(per_group) / len(per_group)
115
+
116
+
117
def hb_aggregation_result_qAcc_intern(results):
    """Heuristic all-or-nothing accuracy per question group (no GPT judge)."""
    eval_logger.info("Calculating qAcc ...")
    return hb_aggregation_result_intern(results, "qAcc")


def hb_aggregation_result_fAcc_intern(results):
    """Heuristic all-or-nothing accuracy per figure group (no GPT judge)."""
    eval_logger.info("Calculating fAcc ...")
    return hb_aggregation_result_intern(results, "fAcc")


def hb_aggregation_result_aAcc_intern(results):
    """Heuristic per-answer accuracy (no GPT judge)."""
    eval_logger.info("Calculating aAcc ...")
    return hb_aggregation_result_intern(results, "aAcc")
EAGLE/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/HallusionBench
2
+ dataset_kwargs:
3
+ token: True
4
+ task: "hallusion_bench_image"
5
+ test_split: image
6
+ output_type: generate_until
7
+ doc_to_visual: !function evaluate_hb.hb_doc_to_visual
8
+ doc_to_text: !function evaluate_hb.hb_doc_to_text
9
+ doc_to_target: "gt_answer_details"
10
+ process_results: !function evaluate_hb.hb_process_results
11
+ model_specific_prompt_kwargs:
12
+ default:
13
+ pre_prompt: ""
14
+ post_prompt: ""
15
+ generation_kwargs:
16
+ max_new_tokens: 128
17
+ temperature: 0
18
+ top_p: 0
19
+ num_beams: 1
20
+ do_sample: false
21
+ metric_list:
22
+ - metric: aAcc
23
+ aggregation: !function evaluate_hb.hb_aggregation_result_aAcc
24
+ higher_is_better: true
25
+ - metric: qAcc
26
+ aggregation: !function evaluate_hb.hb_aggregation_result_qAcc
27
+ higher_is_better: true
28
+ - metric: fAcc
29
+ aggregation: !function evaluate_hb.hb_aggregation_result_fAcc
30
+ higher_is_better: true
31
+ # - metric: aAcc
32
+ # aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern
33
+ # higher_is_better: true
34
+ # - metric: qAcc
35
+ # aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern
36
+ # higher_is_better: true
37
+ # - metric: fAcc
38
+ # aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern
39
+ # higher_is_better: true
40
+ metadata:
41
+ - version: 0.0
EAGLE/lmms_eval/tasks/hallusion_bench/utils.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ from tqdm import tqdm
4
+ import numpy as np
5
+ import os
6
+ import time
7
+ import openai
8
+ import threading
9
+ import requests
10
+ import logging
11
+
12
+ API_TYPE = os.getenv("API_TYPE", "openai")
13
+
14
+ if API_TYPE == "openai":
15
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
16
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
17
+ headers = {
18
+ "Authorization": f"Bearer {API_KEY}",
19
+ "Content-Type": "application/json",
20
+ }
21
+ elif API_TYPE == "azure":
22
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
23
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
24
+ headers = {
25
+ "api-key": API_KEY,
26
+ "Content-Type": "application/json",
27
+ }
28
+
29
+ eval_logger = logging.getLogger("lmms-eval")
30
+
31
+
32
def evaluate_by_chatgpt(data, output_entry, correctness_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    """Judge each sample's prediction against its reference answer via the GPT API.

    Args:
        data: list of sample dicts; each must provide "question",
            "gt_answer_details" and the prediction under ``output_entry``.
        output_entry: key in each sample holding the model prediction.
        correctness_entry: key under which the verdict is stored as a string:
            "0" (incorrect), "1" (correct), "2" (unclear).
        gpt_model: model name for the OpenAI endpoint (Azure endpoints fix the
            model server-side, so it is omitted there).
        load_json: resume from a previously saved partial result file.
        save_json_path: path the growing result list is dumped to after every
            sample, so an interrupted run can be resumed with load_json=True.
        retries: attempts per sample before giving up on that sample.

    Returns:
        The list of samples, each augmented with the verdict and the raw
        prompt + judge output under "gpt_answer".
    """
    if load_json and os.path.exists(save_json_path):
        with open(save_json_path, "r") as f:
            output = json.load(f)
    else:
        output = []
    # Skip samples already judged in a previous (partial) run.
    for sample in tqdm(data[len(output):], desc="Eval by GPT"):
        prompt = "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. "
        prompt += 'If the prediction answer does not conflict with the reference answer, please generate “correct”. If the prediction answer conflict with the reference answer, please generate “incorrect”. If the prediction answer is unclear about the answer, please generate "unclear". \n\n Question:'
        prompt += sample["question"]
        prompt += "\nReference answer: "
        prompt += sample["gt_answer_details"]
        prompt += "\nPrediction answer:"
        prompt += sample[output_entry]
        prompt += "\nOutput:"

        # Fix: reset the response for every sample. Previously, if all retries
        # failed, the previous sample's response was silently reused (or, for
        # the very first sample, an unbound name was dereferenced).
        response = None
        # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
        for attempt in range(retries):
            try:
                messages = [{"role": "user", "content": prompt}]
                payload = {
                    "messages": messages,
                    "max_tokens": 16,
                }
                # set model when using openai api_key. Azure api_key does not need model since the endpoint fixed the model.
                if API_TYPE == "openai":
                    payload["model"] = gpt_model
                http_response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
                http_response.raise_for_status()
                response = http_response.json()
                break
            except Exception as e:
                eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                    time.sleep(5)
                else:  # If this was the last attempt, log and fall through with response=None
                    eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")

        try:
            output_text = response["choices"][0]["message"]["content"]
        except Exception as e:
            # response is None (all retries failed) or has an unexpected shape.
            eval_logger.info(f"Get error {str(e)} when extracting response")
            output_text = "unclear"

        # Order matters: "incorrect" contains "correct", so test it first.
        if "incorrect" in output_text.lower():
            gpt_correctness = "0"
        elif "correct" in output_text.lower():
            gpt_correctness = "1"
        else:
            gpt_correctness = "2"

        sample[correctness_entry] = gpt_correctness
        sample["gpt_answer"] = prompt + output_text

        output.append(sample)

        # Persist after every sample so a crash can be resumed via load_json.
        with open(save_json_path, "w") as f:
            json.dump(output, f, indent=4)

    return output
92
+
93
+
94
def check_same_by_chatgpt(data, output_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    """Ask GPT whether each sample's response is consistent with the
    figure-less (figure_id == "0") response of the same question set.

    Stores "1" (same) or "0" (different) under each sample's "same" key and
    rewrites *data* to ``save_json_path`` after every judged sample.

    Args:
        data: list of sample dicts sharing keys "category", "subcategory",
            "set_id", "question_id", "figure_id" and ``output_entry``.
        output_entry: key holding the model response to compare.
        gpt_model: model name sent in the request payload.
        load_json: accepted for signature symmetry; not used in this function.
        save_json_path: JSON file the (mutated) data list is dumped to.
        retries: attempts per sample before falling back to "different".

    Returns:
        The same list, with a "same" entry on every sample.
    """
    orig_response = {}

    # Collect the baseline response of each question set: the variant shown
    # without a figure (figure_id == "0").
    for r in data:
        if str(r["figure_id"]) == "0":
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
            orig_response[key] = r[output_entry]

    for sample in tqdm(data, desc="Check same by GPT"):
        # Skip samples already judged (e.g. from a resumed run).
        if "same" not in sample.keys():
            key = "_".join([sample["category"], sample["subcategory"], str(sample["set_id"]), str(sample["question_id"])])
            response2 = orig_response[key]

            prompt = "Imagine you are an intelligent teacher. Thoroughly read the two responses to two different questions. Assess the consistency of the information provided within those two responses. "
            prompt += "You do not know the specific questions, but you can asssess the consistency among the two responses by checking for logical conflicts if both responses are correct. "
            prompt += 'If response1 does not conflict with response2, please generate “same”. Otherwise, generate "different". \n\n response1:'
            prompt += sample[output_entry]
            prompt += "\nresponse2: "
            prompt += response2
            prompt += "\nOutput:"

            # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
            for attempt in range(retries):
                try:
                    # NOTE(review): this always builds an Azure-style "api-key"
                    # header, shadowing the module-level Bearer header even when
                    # API_TYPE == "openai" — confirm whether that is intended.
                    headers = {
                        "api-key": API_KEY,
                        "Content-Type": "application/json",
                    }

                    messages = [{"role": "user", "content": prompt}]

                    payload = {
                        "model": gpt_model,
                        "messages": messages,
                        "max_tokens": 16,
                    }
                    response = requests.post(API_URL, headers=headers, json=payload)
                    response.raise_for_status()
                    response = response.json()

                    break
                except Exception as e:
                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                        time.sleep(5)
                    else:  # If this was the last attempt, log and return empty
                        # NOTE(review): on total failure, `response` keeps the
                        # value from a previous sample (or is unbound for the
                        # first one); the except below then falls back to
                        # "different" — confirm this fallback is acceptable.
                        eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")

            try:
                output_text = response["choices"][0]["message"]["content"]
            except Exception as e:
                eval_logger.info(f"Get error {str(e)} when extracting response")
                output_text = "different"

            # Default to "different"; "same" is checked first and wins if present.
            gpt_same = "0"

            if "same" in output_text.lower():
                gpt_same = "1"

            elif "different" in output_text.lower():
                gpt_same = "0"

            sample["same"] = gpt_same

            # Persist incrementally so a crash loses at most one sample.
            with open(save_json_path, "w") as f:
                json.dump(data, f, indent=4)

    return data
162
+
163
+
164
def assign_correctness(data_arr, correctness_entry):
    """Derive a binary "correct" flag for every record in *data_arr*.

    The raw verdict under *correctness_entry* is 0 (incorrect), 1 (correct)
    or 2 (unclear). For visual-supplement ("VS") questions shown without a
    figure, "unclear" also counts as correct — the model cannot know the
    answer without the visual.
    """
    for record in data_arr:
        verdict = int(record[correctness_entry])
        assert verdict in (0, 1, 2)
        figureless_vs = record["category"] == "VS" and int(record["figure_id"]) == 0
        if figureless_vs:
            record["correct"] = 1 if verdict in (1, 2) else 0
        else:
            record["correct"] = 1 if verdict == 1 else 0
    return data_arr
172
+
173
+
174
def get_eval_fig(data):  # per figure
    """Aggregate per-figure (consistency-test) accuracy.

    Groups records by (category, subcategory, set_id, figure_id); a figure is
    "correct" only if every question on it is correct, "wrong" if none are,
    and "inconsistent" otherwise. "score" is the mean per-figure accuracy.
    Figure-less VS records (figure_id == "0") are excluded.
    """
    per_figure = {}
    for record in data:
        # VS questions without a figure have nothing to be consistent about.
        if record["category"] == "VS" and str(record["figure_id"]) == "0":
            continue
        key = "_".join([record["category"], record["subcategory"], str(record["set_id"]), str(record["figure_id"])])
        n_correct, n_total = per_figure.get(key, (0, 0))
        per_figure[key] = (n_correct + record["correct"], n_total + 1)

    stat = {
        "note": "all accuracy per image (consistency test)",
        "total": len(per_figure),
        "correct": 0,
        "wrong": 0,
        "inconsistent": 0,
        "score": 0,
    }

    for n_correct, n_total in per_figure.values():
        if n_correct == n_total:
            stat["correct"] += 1
        elif n_correct == 0:
            stat["wrong"] += 1
        else:
            stat["inconsistent"] += 1
        stat["score"] += n_correct / n_total

    stat["score"] = stat["score"] / stat["total"]
    return stat
206
+
207
+
208
def get_eval_all(data, model_correctness_entry):  # per question
    """Per-question accuracy plus a breakdown of failure counts.

    Buckets (counts, not rates) — the names suggest "LH" = language
    hallucination, "VI" = visual illusion, "Mix" = both; confirm against the
    HallusionBench definitions:
      - VD (visual dependent) questions: wrong/unclear without a figure
        counts toward "VI"; with a figure, wrong -> "Mix", unclear -> "VI".
      - VS (visual supplement) questions: wrong without visual input -> "LH";
        with visual input, wrong -> "Mix", unclear -> "VI".

    The raw verdict under *model_correctness_entry* is "0" (incorrect),
    "1" (correct) or "2" (unclear); "correct" is the binary flag produced by
    assign_correctness.
    """
    eval_all_dict = dict()
    eval_all_stat = {}
    eval_all_stat["LH"] = 0
    eval_all_stat["VI"] = 0
    eval_all_stat["Mix"] = 0

    for r in data:
        # One entry per (category, subcategory, set, figure, question);
        # a duplicate key indicates corrupted input data.
        name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["figure_id"]), str(r["question_id"])])
        assert name not in eval_all_dict

        eval_all_dict[name] = r["correct"]

        if str(r["category"]) == "VD":  # VD: visual-dependent questions
            if str(r["figure_id"]) == "0":
                if str(r[model_correctness_entry]) == "0" or str(r[model_correctness_entry]) == "2":
                    eval_all_stat["VI"] += 1
            else:
                if str(r[model_correctness_entry]) == "0":
                    eval_all_stat["Mix"] += 1
                elif str(r[model_correctness_entry]) == "2":
                    eval_all_stat["VI"] += 1
        else:  # VS: visual-supplement questions
            if str(r["visual_input"]) == "0":  # no visual
                if str(r[model_correctness_entry]) == "0":
                    eval_all_stat["LH"] += 1
            else:  # original visual or modified visual (visual_input == 1 or 2)
                if str(r[model_correctness_entry]) == "0":
                    eval_all_stat["Mix"] += 1
                elif str(r[model_correctness_entry]) == "2":
                    eval_all_stat["VI"] += 1

    eval_all_stat["note"] = "all accuracy per question"
    eval_all_stat["total"] = len(eval_all_dict.keys())
    eval_all_stat["correct"] = np.count_nonzero(list(eval_all_dict.values()))
    eval_all_stat["wrong"] = eval_all_stat["total"] - eval_all_stat["correct"]

    return eval_all_stat
246
+
247
+
248
def get_eval_pair_all(data, model_correctness_entry):  # per question pair
    """Per question-pair accuracy.

    Records sharing (category, subcategory, set_id, question_id) form one
    "pair"; the pair is correct only when every member is correct.

    NOTE(review): lh_counter / vi_counter / both_counter are never incremented
    in this function, so "LH_cg" / "VI_cg" / "Mix_cg" (and "LH"/"VI"/"Mix")
    are always 0 here — possibly vestigial, or filled by a caller; confirm.
    """
    orig_correctness = dict()
    counter = 0
    lh_counter = 0
    vi_counter = 0
    both_counter = 0

    # Verdicts for the figure-less variant of each question set
    # (collected but not consumed below — see NOTE above).
    for r in data:
        if str(r["figure_id"]) == "0":
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
            orig_correctness[key] = r[model_correctness_entry]

    get_eval_pair_dict = dict()

    # Accumulate (num_correct, num_total) per question pair.
    for r in data:
        name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
        if name in get_eval_pair_dict:
            c, t = get_eval_pair_dict[name]
            get_eval_pair_dict[name] = (c + r["correct"], t + 1)
        else:
            get_eval_pair_dict[name] = (r["correct"], 1)
        counter += 1

    eval_all_pair_stat = {}
    eval_all_pair_stat["note"] = "all accuracy per question pair"
    eval_all_pair_stat["total"] = len(get_eval_pair_dict.keys())
    eval_all_pair_stat["total_q"] = counter
    eval_all_pair_stat["correct"] = 0
    eval_all_pair_stat["wrong"] = 0
    eval_all_pair_stat["LH"] = 0
    eval_all_pair_stat["VI"] = 0
    eval_all_pair_stat["Mix"] = 0

    eval_all_pair_stat["LH_cg"] = lh_counter
    eval_all_pair_stat["VI_cg"] = vi_counter
    eval_all_pair_stat["Mix_cg"] = both_counter

    # A pair counts as correct only if all of its questions were correct.
    for k in get_eval_pair_dict.keys():
        v = get_eval_pair_dict[k]
        if v[0] == v[1]:
            eval_all_pair_stat["correct"] += 1
        else:
            eval_all_pair_stat["wrong"] += 1

    return eval_all_pair_stat
EAGLE/lmms_eval/tasks/iconqa/utils.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+
5
def options_to_str(options_prompt):
    """Render options as lettered lines, e.g. "A. cat\\nB. dog"."""
    rendered = "".join(f"{chr(ord('A') + idx)}. {option}\n" for idx, option in enumerate(options_prompt))
    # Drop the trailing newline(s) left by the per-option terminator.
    return rendered.rstrip("\n")
13
+
14
+
15
def doc_to_visual(doc):
    """Collect the query image plus any of the five choice images, RGB-converted."""
    image_list = []
    if "query_image" in doc:
        image_list.append(doc["query_image"].convert("RGB"))
    for idx in range(5):
        field = f"choice_image_{idx}"
        if field in doc and doc[field] is not None:
            image_list.append(doc[field].convert("RGB"))
    assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA"
    return image_list
25
+
26
+
27
def doc_to_text(doc, model_specific_prompt_kwargs):
    """Build the ICON-QA prompt for one sample.

    Args:
        doc: sample with "question", "ques_type" and, for text choices, a
            comma-separated "choices" string.
        model_specific_prompt_kwargs: dict with "pre_prompt", "statement",
            "options_statement" (format keys: question, options) and
            "freeform_statement" (format key: question).

    Returns:
        The fully formatted prompt string.

    Raises:
        ValueError: for an unrecognized ques_type. (Previously an unknown
        type fell through and crashed with an UnboundLocalError.)
    """
    question = doc["question"]
    ques_type = doc["ques_type"]
    options_prompt = []

    if ques_type == "choose_img":
        options_prompt.append("The first image.")
        options_prompt.append("The second image.")

        options_str = options_to_str(options_prompt)
        full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"

    elif ques_type == "choose_txt":
        choices = doc["choices"].split(",")
        for i, choice in enumerate(choices):
            options_prompt.append(f"{choice}")

        options_str = options_to_str(options_prompt)
        full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"

    elif ques_type == "fill_in_blank":
        full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}"

    else:
        # Fix: fail loudly instead of an UnboundLocalError on full_prompt.
        raise ValueError(f"Unknown ques_type: {ques_type}")

    return full_prompt
51
+
52
+
53
+ def test_process_results(doc, results):
54
+ pred = results[0]
55
+ questionId = doc["question_id"]
56
+ answer = doc["answer"]
57
+ return {"anls": {"questionId": int(questionId), "answer": answer, "pred_answer": pred}}
EAGLE/lmms_eval/tasks/mme/mme.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/MME
2
+ dataset_kwargs:
3
+ token: True
4
+ task: "mme"
5
+ test_split: test
6
+ output_type: generate_until
7
+ doc_to_visual: !function utils.mme_doc_to_visual
8
+ doc_to_text: !function utils.mme_doc_to_text
9
+ doc_to_target: "answer"
10
+ generation_kwargs:
11
+ max_new_tokens: 16
12
+ temperature: 0
13
+ top_p: 0
14
+ num_beams: 1
15
+ do_sample: false
16
+ # The return value of process_results will be used by metrics
17
+ process_results: !function utils.mme_process_results
18
+ # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19
+ metric_list:
20
+ - metric: mme_percetion_score
21
+ aggregation: !function utils.mme_aggregate_results
22
+ higher_is_better: true
23
+ - metric: mme_cognition_score
24
+ aggregation: !function utils.mme_aggregate_results
25
+ higher_is_better: true
26
+ model_specific_prompt_kwargs:
27
+ default:
28
+ pre_prompt: ""
29
+ post_prompt: "\nAnswer the question using a single word or phrase."
30
+ qwen_vl:
31
+ pre_prompt: ""
32
+ post_prompt: " Answer:"
33
+ otterhd:
34
+ pre_prompt: ""
35
+ post_prompt: " Answer:"
36
+ metadata:
37
+ - version: 0.0
EAGLE/lmms_eval/tasks/mme/utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import os
3
+ import datetime
4
+ import json
5
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
6
+
7
+ import logging
8
+
9
+ eval_logger = logging.getLogger("lmms-eval")
10
+
11
+ dir_name = os.path.dirname(os.path.abspath(__file__))
12
+
13
+ eval_type_dict = {
14
+ "Perception": [
15
+ "existence",
16
+ "count",
17
+ "position",
18
+ "color",
19
+ "posters",
20
+ "celebrity",
21
+ "scene",
22
+ "landmark",
23
+ "artwork",
24
+ "OCR",
25
+ ],
26
+ "Cognition": [
27
+ "commonsense_reasoning",
28
+ "numerical_calculation",
29
+ "text_translation",
30
+ "code_reasoning",
31
+ ],
32
+ }
33
+
34
+
35
+ replace_prompt = " Please answer yes or no."
36
+
37
+
38
def mme_doc_to_visual(doc):
    """Return the sample's image as a one-element list, converted to RGB."""
    rgb = doc["image"].convert("RGB")
    return [rgb]
40
+
41
+
42
def mme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the final MME prompt: optional pre/post prompts around the question.

    When a non-empty pre_prompt or post_prompt is supplied, the dataset's
    built-in " Please answer yes or no." suffix (replace_prompt) is stripped
    first so it is not duplicated or misplaced.

    Args:
        doc: sample dict with a "question" string.
        model_specific_prompt_kwargs: optional dict with "pre_prompt" and/or
            "post_prompt" strings.

    Returns:
        The formatted question string.
    """
    if model_specific_prompt_kwargs is None:
        # Fix: the original dereferenced None ("in None") when no kwargs
        # were supplied, despite declaring None as the default.
        model_specific_prompt_kwargs = {}
    question = doc["question"].strip()
    if "pre_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["pre_prompt"] != "":
        question = question.replace(replace_prompt, "")
        question = f"{model_specific_prompt_kwargs['pre_prompt']}{question}"
    if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"] != "":
        question = question.replace(replace_prompt, "")
        question = f"{question}{model_specific_prompt_kwargs['post_prompt']}"
    return question
51
+
52
+
53
def parse_pred_ans(pred_ans):
    """Brought from Otter Eval: normalize a free-form answer to "yes"/"no"/"other".

    Exact matches win; otherwise only the first four characters are scanned
    so that e.g. "Yes, it is" still maps to "yes".
    """
    normalized = pred_ans.lower().strip().replace(".", "")
    if normalized in ("yes", "no"):
        return normalized
    head = normalized[:4]
    if "yes" in head:
        return "yes"
    if "no" in head:
        return "no"
    return "other"
68
+
69
+
70
def mme_process_results(doc, results):
    """
    Args:
        doc: a instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case mme score), value: metric value
    """
    parsed_pred = parse_pred_ans(results[0])
    gold = doc["answer"].lower().strip().replace(".", "")
    assert gold in ["yes", "no"]
    assert parsed_pred in ["yes", "no", "other"]
    category = doc["category"]
    # The key routes this result to the matching aggregation function.
    # (Keep the historical "percetion" spelling — it is the metric key used
    # throughout the task config.)
    if category in eval_type_dict["Perception"]:
        key_name = "mme_percetion_score"
    else:
        key_name = "mme_cognition_score"
    # We note down the question id/category to help us aggregate the results later
    score = 1.0 if parsed_pred == gold else 0.0
    return {key_name: {"question_id": doc["question_id"], "category": category, "score": score}}
89
+
90
+
91
def mme_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        A score (sum over categories of per-image acc + acc_plus, each in [0, 100])
    """
    # category -> question_id -> [score, score] (two questions per image).
    category2score = defaultdict(dict)
    for entry in results:
        by_question = category2score[entry["category"]]
        by_question.setdefault(entry["question_id"], []).append(entry["score"])

    category2avg_score = {}
    for category, question2scores in category2score.items():
        subtotal = 0
        for scores in question2scores.values():
            assert len(scores) == 2  # MME pairs exactly two questions per image
            acc = sum(scores) / len(scores) * 100.0
            # acc_plus is awarded only when both questions were answered correctly.
            acc_plus = (sum(scores) == 2) * 100.0
            subtotal += acc + acc_plus
        category2avg_score[category] = subtotal / len(question2scores)

    for category, avg_score in category2avg_score.items():
        eval_logger.info(f"{category}: {avg_score:.2f}")
    return sum(category2avg_score.values())
EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ group: multidocvqa
2
+ task:
3
+ - multidocvqa_val
4
+ - multidocvqa_test
EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/MP-DocVQA
2
+ task: "multidocvqa_test"
3
+ test_split: test
4
+ output_type: generate_until
5
+ doc_to_visual: !function utils.multidocvqa_doc_to_visual
6
+ doc_to_text: !function utils.multidocvqa_doc_to_text
7
+ doc_to_target: "answers"
8
+ generation_kwargs:
9
+ max_new_tokens: 32
10
+ temperature: 0
11
+ do_sample: False
12
+ process_results: !function utils.multidocvqa_process_test_results_for_submission
13
+ metric_list:
14
+ - metric: submission
15
+ aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission
16
+ model_specific_prompt_kwargs:
17
+ default:
18
+ pre_prompt: ""
19
+ post_prompt: "\nAnswer the question using a single word or phrase."
20
+
EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/MP-DocVQA
2
+ task: "multidocvqa_val"
3
+ test_split: val
4
+ output_type: generate_until
5
+ doc_to_visual: !function utils.multidocvqa_doc_to_visual
6
+ doc_to_text: !function utils.multidocvqa_doc_to_text
7
+ doc_to_target: "answers"
8
+ generation_kwargs:
9
+ max_new_tokens: 32
10
+ temperature: 0
11
+ do_sample: False
12
+ process_results: !function utils.multidocvqa_process_results
13
+ metric_list:
14
+ - metric: anls
15
+ aggregation: !function utils.multidocvqa_aggregate_results_anls
16
+ higher_is_better: true
17
+ - metric: accuracy
18
+ aggregation: !function utils.multidocvqa_aggregate_results_accuracy
19
+ higher_is_better: true
20
+ model_specific_prompt_kwargs:
21
+ default:
22
+ pre_prompt: ""
23
+ post_prompt: "\nAnswer the question using a single word or phrase."
EAGLE/lmms_eval/tasks/multidocvqa/utils.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import ast
4
+ import json
5
+ import logging
6
+ from lmms_eval.api.metrics import levenshtein_distance
7
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
8
+
9
+ lmms_logger = logging.getLogger("lmms-eval")
10
+
11
+
12
def multidocvqa_doc_to_text(doc, model_specific_prompt_kwargs):
    """Wrap the question with the model-specific pre/post prompt strings."""
    parts = (
        model_specific_prompt_kwargs["pre_prompt"],
        doc["question"],
        model_specific_prompt_kwargs["post_prompt"],
    )
    return "".join(parts)
18
+
19
+
20
def multidocvqa_doc_to_visual(doc):
    """Return the non-null pages image_1..image_20, converted to RGB."""
    pages = []
    for page_no in range(1, 21):
        page = doc[f"image_{page_no}"]
        if page is not None:
            pages.append(page.convert("RGB"))
    return pages
22
+
23
+
24
def multidocvqa_process_results(doc, results):
    """Pair the prediction with the parsed gold answers for ANLS and accuracy."""
    prediction = results[0]
    # "answers" is stored as a Python-literal string, e.g. "['a', 'b']".
    gold_answers = ast.literal_eval(doc["answers"])
    record = {"questionId": int(doc["questionId"]), "answer": gold_answers, "pred_answer": prediction}
    return {"anls": record, "accuracy": dict(record)}
29
+
30
+
31
def multidocvqa_aggregate_results_anls(results):
    """Mean ANLS over all collected (gold answers, prediction) pairs."""
    # Pivot the list of row-dicts into column lists keyed by field name.
    field_names = {field for row in results for field in row}
    columns = {field: [row.get(field, None) for row in results] for field in field_names}
    evaluator = Evaluator(case_sensitive=False)
    metric = evaluator.get_metrics(columns["answer"], columns["pred_answer"])
    return sum(metric["anls"]) / len(metric["anls"])
38
+
39
+
40
def multidocvqa_aggregate_results_accuracy(results):
    """Mean exact-match accuracy over all collected (gold answers, prediction) pairs."""
    # Pivot the list of row-dicts into column lists keyed by field name.
    field_names = {field for row in results for field in row}
    columns = {field: [row.get(field, None) for row in results] for field in field_names}
    evaluator = Evaluator(case_sensitive=False)
    metric = evaluator.get_metrics(columns["answer"], columns["pred_answer"])
    return sum(metric["accuracy"]) / len(metric["accuracy"])
47
+
48
+
49
def multidocvqa_process_test_results_for_submission(doc, results):
    """Format one test-split prediction for the MP-DocVQA submission server."""
    entry = {
        "questionId": int(doc["questionId"]),
        "answer": results[0],
        "answer_page": None,  # page prediction is not produced by this task
    }
    return {"submission": entry}
52
+
53
+
54
def multidocvqa_test_aggregate_results_for_submission(results, args):
    """Dump every test prediction to the submission JSON file and log its path."""
    submission_path = generate_submission_file("multidocvqa_test_for_submission.json", args)
    with open(submission_path, "w") as f:
        json.dump(results, f)
    lmms_logger.info(f"Results saved to {submission_path}.")
59
+
60
+
61
+ ##################
62
+ # Helper functions
63
+ ##################
64
+
65
+
66
class Evaluator:
    """Computes exact-match accuracy and ANLS for DocVQA-style answers."""

    def __init__(self, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.get_edit_distance = levenshtein_distance
        # Similarities below this threshold score 0, per the ANLS definition.
        self.anls_threshold = 0.5

    def get_metrics(self, gt_answers, preds):
        """Return {"accuracy": [...], "anls": [...]}, one entry per prediction."""
        accuracies = []
        anls_scores = []
        for idx in range(len(preds)):
            gold = [self._preprocess_str(candidate) for candidate in gt_answers[idx]]
            prediction = self._preprocess_str(preds[idx])
            accuracies.append(self._calculate_accuracy(gold, prediction))
            anls_scores.append(self._calculate_anls(gold, prediction))
        return {"accuracy": accuracies, "anls": anls_scores}

    def _preprocess_str(self, string):
        """Lowercase (unless case-sensitive) and strip surrounding whitespace."""
        text = string if self.case_sensitive else string.lower()
        return text.strip()

    def _calculate_accuracy(self, gt, pred):
        """1 if the prediction exactly matches any gold answer, else 0."""
        if pred == "none":
            return 0
        return 1 if pred in gt else 0

    def _calculate_anls(self, gt, pred):
        """Best normalized-edit-distance similarity against the gold answers,
        thresholded to 0 below anls_threshold."""
        if len(pred) == 0:
            return 0
        if pred == "none":
            return 0
        similarities = [1 - self.get_edit_distance(candidate, pred) / max(len(candidate), len(pred)) for candidate in gt]
        best = max(similarities)
        return best if best >= self.anls_threshold else 0
112
+
113
+
114
if __name__ == "__main__":
    # Manual smoke test: the first pair is a mismatch, the second an exact match.
    print("-----------------")
    multidocvqa_aggregate_results_anls([{"questionId": 1, "answer": ["answer"], "pred_answer": "pred_answer"}, {"questionId": 2, "answer": ["nswer"], "pred_answer": "nswer"}])
EAGLE/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ model_specific_prompt_kwargs:
2
+ default:
3
+ prompt: "Provide a one-sentence caption for the provided image."
EAGLE/lmms_eval/tasks/nocaps/nocaps.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ group : nocaps
2
+ task:
3
+ - nocaps_test
4
+ - nocaps_val
EAGLE/lmms_eval/tasks/nocaps/nocaps_test.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/NoCaps
2
+ dataset_kwargs:
3
+ token: True
4
+ task : "nocaps_test"
5
+ group : "nocaps_caption"
6
+ test_split: test
7
+ output_type: generate_until
8
+ doc_to_visual: !function utils.nocaps_doc_to_visual
9
+ doc_to_text: !function utils.nocaps_doc_to_text
10
+ doc_to_target: "annotations_captions"
11
+ generation_kwargs:
12
+ max_new_tokens: 64
13
+ temperature: 0
14
+ top_p: 0
15
+ num_beams: 1
16
+ do_sample: false
17
+ process_results: !function utils.nocaps_test_process_result
18
+ # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19
+ metric_list:
20
+ - metric: nocaps_passthrough
21
+ aggregation : !function utils.nocaps_test_aggregation_result
22
+ higher_is_better : true
23
+ metadata:
24
+ - version: 0.0
25
+ include: _default_template_nocaps_yaml
EAGLE/lmms_eval/tasks/nocaps/nocaps_val.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/NoCaps
2
+ dataset_kwargs:
3
+ token: True
4
+ task: "nocaps_val"
5
+ group : "nocaps_caption"
6
+ test_split: validation
7
+ output_type: generate_until
8
+ doc_to_visual: !function utils.nocaps_doc_to_visual
9
+ doc_to_text: !function utils.nocaps_doc_to_text
10
+ doc_to_target: "annotations_captions"
11
+ generation_kwargs:
12
+ max_new_tokens: 64
13
+ temperature: 0
14
+ top_p: 0
15
+ num_beams: 1
16
+ do_sample: false
17
+ process_results: !function utils.nocaps_process_result
18
+ # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19
+ metric_list:
20
+ - metric: nocaps_Bleu_4
21
+ aggregation : !function utils.nocaps_bleu4
22
+ higher_is_better : true
23
+ - metric: nocaps_Bleu_3
24
+ aggregation : !function utils.nocaps_bleu3
25
+ higher_is_better : true
26
+ - metric: nocaps_Bleu_2
27
+ aggregation : !function utils.nocaps_bleu2
28
+ higher_is_better : true
29
+ - metric: nocaps_Bleu_1
30
+ aggregation : !function utils.nocaps_bleu1
31
+ higher_is_better : true
32
+ - metric: nocaps_METEOR
33
+ aggregation : !function utils.nocaps_meteor
34
+ higher_is_better : true
35
+ - metric: nocaps_ROUGE_L
36
+ aggregation : !function utils.nocaps_rougel
37
+ higher_is_better : true
38
+ - metric: nocaps_CIDEr
39
+ aggregation : !function utils.nocaps_cider
40
+ higher_is_better : true
41
+ #- metric: nocaps_SPICE
42
+ # aggregation : !function utils.nocaps_spice
43
+ # higher_is_better : true
44
+ metadata:
45
+ - version: 0.0
46
+ include: _default_template_nocaps_yaml
EAGLE/lmms_eval/tasks/nocaps/utils.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
4
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
5
+ from pycocotools.coco import COCO
6
+
7
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
8
+
9
+ import logging
10
+
11
+ eval_logger = logging.getLogger("lmms-eval")
12
+
13
+ dir_name = os.path.dirname(os.path.abspath(__file__))
14
+
15
+ NOCAPS_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"] # , "SPICE"]
16
+
17
+
18
def nocaps_doc_to_visual(doc):
    """Return the sample's image as a one-element list, converted to RGB."""
    rgb_image = doc["image"].convert("RGB")
    return [rgb_image]
20
+
21
+
22
def nocaps_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Return the fixed per-model caption prompt; the sample itself is unused."""
    prompt = model_specific_prompt_kwargs["prompt"]
    return prompt
25
+
26
+
27
def nocaps_process_result(doc, result):
    """
    Args:
        doc: a instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name, value: metric value
    """
    # Fan the same payload out under every caption-metric key so each
    # aggregation function receives the full set of predictions.
    payload = {
        "answer": doc["annotations_captions"],
        "pred": result[0],
        "image_id": doc["image_id"],  # the question id in our dataset is the image file itself
    }
    return {f"nocaps_{metric}": payload for metric in NOCAPS_METRICS}
42
+
43
+
44
def nocaps_aggregation_result(results, metric, args=None):
    """Score all collected captions with one COCO caption metric.

    Args:
        results: list of dicts from nocaps_process_result
            ({"answer": [reference captions], "pred": caption, "image_id": id}).
        metric: one of "Bleu_1".."Bleu_4", "METEOR", "ROUGE_L", "CIDEr", "SPICE".
        args: lmms-eval CLI args, used to locate the submission output dir.

    Returns:
        The corpus-level score for *metric*. Also writes the raw predictions
        to a JSON file suitable for server submission.
    """
    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # In order to make the coco eval tools to successfully create index
    # We need at least two dict in the dataset
    # 'annotation' and 'images'
    # 'annotation' exactly reproduce the original annotation
    # 'images' however only need the image id which is contained in the file name
    dataset = {"annotations": [], "images": []}
    idx = 0
    for result in results:
        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
        for a in result["answer"]:
            dataset["annotations"].append({"image_id": int(result["image_id"]), "caption": a, "id": idx})
            idx += 1
        dataset["images"].append({"id": result["image_id"]})

    coco = COCO()
    # Manually create index here
    coco.dataset = dataset
    coco.createIndex()

    nocaps_result = coco.loadRes(stored_results)
    nocaps_eval = COCOEvalCap(coco, nocaps_result)

    imgIds = nocaps_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = nocaps_eval.coco.imgToAnns[imgId]
        res[imgId] = nocaps_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    # When metric is one of the Bleu, score will be a list
    # (one entry per n-gram order; pick the one matching the metric name).
    if type(score) == list:
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    path = generate_submission_file(f"nocaps_val_{metric}_scores.json", args)
    eval_logger.info("Storing prediction that can be submitted to the server ...")
    with open(path, "w") as f:
        json.dump(stored_results, f, indent=4)
    eval_logger.info(f"Your result has been saved to {path}.")

    return score
98
+
99
+
100
def nocaps_bleu4(results, args=None):
    """Aggregation hook for the nocaps_Bleu_4 metric."""
    return nocaps_aggregation_result(results, "Bleu_4", args)
102
+
103
+
104
def nocaps_bleu3(results, args=None):
    """Aggregation hook for the nocaps_Bleu_3 metric."""
    return nocaps_aggregation_result(results, "Bleu_3", args)
106
+
107
+
108
def nocaps_bleu2(results, args=None):
    """Aggregation hook for the nocaps_Bleu_2 metric."""
    return nocaps_aggregation_result(results, "Bleu_2", args)
110
+
111
+
112
def nocaps_bleu1(results, args=None):
    """Aggregation hook for the nocaps_Bleu_1 metric."""
    return nocaps_aggregation_result(results, "Bleu_1", args)
114
+
115
+
116
def nocaps_meteor(results, args=None):
    """Aggregation hook for the nocaps_METEOR metric."""
    return nocaps_aggregation_result(results, "METEOR", args)
118
+
119
+
120
def nocaps_rougel(results, args=None):
    """Aggregation hook for the nocaps_ROUGE_L metric."""
    return nocaps_aggregation_result(results, "ROUGE_L", args)
122
+
123
+
124
def nocaps_cider(results, args=None):
    """Aggregation hook for the nocaps_CIDEr metric."""
    return nocaps_aggregation_result(results, "CIDEr", args)
126
+
127
+
128
def nocaps_spice(results, args=None):
    """Aggregation hook for the nocaps_SPICE metric (currently disabled in the task YAML)."""
    return nocaps_aggregation_result(results, "SPICE", args)
130
+
131
+
132
def nocaps_test_process_result(doc, result):
    """
    Args:
        doc: a instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case nocaps_passthrough), value: metric value
    """
    passthrough = {"pred": result[0], "image_id": doc["image_id"]}
    return {"nocaps_passthrough": passthrough}
141
+
142
+
143
def nocaps_test_aggregation_result(results, args=None):
    """Write all test-split captions to a server-submission JSON file.

    Args:
        results: list of dicts from nocaps_test_process_result
            ({"pred": caption, "image_id": id}).
        args: lmms-eval CLI args, used to locate the submission output dir.
    """
    stored_results = []
    for result in results:
        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})

    path = generate_submission_file("nocaps_captions_nocaps_test_alg_results.json", args)
    eval_logger.info("Storing prediction that can be submitted to the server ...")
    with open(path, "w") as f:
        json.dump(stored_results, f, indent=4)

    eval_logger.info(f"Your test result has been stored in {path}. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
EAGLE/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/OK-VQA
2
+ output_type: generate_until
3
+ doc_to_visual: !function utils.ok_vqa_doc_to_visual
4
+ doc_to_text: !function utils.ok_vqa_doc_to_text
5
+ doc_to_target: "answer"
6
+ generation_kwargs:
7
+ until:
8
+ - "ASSISTANT:"
9
+ metric_list:
10
+ - metric: exact_match
11
+ aggregation: mean
12
+ higher_is_better: true
13
+ ignore_case: true
14
+ ignore_punctuation: true
15
+ - metric: submission
16
+ aggregation: !function utils.ok_vqa_aggreate_submissions
17
+ higher_is_better: true
18
+ process_results: !function utils.ok_vqa_process_results
19
+ model_specific_prompt_kwargs:
20
+ default:
21
+ pre_prompt: ""
22
+ post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
23
+ metadata:
24
+ - version: 0.0
EAGLE/lmms_eval/tasks/ok_vqa/_generate_config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+
4
splits = ["val2014"]
tasks = ["vqa"]

if __name__ == "__main__":
    # Names of tasks that end up in the aggregate group file.
    dump_tasks = []
    for task_name in tasks:
        for split_name in splits:
            task_config = {
                "group": "ok_vqa",
                "task": f"ok_vqa_{split_name}",
                "include": f"_default_template_{task_name}_yaml",
                "test_split": split_name,
            }
            if split_name == "train":
                # The train split is not part of the evaluation group.
                task_config.pop("group")
            else:
                dump_tasks.append(f"ok_vqa_{split_name}")

            save_path = f"./ok_vqa_{split_name}.yaml"
            print(f"Saving to {save_path}")
            with open(save_path, "w") as f:
                yaml.dump(task_config, f, default_flow_style=False, sort_keys=False)

    group_dict = {"group": "ok_vqa", "task": dump_tasks}

    with open("./_ok_vqa.yaml", "w") as f:
        yaml.dump(group_dict, f, default_flow_style=False, indent=4)
EAGLE/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ group: ok_vqa
2
+ task:
3
+ - ok_vqa_val2014
EAGLE/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ group: ok_vqa
2
+ task: ok_vqa_val2014
3
+ test_split: val2014
4
+ include: _default_template_vqa_yaml
EAGLE/lmms_eval/tasks/ok_vqa/utils.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import json
4
+ import yaml
5
+ import pathlib
6
+ import logging
7
+ import datetime
8
+ import statistics
9
+
10
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
11
+ from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
12
+
13
+ eval_logger = logging.getLogger("lmms-eval")
14
+
15
+
16
def ok_vqa_doc_to_visual(doc):
    """Return the document's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
18
+
19
+
20
def ok_vqa_process_results(doc, result):
    """Score one OK-VQA prediction with the VQA soft-accuracy metric.

    Args:
        doc: one dataset instance; may carry a list of ground-truth "answers".
        result: single-element list with the model's raw answer string.
    Returns:
        dict with the soft accuracy ("exact_match") and a submission record.
    """
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
    accuracy = 0

    answers = doc.get("answers")
    if answers is not None:
        # Normalize the ground-truth answers in place with the EvalAI processor.
        for idx, raw_answer in enumerate(answers):
            answers[idx] = eval_ai_processor(raw_answer)

        # Leave-one-out VQA accuracy: hold out each annotator's answer and
        # score the prediction against the remaining ones (capped at 1).
        gtAcc = []
        for idx in range(len(answers)):
            other_answers = answers[:idx] + answers[idx + 1:]
            num_matching = sum(1 for item in other_answers if item == resAns)
            gtAcc.append(min(1, float(num_matching) / 3))

        accuracy = statistics.mean(gtAcc) if gtAcc else 0

    return {
        "exact_match": accuracy,
        "submission": {
            "image": f"{doc['question_id']}.jpg",
            "answer": resAns,
        },
    }
49
+
50
+
51
def ok_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the prompt: optional pre-prompt + question + optional post-prompt."""
    kwargs = model_specific_prompt_kwargs or {}
    pre_prompt = kwargs.get("pre_prompt", "")
    post_prompt = kwargs.get("post_prompt", "")
    return f"{pre_prompt}{doc['question']}{post_prompt}"
62
+
63
+
64
def ok_vqa_aggreate_submissions(results, args):
    """Write all OK-VQA submission records to a timestamped JSON file."""
    timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    path = generate_submission_file(f"ok_vqa-test-submission-{timestamp}.json", args)
    with open(path, "w") as f:
        json.dump(results, f)
    print(f"Submission file saved to {path}")
EAGLE/lmms_eval/tasks/olympiadbench/cn_utils.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datetime
4
+ from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
5
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
6
+
7
+ import logging
8
+ eval_logger = logging.getLogger("lmms-eval")
9
+ dir_name = os.path.dirname(os.path.abspath(__file__))
10
+
11
+ olympiadbench_evaluator = OlympiadBenchEvaluator()
12
+
13
def olympiadbench_doc_to_visual(doc):
    """Convert every image attached to the document to RGB."""
    visuals = []
    for img in doc["images"]:
        visuals.append(img.convert("RGB"))
    return visuals
15
+
16
def olympiadbench_doc_to_text(doc):
    """Assemble the Chinese competition prompt for one OlympiadBench problem."""
    question = doc["question"]
    subject = doc["subfield"]
    mul_ans = doc["is_multiple_answer"]
    if mul_ans is None:
        # Treat a missing flag as single-answer.
        mul_ans = False
    ans_type = doc["answer_type"]
    if ans_type == "Need_human_evaluate":
        # Proof questions are graded by humans; describe them accordingly.
        ans_type = "proof based"

    parts = [f"以下是中国{subject}竞赛中的解答题。\n", question, "\n"]
    if mul_ans:
        parts.append(f"题目有多个答案,答案类型均为{ans_type}。\n")
    else:
        parts.append(f"答案类型为{ans_type}。\n")
    parts.append("请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以")
    if mul_ans:
        parts.append('"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n')
    else:
        parts.append('"所以最终答案是\\boxed{答案}。"\n')
    return "".join(parts)
41
+
42
def olympiadbench_process_results(doc, results):
    """Judge one model answer; proof questions are passed through for human grading."""
    precision = doc["error"]
    if precision is None:
        precision = 0
    prediction = results[0].strip()

    # Theorem-proving ("TP") items cannot be auto-graded.
    if "TP" in doc["source"]:
        return {"submission": prediction}

    # Keep only the text after the final-answer marker, then strip wrappers.
    prediction = prediction.split("所以最终答案是")[-1]
    prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
    verdict = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
    return {"exact_match": int(verdict)}
61
+
62
def olympiadbench_aggregate_results(results, args):
    """Persist all Chinese-split predictions to a timestamped submission file."""
    timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    path = generate_submission_file(f"olympiadbench-test-cn-submission-{timestamp}.json", args)
    with open(path, "w") as f:
        json.dump(results, f, ensure_ascii=False)
    print(f"Submission file saved to {path}")
69
+
EAGLE/lmms_eval/tasks/olympiadbench/en_utils.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datetime
4
+ from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
5
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
6
+
7
+ import logging
8
+ eval_logger = logging.getLogger("lmms-eval")
9
+ dir_name = os.path.dirname(os.path.abspath(__file__))
10
+
11
+ olympiadbench_evaluator = OlympiadBenchEvaluator()
12
+
13
def olympiadbench_doc_to_visual(doc):
    """Convert every image attached to the document to RGB."""
    visuals = []
    for img in doc["images"]:
        visuals.append(img.convert("RGB"))
    return visuals
15
+
16
def olympiadbench_doc_to_text(doc):
    """Assemble the English competition prompt for one OlympiadBench problem."""
    question = doc["question"]
    subject = doc["subfield"]
    mul_ans = doc["is_multiple_answer"]
    if mul_ans is None:
        # Treat a missing flag as single-answer.
        mul_ans = False
    ans_type = doc["answer_type"]
    if ans_type == "Need_human_evaluate":
        # Proof questions are graded by humans; describe them accordingly.
        ans_type = "proof based"

    parts = [f"The following is a question from an International {subject} competition.\n", question, "\n"]
    if mul_ans:
        parts.append(f"The question has multiple answers, each of them should be {ans_type}.\n")
    else:
        parts.append(f"The answer of the question should be {ans_type}.\n")
    parts.append("Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with ")
    if mul_ans:
        # NOTE: unlike the single-answer case, this prompt has no surrounding quotes.
        parts.append('So the final answer is \\boxed{multiple answers connected with commas}.\n')
    else:
        parts.append('"So the final answer is \\boxed{answer}."\n')
    return "".join(parts)
41
+
42
def olympiadbench_process_results(doc, results):
    """Judge one model answer; proof questions are passed through for human grading."""
    precision = doc["error"]
    if precision is None:
        precision = 0
    prediction = results[0].strip()

    # Theorem-proving ("TP") items cannot be auto-graded.
    if "TP" in doc["source"]:
        return {"submission": prediction}

    # Keep only the text after the final-answer marker, then strip wrappers.
    prediction = prediction.split("final answer is")[-1]
    prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
    verdict = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
    return {"exact_match": int(verdict)}
61
+
62
def olympiadbench_aggregate_results(results, args):
    """Persist all English-split predictions to a timestamped submission file."""
    timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    path = generate_submission_file(f"olympiadbench-test-en-submission-{timestamp}.json", args)
    with open(path, "w") as f:
        json.dump(results, f, ensure_ascii=False)
    print(f"Submission file saved to {path}")
69
+
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ group: olympiadbench
2
+ task:
3
+ - olympiadbench_test_en
4
+ - olympiadbench_test_cn
5
+ metadata:
6
+ - version: 0.0
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sympy as sp
3
+ from sympy import simplify, Eq, sympify, Pow
4
+ from sympy.parsing.latex import parse_latex
5
+ import math
6
+
7
+ # how to use
8
+ # scorer = OlympiadBenchEvaluator()
9
+ # exp1 = "10^{10^{10^{10}}}"
10
+ # exp2 = "10^{10}"
11
+ # precision = 1e-4
12
+ # res = scorer.judge(exp1, exp2, precision)
13
+
14
class OlympiadBenchEvaluator:
    """Judges whether a model answer matches an OlympiadBench ground truth.

    Answers are LaTeX strings. ``judge`` preprocesses both sides (extracting
    ``\\boxed{...}`` content, replacing special symbols), then tries a cascade
    of equality checks per comma-separated component: exact string match,
    interval equality, numeric equality within a tolerance, symbolic
    (sympy) expression equality, and equation equality.

    Usage::

        scorer = OlympiadBenchEvaluator()
        res = scorer.judge("10^{10}", "10^{10}", 1e-4)
    """

    def __init__(self):
        # Map of special symbols to their replacements
        self.special_signal_map = {
            "\\left": "",
            "\\right": "",
            "∶": ":",
            ",": ",",
            "$": "",
            "\\approx": "=",
            "\\simeq": "=",
            "\\sim": "=",
            "^\\prime": "'",
            "^{\\prime}": "'",
            "^\\circ": "",
            "%": "",
        }
        # Pre-parsed \pi symbol, substituted numerically in sympy_sub_pi.
        self.pi = parse_latex("\\pi")
        self.precision = 1e-8  # Default precision for comparison

    def split_by_comma(self, expr: str):
        """Split *expr* on commas that are not nested inside ()/[] brackets."""
        # Splits expressions by commas outside of brackets
        in_bracket_num = 0
        splitted_expr = []
        start_idx = 0
        for i, char in enumerate(expr):
            if char in ["(", "["]:
                in_bracket_num += 1
            elif char in [")", "]"]:
                in_bracket_num -= 1
            elif char == "," and in_bracket_num == 0:
                splitted_expr.append(expr[start_idx:i].strip())
                start_idx = i + 1

        if start_idx < len(expr):
            splitted_expr.append(expr[start_idx:].strip())

        return splitted_expr

    def trans_plus_minus_sign(self, expr_list: list):
        """Expand each ``\\pm`` expression into its "+" and "-" variants."""
        # Translates plus-minus signs into separate expressions
        new_expr_list = []
        for expr in expr_list:
            if "\\pm" in expr:
                new_expr_list.append(expr.replace("\\pm", "+"))
                new_expr_list.append(expr.replace("\\pm", "-"))
            else:
                new_expr_list.append(expr)

        return new_expr_list

    def judge(self, expression1, expression2, precision=1e-8):
        """Return True when *expression2* matches ground truth *expression1*.

        *precision* may be a scalar or a list (one tolerance per
        comma-separated answer component).
        """
        # Judge if two expressions are equal (expression1 is considered as the Ground Truth)
        # Default precision is a list for supporting multiple expressions
        precision = precision if isinstance(precision, list) else [precision]

        try:
            expression1, expression2 = self.preprocess(expression1, expression2)
        except:
            return False
        if expression1 == expression2:
            # print("Exactly equal")
            return True

        # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
        expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1)
        expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2)

        expression1 = self.split_by_comma(expression1)
        expression2 = self.split_by_comma(expression2)

        temp_list1 = self.trans_plus_minus_sign(expression1)
        temp_list2 = self.trans_plus_minus_sign(expression2)

        # Set up a list for allowed errors
        if len(precision) <= 1:
            precision = precision * len(temp_list1)

        if len(temp_list1) != len(temp_list2):
            return False

        # Check if elements in both lists can be paired and are equal.
        # NOTE: matched items are removed from both lists (and their
        # tolerance from `precision`), so list order need not agree.
        idx = -1
        while len(temp_list1) != 0:
            idx = (idx + 1) % len(temp_list1)

            item1 = temp_list1[idx]
            # is_equal reads the per-component tolerance from self.precision.
            self.precision = precision[idx]

            for item2 in temp_list2:
                if self.is_equal(item1, item2):
                    temp_list1.remove(item1)
                    temp_list2.remove(item2)
                    precision.remove(self.precision)
                    break
            else:
                # If no match was found, return False
                return False

        # If all elements are matched, return True
        return True

    def is_interval(self, expr):
        """Heuristic: the string looks like an interval, e.g. "(a, b]"."""
        # Checks if an expression is an interval
        return expr.startswith(("(", "[")) and expr.endswith((")", "]"))

    def sympy_sub_pi(self, expression_sympy):
        """Substitute the symbolic pi with its float value."""
        # Replaces the symbol for pi in sympy expressions with its numerical value
        return expression_sympy.subs(self.pi, math.pi)

    def is_equal(self, expression1, expression2):
        """Try every equality notion in turn; swallow per-check failures."""
        # Default first expression is ground truth. Check if expressions are equal in different aspects
        if expression1 == expression2 and expression1 != "" and expression2 != "":
            # print("Equivalent natively")
            return True

        # First check if both are intervals
        if self.is_interval(expression1) and self.is_interval(expression2):
            try:
                if self.interval_equal(expression1, expression2):
                    # print("Interval equivalent")
                    return True
            except:
                return False

        # Then check for numerical equality
        try:
            if self.numerical_equal(expression1, expression2):
                # print("Numerically equivalent")
                return True
        except:
            pass

        # Then check if expressions are mathematically equal
        try:
            if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
                # print("Expression equivalent")
                return True
        except:
            pass

        # Lastly, check for equation equality
        try:
            if self.equation_equal(expression1, expression2):
                # print("Equation equivalent")
                return True
        except:
            pass

        return False

    def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
        """Compare the two strings as floats within ``self.precision``.

        Raises ValueError (caught by the caller) when either side is not
        a plain number.
        """
        # Check if two numerical values are equal within an allowed error range
        # Includes possible percentage cases
        reference = float(expression1)
        prediction = float(expression2)

        if include_percentage:
            # Accept the value interpreted as a raw number or as a percentage.
            gt_result = [reference / 100, reference, reference * 100]
        else:
            gt_result = [reference]

        for item in gt_result:
            if abs(item - prediction) <= self.precision * 1.01:
                return True
        return False


    def expression_equal(self, exp1, exp2):
        """Check symbolic equivalence of two LaTeX expressions via sympy."""
        # Check if two expressions are mathematically equivalent
        # Extract expression and use sympy for equivalence checking
        def extract_expression(expression):
            # Keep only the right-hand side of "x = ..." style answers.
            if "=" in expression:
                expression = expression.split("=")[1]
            return expression.strip()

        exp1 = extract_expression(exp1)
        exp2 = extract_expression(exp2)

        expr1_sym = sympify(parse_latex(exp1))
        expr2_sym = sympify(parse_latex(exp2))

        if expr1_sym == expr2_sym:
            return True
        else:
            expr1_sym = self.sympy_sub_pi(expr1_sym)
            expr2_sym = self.sympy_sub_pi(expr2_sym)

            # One symbolic and one numeric expression can never match.
            if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
                return False
            elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
                try:
                    # Guard against towers like 10^{10^{10}} that hang evalf().
                    if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
                        print(f"These two numbers cannot be calculated by the current computer for: \"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
                        return False

                    if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
                        return True
                    else:
                        return False
                except:
                    return False
            else:
                try:
                    # Symbolic case: the difference must simplify to ~0.
                    simplified_expr = simplify(expr1_sym - expr2_sym)

                    num_value = simplified_expr.evalf()

                    return abs(num_value) < 1e-3
                except:
                    return False

    def equation_equal(self, expression1, expression2):
        """Check whether two equations define the same relation.

        Two equations are considered equal when (lhs-rhs) of one is a
        non-zero integer multiple of the other.
        """
        # Check if two equations are mathematically equivalent
        # Simplify equations and use sympy for equivalence checking
        def simplify_equation(latex_eq):
            lhs, rhs = latex_eq.split('=')

            lhs_expr = parse_latex(lhs)
            rhs_expr = parse_latex(rhs)

            equation = Eq(lhs_expr, rhs_expr)

            simplified_eq = simplify(equation.lhs - equation.rhs)

            return simplified_eq

        expr1_sym = simplify_equation(expression1)
        expr2_sym = simplify_equation(expression2)

        division_result_1 = simplify(expr1_sym / expr2_sym)
        division_result_2 = simplify(expr2_sym / expr1_sym)

        if (division_result_1.is_Integer and division_result_1 != 0) or (division_result_2.is_Integer and division_result_2 != 0):
            return True
        else:
            return False

    def interval_equal(self, expression1, expression2):
        """Check equality of intervals, including unions joined by ``\\cup``."""
        # Check if two intervals are mathematically equivalent
        def compare_two_interval(inter1, inter2):
            # Open/closed endpoints must agree on both sides.
            if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
                return False

            inter1 = inter1.strip('[]()')
            inter2 = inter2.strip('[]()')

            items_1 = inter1.split(',')
            items_2 = inter2.split(',')

            for item_1, item_2 in zip(items_1, items_2):
                if not self.expression_equal(item_1, item_2):
                    return False
            return True

        interval1 = expression1
        interval2 = expression2

        if interval1 == interval2:
            return True
        else:
            inter_list1 = interval1.split("\\cup")
            inter_list2 = interval2.split("\\cup")

            if len(inter_list1) != len(inter_list2):
                return False
            else:
                for inter1, inter2 in zip(inter_list1, inter_list2):
                    if not compare_two_interval(inter1, inter2):
                        return False
                return True

    def preprocess(self, expression1, expression2):
        """Extract the answer payload of both sides and normalize symbols."""
        # Preprocess expressions to extract and replace special symbols
        def extract_boxed_content(latex_str):
            # Collect the contents of every \boxed{...}, joined by commas.
            boxed_matches = re.finditer(r'\\boxed{', latex_str)
            results = ""

            for match in boxed_matches:
                start_index = match.end()
                end_index = start_index
                stack = 1

                # Scan forward to the brace matching the \boxed{ opener.
                while stack > 0 and end_index < len(latex_str):
                    if latex_str[end_index] == '{':
                        stack += 1
                    elif latex_str[end_index] == '}':
                        stack -= 1
                    end_index += 1

                if stack == 0:
                    content = latex_str[start_index:end_index - 1]
                    results += content + ","
                else:
                    raise ValueError("Mismatched braces in LaTeX string.")

            if results == "":
                # No \boxed{}: fall back to $...$ spans on the last line,
                # or the raw string if none are found.
                last_line_ans = latex_str.strip().split("\n")[-1]
                dollar_pattern = r"\$(.*?)\$"
                answers = re.findall(dollar_pattern, last_line_ans)

                if answers:
                    for ans in answers:
                        results += ans + ","
                else:
                    results = latex_str

            return results

        def sepcial_symbol_replace(expression):
            # sic: original (misspelled) helper name kept for compatibility.
            if "\\in " in expression:
                expression = expression.split("\\in ")[1]

            for signal in self.special_signal_map:
                expression = expression.replace(signal, self.special_signal_map[signal])

            expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。")

            # Unwrap \mathrm{...} / \mathbf{...} decorations.
            pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
            expression = re.sub(pattern, r'\1', expression)

            return expression

        exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
        exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)

        return exp1, exp2

    def can_compute_power(self, expr):
        """Return False for numeric powers too large to evaluate safely."""
        # Checks if a power expression can be computed
        if isinstance(expr, Pow):
            base, exp = expr.as_base_exp()
            if base.is_number and exp.is_number:
                MAX_EXP = 1000  # Adjust based on computing environment
                if abs(exp.evalf()) > MAX_EXP:
                    return False
                else:
                    return True
            else:
                return False
        else:
            return True  # Not a power expression, can compute
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/OlympiadBench
2
+ dataset_kwargs:
3
+ token: True
4
+ task : "olympiadbench_test_cn"
5
+ test_split: test_cn
6
+ output_type: generate_until
7
+ doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual
8
+ doc_to_text: !function cn_utils.olympiadbench_doc_to_text
9
+ doc_to_target: "answer"
10
+ generation_kwargs:
11
+ until:
12
+ - "ASSISTANT:"
13
+ max_new_tokens: 1024
14
+ temperature: 0
15
+ top_p: 0
16
+ num_beams: 1
17
+ do_sample: false
18
+ process_results: !function cn_utils.olympiadbench_process_results
19
+ metric_list:
20
+ - metric: submission
21
+ aggregation: !function cn_utils.olympiadbench_aggregate_results
22
+ higher_is_better: true
23
+ - metric: exact_match
24
+ aggregation: mean
25
+ higher_is_better: true
EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/OlympiadBench
2
+ dataset_kwargs:
3
+ token: True
4
+ task : "olympiadbench_test_en"
5
+ test_split: test_en
6
+ output_type: generate_until
7
+ doc_to_visual: !function en_utils.olympiadbench_doc_to_visual
8
+ doc_to_text: !function en_utils.olympiadbench_doc_to_text
9
+ doc_to_target: "answer"
10
+ generation_kwargs:
11
+ until:
12
+ - "ASSISTANT:"
13
+ max_new_tokens: 1024
14
+ temperature: 0
15
+ top_p: 0
16
+ num_beams: 1
17
+ do_sample: false
18
+ process_results: !function en_utils.olympiadbench_process_results
19
+ metric_list:
20
+ - metric: submission
21
+ aggregation: !function en_utils.olympiadbench_aggregate_results
22
+ higher_is_better: true
23
+ - metric: exact_match
24
+ aggregation: mean
25
+ higher_is_better: true
EAGLE/lmms_eval/tasks/seedbench/seedbench.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/SEED-Bench
2
+ dataset_kwargs:
3
+ token: True
4
+ task: "seedbench"
5
+ test_split: test
6
+ output_type: generate_until
7
+ doc_to_visual: !function utils.seed_doc_to_visual
8
+ doc_to_text: !function utils.seed_doc_to_text
9
+ doc_to_target: "answer"
10
+ generation_kwargs:
11
+ until:
12
+ - "ASSISTANT:"
13
+ image_aspect_ratio: original
14
+ # The return value of process_results will be used by metrics
15
+ process_results: !function utils.seed_process_result
16
+ # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
17
+ metric_list:
18
+ - metric: seed_image
19
+ aggregation: !function utils.seed_aggregation_result
20
+ higher_is_better: true
21
+ - metric: seed_video
22
+ aggregation: !function utils.seed_aggregation_result
23
+ higher_is_better: true
24
+ - metric: seed_all
25
+ aggregation: !function utils.seed_aggregation_result
26
+ higher_is_better: true
27
+ metadata:
28
+ - version: 0.0
EAGLE/lmms_eval/tasks/seedbench/seedbench_ppl.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/SEED-Bench
2
+ dataset_kwargs:
3
+ token: True
4
+ task: "seedbench_ppl"
5
+ test_split: test
6
+ output_type: multiple_choice
7
+ doc_to_visual: !function utils.seed_doc_to_visual
8
+ doc_to_text: !function utils.seed_doc_to_text_mc
9
+ doc_to_choice : !function utils.seed_doc_to_choice
10
+ doc_to_target: !function utils.seed_doc_to_mc_target
11
+ # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
12
+ metric_list:
13
+ - metric: acc
14
+ metadata:
15
+ - version: 0.0
EAGLE/lmms_eval/tasks/seedbench/utils.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+
4
def seed_doc_to_visual(doc):
    """Convert each image attached to the document to RGB."""
    converted = []
    for img in doc["image"]:
        converted.append(img.convert("RGB"))
    return converted
6
+
7
+
8
def seed_doc_to_text(doc):
    """Format a SEED-Bench question with its four lettered options."""
    options = [
        f"A. {doc['choice_a']}",
        f"B. {doc['choice_b']}",
        f"C. {doc['choice_c']}",
        f"D. {doc['choice_d']}",
    ]
    body = "\n".join([doc["question"]] + options)
    return f"{body}\nAnswer with the option's letter from the given choices directly."
15
+
16
+
17
def seed_process_result(doc, result):
    """Reduce the prediction to its option letter and pair it with the answer."""
    pred = result[0].strip()
    if len(pred) > 1:
        # Keep only the leading option letter of a longer response.
        pred = pred[0]
    base = {"pred": pred, "answer": doc["answer"], "question_id": doc["question_id"]}
    # Record the result both under its data-type bucket and the overall bucket.
    return {f"seed_{doc['data_type']}": dict(base), "seed_all": dict(base)}
25
+
26
+
27
def seed_aggregation_result(results):
    """Return the fraction of results whose prediction matches the answer.

    Args:
        results: list of dicts carrying "pred" and "answer" keys.
    Returns:
        Accuracy in [0, 1]; 0.0 for an empty list (the original divided by
        zero when no results were collected).
    """
    if not results:
        return 0.0
    correct = sum(1 for r in results if r["pred"] == r["answer"])
    return correct / len(results)
35
+
36
+
37
def seed_aggregation_result_all(results):
    """Compute the overall accuracy and dump per-question predictions for submission."""
    score = seed_aggregation_result(results)
    submission = [{"question_id": r["question_id"], "prediction": r["pred"]} for r in results]
    with open("./seed_submission.json", "w") as f:
        json.dump(submission, f, indent=4)
    print("Storing files for seed_submission ...")

    return score
47
+
48
+
49
def seed_doc_to_text_mc(doc):
    """Build the perplexity-style prompt: the question followed by 'Answer :'."""
    return doc["question"] + " Answer :"
52
+
53
+
54
def seed_doc_to_choice(doc):
    """Return the four candidate answers in A-D order."""
    return [doc[key] for key in ("choice_a", "choice_b", "choice_c", "choice_d")]
56
+
57
+
58
def seed_doc_to_mc_target(doc):
    """Map the answer letter (A-D) to the text of the corresponding choice."""
    letter_to_field = {"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"}
    field = letter_to_field[doc["answer"]]
    return doc[field]
EAGLE/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: lmms-lab/textvqa
2
+ output_type: generate_until
3
+ doc_to_visual: !function utils.textvqa_doc_to_visual
4
+ doc_to_text: !function utils.textvqa_doc_to_text
5
+ doc_to_target: "answer"
6
+ generation_kwargs:
7
+ until:
8
+ - "ASSISTANT:"
9
+ process_results: !function utils.textvqa_process_results
10
+ model_specific_prompt_kwargs:
11
+ default:
12
+ pre_prompt: ""
13
+ post_prompt: "\nAnswer the question using a single word or phrase."
14
+ ocr: true
15
+ qwen_vl:
16
+ pre_prompt: ""
17
+ post_prompt: " Answer:"
EAGLE/lmms_eval/tasks/textvqa/_textvqa.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ group: textvqa
2
+ task:
3
+ - textvqa_val
4
+ - textvqa_test
EAGLE/lmms_eval/tasks/textvqa/textvqa_test.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ task: textvqa_test
2
+ test_split: test
3
+ metric_list:
4
+ - metric: submission
5
+ aggregation: !function utils.textvqa_aggreate_submissions
6
+ higher_is_better: true
7
+ include: _default_template_textvqa_yaml
EAGLE/lmms_eval/tasks/textvqa/textvqa_val.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: textvqa_val
2
+ test_split: validation
3
+ metric_list:
4
+ - metric: exact_match
5
+ aggregation: mean
6
+ higher_is_better: true
7
+ ignore_case: true
8
+ ignore_punctuation: true
9
+ - metric: submission
10
+ aggregation: !function utils.textvqa_aggreate_submissions
11
+ higher_is_better: true
12
+ include: _default_template_textvqa_yaml
EAGLE/lmms_eval/tasks/textvqa/utils.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import json
4
+ import yaml
5
+ import pathlib
6
+ import logging
7
+ import datetime
8
+ import statistics
9
+
10
+ from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
11
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
12
+
13
+ eval_logger = logging.getLogger("lmms-eval")
14
+
15
+
16
def textvqa_doc_to_visual(doc):
    """Return the document's image as a single-element list, converted to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
18
+
19
+
20
def textvqa_process_results(doc, result):
    """Score one TextVQA prediction with the VQA soft-accuracy metric.

    Args:
        doc: dataset instance; may carry a list of ground-truth "answers".
        result: single-element list with the model's raw answer string.
    Returns:
        dict with the soft accuracy ("exact_match") and a submission record.
    """
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
    accuracy = 0

    if "answers" in doc and doc["answers"] is not None:
        gtAcc = []

        # Normalize all ground-truth answers in place with the EvalAI processor.
        for i in range(len(doc["answers"])):
            doc["answers"][i] = eval_ai_processor(doc["answers"][i])

        # Leave-one-out VQA accuracy: hold out each annotator's answer and
        # score the prediction against the remaining ones (capped at 1).
        for i in range(len(doc["answers"])):
            otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j]
            matchingAns = [item for item in otherGTAns if item == resAns]
            acc = min(1, float(len(matchingAns)) / 3)
            gtAcc.append(acc)
        # Guard against an empty answer list: statistics.mean raises
        # StatisticsError on empty input (the ok_vqa sibling already guards).
        accuracy = statistics.mean(gtAcc) if gtAcc else 0

    return {
        "exact_match": accuracy,
        "submission": {
            "question_id": doc["question_id"],
            "answer": resAns,
        },
    }
46
+
47
+
48
def textvqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the TextVQA prompt, optionally injecting OCR tokens.

    Fixes a NameError: the default was bound to a misspelled ``post_post``
    variable, so any call whose kwargs lacked a "post_prompt" entry (or
    passed None) crashed when formatting the return string.

    Args:
        doc: dataset instance with "question" and optionally "ocr_tokens".
        model_specific_prompt_kwargs: optional dict with "pre_prompt",
            "post_prompt" and a truthy "ocr" flag.
    Returns:
        The fully assembled prompt string.
    """
    pre_prompt = ""
    post_prompt = ""
    ocr_ref = ""
    if model_specific_prompt_kwargs:
        if "pre_prompt" in model_specific_prompt_kwargs:
            pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
        if "post_prompt" in model_specific_prompt_kwargs:
            post_prompt = model_specific_prompt_kwargs["post_prompt"]
        if "ocr" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["ocr"]:
            ocr_ref = f"\nReference OCR token: {', '.join(doc['ocr_tokens'])}"
    return f"{pre_prompt}{doc['question'].capitalize()}{ocr_ref}{post_prompt}"
60
+
61
+
62
def textvqa_aggreate_submissions(results, args):
    """Write all TextVQA submission records to a timestamped JSON file."""
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    path = generate_submission_file(f"textvqa_submission_{timestamp}.json", args)
    with open(path, "w") as f:
        json.dump(results, f)
    eval_logger.info(f"Submission file saved to {path}")
EAGLE/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Shared template for the VizWiz-VQA tasks; included by the split-specific
# yaml files. Pulls data from the lmms-lab/VizWiz-VQA HF dataset.
dataset_path: lmms-lab/VizWiz-VQA
output_type: generate_until
doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual
doc_to_text: !function utils.vizwiz_vqa_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  # Stop generation when the model starts a new assistant turn.
  until:
    - "ASSISTANT:"
metadata:
  - version: 0.0
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
process_results: !function utils.vizwiz_vqa_process_results
EAGLE/lmms_eval/tasks/vizwiz_vqa/_generate_config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Helper script: regenerates the per-split VizWiz task yaml files and the
# group yaml that ties them together. Run manually from this directory.
import os
import yaml

# Splits and task families to emit config files for.
splits = ["val", "test"]
tasks = ["vqa"]

if __name__ == "__main__":
    dump_tasks = []
    for task in tasks:
        for split in splits:
            yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
            if split == "train":
                # Train configs get no group so they never run as part of
                # the suite (defensive: "train" is not in splits above).
                yaml_dict.pop("group")
            else:
                dump_tasks.append(f"vizwiz_{task}_{split}")

            save_path = f"./vizwiz_{task}_{split}.yaml"
            print(f"Saving to {save_path}")
            with open(save_path, "w") as f:
                yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)

    # Group file listing every dumped split task.
    group_dict = {"group": "vizwiz_vqa", "task": dump_tasks}

    with open("./_vizwiz_vqa.yaml", "w") as f:
        yaml.dump(group_dict, f, default_flow_style=False, indent=4)
EAGLE/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Group definition: running `vizwiz_vqa` executes both member splits.
group: vizwiz_vqa
task:
  - vizwiz_vqa_val
  - vizwiz_vqa_test
EAGLE/lmms_eval/tasks/vizwiz_vqa/utils.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import json
4
+ import yaml
5
+ import pathlib
6
+ import logging
7
+ import datetime
8
+ import statistics
9
+
10
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
11
+ from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
12
+
13
+ eval_logger = logging.getLogger("lmms-eval")
14
+
15
+
16
def vizwiz_vqa_doc_to_visual(doc):
    """Return the document's image as a single-element list, forced to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
18
+
19
+
20
def vizwiz_vqa_process_results(doc, result):
    """Score one VizWiz-VQA prediction with the VQA soft-accuracy metric.

    Args:
        doc: dataset record with ``question_id`` and an optional ``answers``
            list of ground-truth strings.
        result: list containing exactly one model response string.

    Returns:
        dict with ``exact_match`` accuracy and a test-server ``submission``
        entry keyed by the image file name.
    """
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
    accuracy = 0

    if "answers" in doc and doc["answers"] is not None:
        # Normalize ground truths into a local list instead of mutating the
        # shared ``doc`` in place (the original overwrote doc["answers"],
        # a side effect on the dataset record).
        gt_answers = [eval_ai_processor(ans) for ans in doc["answers"]]

        # Leave-one-out VQA accuracy: 3 agreeing annotators => full credit.
        gtAcc = []
        for i in range(len(gt_answers)):
            otherGTAns = [gt_answers[j] for j in range(len(gt_answers)) if i != j]
            matchingAns = [item for item in otherGTAns if item == resAns]
            gtAcc.append(min(1, float(len(matchingAns)) / 3))
        # Empty answer list => accuracy stays 0 (mean of [] would raise).
        accuracy = statistics.mean(gtAcc) if gtAcc else 0

    return {
        "exact_match": accuracy,
        "submission": {
            "image": f"{doc['question_id']}.jpg",
            "answer": resAns,
        },
    }
49
+
50
+
51
def vizwiz_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Format the VizWiz question with optional pre/post prompt wrappers.

    The question's first letter is capitalized; prompt fragments come from
    ``model_specific_prompt_kwargs`` ("pre_prompt" / "post_prompt").
    """
    params = {} if model_specific_prompt_kwargs is None else model_specific_prompt_kwargs
    prefix = params.get("pre_prompt", "")
    suffix = params.get("post_prompt", "")
    return f"{prefix}{doc['question'].capitalize()}{suffix}"
62
+
63
+
64
def vizwiz_vqa_aggreate_submissions(results, args):
    """Write all VizWiz test predictions to a timestamped submission JSON."""
    now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    submission_file_name = f"vizwiz_vqa-test-submission-{now_date_time}.json"
    path = generate_submission_file(submission_file_name, args)
    with open(path, "w") as f:
        json.dump(results, f)
    # Use the shared logger for consistency with the textvqa/vqav2
    # aggregators, instead of a bare print.
    eval_logger.info(f"Submission file saved to {path}")
EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
group: vizwiz_vqa
task: vizwiz_vqa_test
test_split: test
include: _default_template_vqa_yaml
process_results: !function utils.vizwiz_vqa_process_results
metric_list:
  # exact_match is disabled for the test split; only the submission
  # file is produced for the evaluation server.
  # - metric: exact_match
  #   aggregation: mean
  #   higher_is_better: true
  #   ignore_case: true
  #   ignore_punctuation: true
  - metric: submission
    aggregation: !function utils.vizwiz_vqa_aggreate_submissions
    higher_is_better: true
EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
group: vizwiz_vqa
task: vizwiz_vqa_val
test_split: val
include: _default_template_vqa_yaml
metric_list:
  # Val split has public answers, so soft exact_match is scored directly.
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
  # Submission-file generation is not needed on val; kept for reference.
  # - metric: submission
  #   aggregation: !function utils.vizwiz_vqa_aggreate_submissions
  #   higher_is_better: true
EAGLE/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Shared template for the VQAv2 tasks; included by vqav2_val / vqav2_test.
dataset_path: lmms-lab/VQAv2
dataset_kwargs:
  # Dataset requires an authenticated HF token.
  token: True
output_type: generate_until
doc_to_visual: !function utils.vqav2_doc_to_visual
doc_to_text: !function utils.vqav2_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
metadata:
  - version: 0.0
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer the question using a single word or phrase."
EAGLE/lmms_eval/tasks/vqav2/_vqav2.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Group definition: running `vqav2` executes both member splits.
group: vqav2
task:
  - vqav2_val
  - vqav2_test
EAGLE/lmms_eval/tasks/vqav2/utils.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import json
4
+ import logging
5
+ import datetime
6
+ import statistics
7
+
8
+ import lmms_eval.tasks._task_utils.file_utils as file_utils
9
+
10
+ from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
11
+
12
+
13
+ eval_logger = logging.getLogger("lmms-eval")
14
+
15
+
16
def vqav2_doc_to_visual(doc):
    """Return the document's image as a single-element list, forced to RGB."""
    image = doc["image"]
    return [image.convert("RGB")]
18
+
19
+
20
def vqav2_process_results(doc, result):
    """Score one VQAv2 prediction with the official VQA soft accuracy.

    Args:
        doc: dataset record with ``question_id`` and an optional ``answers``
            list of annotator dicts (each carrying an ``answer`` string).
        result: list containing exactly one model response string.

    Returns:
        dict with ``exact_match`` (float in [0, 1]) and a ``submission``
        entry for the EvalAI test server.
    """
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
    accuracy = 0

    if "answers" in doc and doc["answers"] is not None:
        # Whitespace normalization mirrors the official VQA eval script.
        for ansDic in doc["answers"]:
            ansDic["answer"] = ansDic["answer"].replace("\n", " ")
            ansDic["answer"] = ansDic["answer"].replace("\t", " ")
            ansDic["answer"] = ansDic["answer"].strip()
        gtAcc = []
        gtAnswers = [ans["answer"] for ans in doc["answers"]]

        # Punctuation/article processing is applied only when the annotators
        # disagree, exactly as in the reference implementation.
        if len(set(gtAnswers)) > 1:
            for ansDic in doc["answers"]:
                ansDic["answer"] = eval_ai_processor.process_punctuation(ansDic["answer"])
                ansDic["answer"] = eval_ai_processor.process_digit_article(ansDic["answer"])
            resAns = eval_ai_processor.process_punctuation(resAns)
            resAns = eval_ai_processor.process_digit_article(resAns)

        # Leave-one-out accuracy capped at 3 matching annotators. The
        # dict-inequality exclusion follows the official VQA eval code.
        for gtAnsDatum in doc["answers"]:
            otherGTAns = [item for item in doc["answers"] if item != gtAnsDatum]
            matchingAns = [item for item in otherGTAns if item["answer"] == resAns]
            acc = min(1, float(len(matchingAns)) / 3)
            gtAcc.append(acc)
        # Guard: statistics.mean raises on an empty answers list.
        accuracy = statistics.mean(gtAcc) if gtAcc else 0

    return {
        "exact_match": accuracy,
        "submission": {
            "question_id": doc["question_id"],
            "answer": resAns,
        },
    }
55
+
56
+
57
def vqav2_process_results_test(doc, result):
    """Test-split hook: expose only the submission entry."""
    scored = vqav2_process_results(doc, result)
    return {"submission": scored["submission"]}
62
+
63
+
64
def vqav2_process_results_val(doc, result):
    """Validation-split hook: expose only the soft-accuracy score."""
    scored = vqav2_process_results(doc, result)
    return {"exact_match": scored["exact_match"]}
69
+
70
+
71
def vqav2_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Wrap the raw VQAv2 question with optional pre/post prompt fragments.

    Unlike the textvqa/vizwiz variants, the question is NOT capitalized.
    """
    params = {} if model_specific_prompt_kwargs is None else model_specific_prompt_kwargs
    prefix = params.get("pre_prompt", "")
    suffix = params.get("post_prompt", "")
    return f"{prefix}{doc['question']}{suffix}"
81
+
82
+
83
def vqav2_aggreate_submissions(results, args):
    """Write all VQAv2 test predictions to a timestamped submission JSON."""
    stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    out_name = f"vqav2-test-submission-{stamp}.json"
    out_path = file_utils.generate_submission_file(out_name, args)
    with open(out_path, "w") as fh:
        json.dump(results, fh)
    eval_logger.info(f"Submission file saved to {out_path}")
EAGLE/lmms_eval/tasks/vqav2/vqav2_test.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ task: "vqav2_test"
2
+ include: _default_template_vqav2_yaml
3
+ test_split: test
4
+ metric_list:
5
+ - metric: submission
6
+ aggregation: !function utils.vqav2_aggreate_submissions
7
+ higher_is_better: true
8
+ process_results: !function utils.vqav2_process_results_test