1f commited on
Commit
fa29beb
·
verified ·
1 Parent(s): 052bf16

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/README.md +59 -0
  2. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/__init__.py +12 -0
  3. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/common.py +222 -0
  4. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py +256 -0
  5. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py +385 -0
  6. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py +126 -0
  7. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmniah.py +298 -0
  8. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmvet.py +106 -0
  9. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py +558 -0
  10. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py +509 -0
  11. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/naturalbench.py +145 -0
  12. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ocrbench.py +65 -0
  13. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/olympiadbench.py +532 -0
  14. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/qspatial.py +123 -0
  15. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/tablevqabench.py +500 -0
  16. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/tempcompass.py +254 -0
  17. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py +150 -0
  18. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/vqa_eval.py +285 -0
  19. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/wemath.py +896 -0
  20. r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py +244 -0
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy
2
+
3
+ ## Introduction
4
+
5
+ Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information.
6
+
7
+ ## Running Scripts
8
+
9
+ Once the environment is ready, execute the following script from the root directory of VLMEvalKit
10
+ to perform inference and evaluation tasks in batch.
11
+
12
+ ```shell
13
+ MODEL_NAME="QwenVLMax"
14
+ OUTPUT_DIR="/your/path/to/output_dir"
15
+
16
+ SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_scene_ocr
17
+ python run.py --data CCOCR_MultiSceneOcr_Cord CCOCR_MultiSceneOcr_Funsd CCOCR_MultiSceneOcr_Iam CCOCR_MultiSceneOcr_ZhDoc CCOCR_MultiSceneOcr_ZhHandwriting CCOCR_MultiSceneOcr_Hieragent CCOCR_MultiSceneOcr_Ic15 CCOCR_MultiSceneOcr_Inversetext CCOCR_MultiSceneOcr_Totaltext CCOCR_MultiSceneOcr_ZhScene CCOCR_MultiSceneOcr_UgcLaion CCOCR_MultiSceneOcr_ZhDense CCOCR_MultiSceneOcr_ZhVertical --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
18
+ python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
19
+
20
+ SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_lan_ocr
21
+ python run.py --data CCOCR_MultiLanOcr_Arabic CCOCR_MultiLanOcr_French CCOCR_MultiLanOcr_German CCOCR_MultiLanOcr_Italian CCOCR_MultiLanOcr_Japanese CCOCR_MultiLanOcr_Korean CCOCR_MultiLanOcr_Portuguese CCOCR_MultiLanOcr_Russian CCOCR_MultiLanOcr_Spanish CCOCR_MultiLanOcr_Vietnamese --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
22
+ python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
23
+
24
+ SUB_OUTPUT_DIR=${OUTPUT_DIR}/doc_parsing
25
+ python run.py --data CCOCR_DocParsing_DocPhotoChn CCOCR_DocParsing_DocPhotoEng CCOCR_DocParsing_DocScanChn CCOCR_DocParsing_DocScanEng CCOCR_DocParsing_TablePhotoChn CCOCR_DocParsing_TablePhotoEng CCOCR_DocParsing_TableScanChn CCOCR_DocParsing_TableScanEng CCOCR_DocParsing_MolecularHandwriting CCOCR_DocParsing_FormulaHandwriting --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
26
+ python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
27
+
28
+ SUB_OUTPUT_DIR=${OUTPUT_DIR}/kie
29
+ python run.py --data CCOCR_Kie_Sroie2019Word CCOCR_Kie_Cord CCOCR_Kie_EphoieScut CCOCR_Kie_Poie CCOCR_Kie_ColdSibr CCOCR_Kie_ColdCell --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
30
+ python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
31
+ ```
32
+
33
+ ## Example Output
34
+ The evaluation results will be saved in `${SUB_OUTPUT_DIR}/summary.md`. For example, for the KIE subset,
35
+ the output is as follows:
36
+
37
+ | exp_name(f1_score) | COLD_CELL | COLD_SIBR | CORD | EPHOIE_SCUT | POIE | sroie2019_word | summary |
38
+ |:-------------------|------------:|------------:|-------:|--------------:|-------:|-----------------:|----------:|
39
+ | QwenVLMax | 81.01 | 72.46 | 69.33 | 71.2 | 60.85 | 76.37 | 71.87 |
40
+
41
+
42
+ ## Citation
43
+ If you find our work helpful, please feel free to cite our work.
44
+
45
+ ```
46
+ @misc{yang2024ccocr,
47
+ title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
48
+ author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin},
49
+ year={2024},
50
+ eprint={2412.02210},
51
+ archivePrefix={arXiv},
52
+ primaryClass={cs.CV},
53
+ url={https://arxiv.org/abs/2412.02210},
54
+ }
55
+ ```
56
+
57
+ ## Contact Us
58
+
59
+ If you have any questions, feel free to send an email to: wpf272043@alibaba-inc.com or xixing.tj@alibaba-inc.com
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Registry of CC-OCR task-group evaluators.
from .kie_evaluator import KieEvaluator
from .doc_parsing_evaluator import ParsingEvaluator
from .ocr_evaluator import OcrEvaluator
from .common import summary


# Maps task group name -> ready-to-use evaluator instance; each evaluator is
# callable (see BaseMetric.__call__ in common.py) with (predictions, gt_info).
evaluator_map_info = {
    "kie": KieEvaluator("kie"),
    "doc_parsing": ParsingEvaluator("doc_parsing"),
    "multi_lan_ocr": OcrEvaluator("multi_lan_ocr"),
    "multi_scene_ocr": OcrEvaluator("multi_scene_ocr")
}
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/common.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import sys
5
+ from abc import abstractmethod
6
+ from tabulate import tabulate
7
+
8
+
9
def pick_response_text(json_path):
    """Extract the model's response text from one result json file.

    The payload layout differs per provider (gpt/o1, local_*, claude,
    gemini, qwen). Returns the text on success, or None when the file is
    unreadable, a required key is missing, or no text could be located.
    """
    try:
        with open(json_path, "r") as f:
            json_data = json.load(f)
    except Exception as e:
        print("--> file error: msg: {}, path: {}".format(e, json_path))
        return None

    for required_key in ["model_name", "response"]:
        if required_key not in json_data:
            print("--> required key not exists, name: {}, path: {}".format(required_key, json_path))
            return None

    model_name = json_data["model_name"]
    model_response = json_data["response"]

    response_text = None
    if model_name.startswith(("gpt", "o1")):
        # OpenAI-style: data.response.choices[0].message.content
        message = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {})
        response_text = message.get("content", None)
    elif model_name.startswith("local_"):
        # local models store the plain response string directly
        response_text = model_response
    else:
        if model_name.startswith("claude"):
            content_list = model_response.get("content", None)
        elif model_name.startswith("gemini"):
            content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None)
        elif model_name.startswith("qwen"):
            content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None)  # noqa: E501
        else:
            raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name))

        # the providers above wrap the text in a list of content parts
        if isinstance(content_list, list) and len(content_list) > 0:
            response_text = content_list[0].get("text", None)

    if response_text is None:
        print("--> [error][{}] text pick error, path: {}".format(model_name, json_path))
    return response_text
48
+
49
+
50
def load_response_from_dir(res_dir):
    """Collect {file_stem: response_text} from every json file under res_dir.

    Non-json files are skipped with a notice; files whose text cannot be
    extracted (see pick_response_text) are silently dropped.
    """
    response_info = {}
    for file_name in os.listdir(res_dir):
        file_path = os.path.abspath(os.path.join(res_dir, file_name))
        if not file_name.endswith(".json"):
            print("--> skip: result file should be a json: but got: {}".format(file_path))
            continue

        response_text = pick_response_text(file_path)
        if response_text is None:
            continue

        stem, _ext = os.path.splitext(file_name)
        response_info[stem] = response_text
    return response_info
67
+
68
+
69
class BaseMetric(object):
    """Base class for CC-OCR metrics.

    Subclasses implement `evaluate`; `__call__` takes care of loading the
    predictions (from an in-memory dict or a result directory), applying
    `response_post_func`, and attaching a response-success ratio to the
    returned summary.
    """

    def __init__(self, group_name, **kwargs):
        self.group_name = group_name
        self.kwargs = kwargs

    def response_post_func(self, response_text, **kwargs):
        """Hook: normalize one raw response string; return None to drop it."""
        return response_text

    @abstractmethod
    def evaluate(self, response_info, gt_info, normalize_func=None, **kwargs):
        """Given the prediction and gt, return evaluation results as a dict.

        The result must contain a 'summary' key, for example:
        {
            "summary": {
                "f1-score": 99.99,
                # used for summary; only metric info belongs in this dict.
            },
            "your other info": "xxx"
        }
        """
        pass

    def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs):
        """Evaluate predictions against gt_info.

        Args:
            pdt_res_dir: dict of {sample_name: response_text} or a directory
                of result json files.
            gt_info: dict of {sample_name: ground_truth}.
            with_response_ratio: append response_success_ratio to the summary.

        Returns:
            (meta_info, eval_info) tuple.

        Raises:
            ValueError: if pdt_res_dir is neither a dict nor a directory.
        """
        if isinstance(pdt_res_dir, dict):
            raw_response_info = pdt_res_dir
        elif os.path.exists(pdt_res_dir) and os.path.isdir(pdt_res_dir):
            raw_response_info = load_response_from_dir(pdt_res_dir)
        else:
            # bugfix: this error was previously *returned* instead of raised,
            # so callers received a ValueError instance as if it were data.
            raise ValueError("invalid input: response dict or folder are required, but got {}".format(pdt_res_dir))

        post_error_list, response_info = [], {}
        # gt samples with no prediction at all
        response_error_list = list(gt_info.keys() - raw_response_info.keys())
        for file_name, single_pdt_str in raw_response_info.items():
            single_pdt_str = self.response_post_func(single_pdt_str, **kwargs)
            if single_pdt_str is None:
                post_error_list.append(file_name)
                continue
            response_info[file_name] = single_pdt_str

        meta_info = {
            "gt_total_num": len(gt_info), "pdt_total_num": len(response_info),
            "post_error_list": post_error_list, "response_error_list": response_error_list,
        }
        eval_info = self.evaluate(response_info, gt_info, **kwargs)

        # add response_success_ratio
        if "summary" in eval_info and with_response_ratio:
            success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9)
            eval_info["summary"].update({"response_success_ratio": success_ratio})
        return meta_info, eval_info
120
+
121
+
122
def summary(index_path, exp_dir_base, is_weighted_sum=False):
    """Summarize all experiments for the datasets listed in the index file.

    Datasets marked "release": false in the index json are excluded.
    Returns the path of the generated summary markdown.
    """
    with open(index_path, "r") as f:
        data_list = json.load(f)

    # keep released datasets only, de-duplicated by name (first occurrence wins)
    released = {}
    for data_info_item in data_list:
        data_name = data_info_item["dataset"]
        if not data_info_item.get("release", True):
            continue
        released[data_name] = data_info_item

    return summary_multi_exp(exp_dir_base, list(released.keys()), is_weighted_sum=is_weighted_sum)
137
+
138
+
139
def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):
    """Aggregate per-dataset metrics of every experiment under exp_dir_base.

    Each experiment sub-directory must contain a status.json. One markdown
    table is built per metric name and the result is written to
    <exp_dir_base>/summary.md.

    Args:
        exp_dir_base: directory containing one sub-directory per experiment.
        dataset_list: dataset columns to summarize; when None, the union of
            datasets found in all status.json files is used.
        is_weighted_sum: weight the overall score by each dataset's sample num.

    Returns:
        Absolute path of the written summary.md.
    """
    if dataset_list is None:
        all_dataset_name = []
        for exp_name in os.listdir(exp_dir_base):
            dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
            if not os.path.exists(dir_status_path):
                continue
            with open(dir_status_path, "r") as f:
                data_status_info = json.load(f)
            all_dataset_name.extend(data_status_info.keys())
        dataset_list = sorted(set(all_dataset_name))

    # summary main code: metric_name -> exp_name -> data_name -> (value, num)
    all_evaluate_info = {}
    for exp_name in os.listdir(exp_dir_base):
        dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
        if not os.path.exists(dir_status_path):
            print("--> skip: status.json not exist: {}".format(dir_status_path))
            continue

        with open(dir_status_path, "r") as f:
            all_status_info = json.load(f)

        for data_name in dataset_list:
            total_num = all_status_info.get(data_name, {}).get("config", {}).get("num", "-1")
            summary_info = all_status_info.get(data_name, {}).get("evaluation", {}).get("summary", {})
            for metric_name, metric_value in summary_info.items():
                all_evaluate_info.setdefault(metric_name, {}).setdefault(exp_name, {})[data_name] \
                    = (metric_value, total_num)

    all_table_md = []
    for metric_name, metric_info in all_evaluate_info.items():
        formatted_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time()))
        summary_line_list = []
        summary_key_name = "summary(weighted)" if is_weighted_sum else "summary"
        summary_head = [f"exp_name({metric_name}_{formatted_time})"] + dataset_list + [summary_key_name]
        for exp_name, data_eval_info in metric_info.items():
            summary_line = [exp_name, ]

            all_metric_value = 0
            is_summary_valid, all_total_num, all_weighted_metric = True, 0, 0
            for data_name in dataset_list:
                metric_value, total_num = data_eval_info.get(data_name, ("-1", "-1"))
                summary_line.append("{:.2f}".format(float(metric_value) * 100))
                # bugfix: the second half of this condition previously tested
                # metric_value twice; a missing sample count ("-1") must also
                # invalidate the (weighted) summary.
                if str(metric_value) == "-1" or str(total_num) == "-1":
                    is_summary_valid = False
                    continue

                all_total_num += float(total_num)
                all_weighted_metric += float(total_num) * float(metric_value)
                all_metric_value += float(metric_value)

            summary_value_valid = ((all_weighted_metric / (all_total_num + 1e-9)) * 100) if is_weighted_sum \
                else (all_metric_value / (len(dataset_list) + 1e-9) * 100)
            summary_value = "-" if not is_summary_valid else "{:.2f}".format(summary_value_valid)
            summary_line.append(summary_value)
            summary_line_list.append(summary_line)

        md_table_info = tabulate(summary_line_list, headers=summary_head, tablefmt='pipe')
        all_table_md.append(md_table_info)

    print("\n\n".join(all_table_md))
    summary_path = os.path.abspath(os.path.join(exp_dir_base, "summary.md"))
    with open(summary_path, "w") as f:
        f.write("\n\n".join(all_table_md))
    return summary_path
210
+
211
+
212
if __name__ == '__main__':
    # CLI: python common.py <exp_base_dir> -- summarize all experiments found
    # under the given directory into <exp_base_dir>/summary.md.
    if len(sys.argv) != 2:
        print("Usage: python {} exp_base_dir".format(__file__))
        exit(-1)
    print('--> info: {}'.format(sys.argv))
    exp_base_dir = sys.argv[1]

    summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False)
    print("--> info: summary saved at : {}".format(summary_path))
    print("happy coding.")
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import re
3
+ from tqdm import tqdm
4
+ from collections import deque
5
+ from apted.helpers import Tree
6
+ from apted import APTED, Config
7
+
8
+ # local import
9
+ from .common import BaseMetric
10
+
11
+
12
# LaTeX preamble/boilerplate commands stripped from predictions before scoring
patterns = [
    r'\\documentclass\{.*?\}',
    r'\\usepackage\[.*?\]\{.*?\}',
    r'\\usepackage\{.*?\}',
    r'\\geometry\{.*?\}',
    r'\\begin\{document\}',
    r'\\end\{document\}',
    r'\\noindent'
]
22
+
23
+
24
class TableTree(Tree):
    """Tree node used by the TEDS metric (apted-compatible).

    # Copyright 2020 IBM
    # Author: peter.zhong@au1.ibm.com
    # License: Apache 2.0 License.
    """
    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag = tag          # html tag name, e.g. "table", "tr", "td"
        self.colspan = colspan  # only meaningful for "td" nodes
        self.rowspan = rowspan  # only meaningful for "td" nodes
        self.content = content  # list of cell tokens for "td" nodes
        self.children = list(children)

    def bracket(self):
        """Show tree using brackets notation"""
        # "td" nodes carry span/text attributes; other tags only their name
        if self.tag == "td":
            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
                self.tag,
                self.colspan,
                self.rowspan,
                self.content,
            )
        else:
            result = '"tag": %s' % self.tag
        for child in self.children:
            result += child.bracket()
        return "{{{}}}".format(result)
51
+
52
+
53
class CustomConfig(Config):
    """APTED cost config mixing structural and cell-text edit costs.

    # Copyright 2020 IBM
    # Author: peter.zhong@au1.ibm.com
    # License: Apache 2.0 License.
    """
    def rename(self, node1, node2):
        """Compares attributes of trees"""
        # structural mismatch (tag or span) costs a full edit
        if (
            (node1.tag != node2.tag)
            or (node1.colspan != node2.colspan)
            or (node1.rowspan != node2.rowspan)
        ):
            return 1.0
        if node1.tag == "td":
            # same cell position: cost is the normalized string edit distance
            if node1.content or node2.content:
                return nltk.edit_distance(node1.content, node2.content) / max(len(node1.content), len(node2.content))
        return 0.0
72
+
73
+
74
class TEDS(object):
    """Tree Edit Distance based Similarity for table recognition.

    # Copyright 2020 IBM
    # Author: peter.zhong@au1.ibm.com
    # License: Apache 2.0 License.
    """
    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
        # bugfix: the message claimed "greather than 1" although the check
        # accepts n_jobs == 1 (the condition is >= 1).
        assert isinstance(n_jobs, int) and (
            n_jobs >= 1
        ), "n_jobs must be an integer no less than 1"
        self.structure_only = structure_only  # when True, ignore cell text
        self.n_jobs = n_jobs
        self.ignore_nodes = ignore_nodes  # html tags stripped before scoring
        self.__tokens__ = []

    def tokenize(self, node):
        """Tokenizes table cells"""
        self.__tokens__.append("<%s>" % node.tag)
        if node.text is not None:
            self.__tokens__ += list(node.text)
        for n in node.getchildren():
            self.tokenize(n)
        if node.tag != "unk":
            self.__tokens__.append("</%s>" % node.tag)
        if node.tag != "td" and node.tail is not None:
            self.__tokens__ += list(node.tail)

    def load_html_tree(self, node, parent=None):
        """Converts HTML tree to the format required by apted"""
        # note: a dead `global __tokens__` declaration was removed here; the
        # method only ever uses self.__tokens__.
        if node.tag == "td":
            if self.structure_only:
                cell = []
            else:
                self.__tokens__ = []
                self.tokenize(node)
                # drop the enclosing <td> ... </td> tokens
                cell = self.__tokens__[1:-1].copy()
            new_node = TableTree(
                node.tag,
                int(node.attrib.get("colspan", "1")),
                int(node.attrib.get("rowspan", "1")),
                cell,
                *deque(),
            )
        else:
            new_node = TableTree(node.tag, None, None, None, *deque())
        if parent is not None:
            parent.children.append(new_node)
        if node.tag != "td":
            for n in node.getchildren():
                self.load_html_tree(n, new_node)
        if parent is None:
            return new_node

    def evaluate(self, pred, true):
        """Computes TEDS score between the prediction and the ground truth of a
        given sample. Returns 0.0 when either side is empty or lacks a table.
        """
        from lxml import etree, html
        if (not pred) or (not true):
            return 0.0

        parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
        pred = html.fromstring(pred, parser=parser)
        true = html.fromstring(true, parser=parser)
        if pred.xpath("body/table") and true.xpath("body/table"):
            pred = pred.xpath("body/table")[0]
            true = true.xpath("body/table")[0]
            if self.ignore_nodes:
                etree.strip_tags(pred, *self.ignore_nodes)
                etree.strip_tags(true, *self.ignore_nodes)
            # normalize the edit distance by the larger tree size
            n_nodes_pred = len(pred.xpath(".//*"))
            n_nodes_true = len(true.xpath(".//*"))
            n_nodes = max(n_nodes_pred, n_nodes_true)
            tree_pred = self.load_html_tree(pred)
            tree_true = self.load_html_tree(true)
            distance = APTED(
                tree_pred, tree_true, CustomConfig()
            ).compute_edit_distance()
            return 1.0 - (float(distance) / n_nodes)
        else:
            return 0.0
157
+
158
+
159
class ParsingEvaluator(BaseMetric):
    """Evaluator for document/table/formula parsing tasks.

    Dispatches on kwargs['op']:
        'doc'                  -> normalized edit-distance score on LaTeX source
        'table'                -> TEDS score on HTML tables
        'molecular'/'formula'  -> normalized edit-distance score
    """

    def response_post_func(self, response_text, **kwargs):
        return response_text

    def evaluate(self, response_info, gt_info, **kwargs):
        op = kwargs['op']
        if op == 'doc':
            score = self.eval_doc(response_info, gt_info)
        elif op == 'table':
            score = self.eval_table(response_info, gt_info)
        elif op in ['molecular', "formula"]:
            score = self.eval_formula(response_info, gt_info, op_name=op)
        else:
            raise ValueError(f'doc parsing unsupported op: {op}')

        # summary info
        eval_info = {"summary": {"score": score}}
        return eval_info

    @staticmethod
    def _edit_similarity(pred, gt):
        """Return 1 - normalized edit distance; two empty strings match (1.0).

        bugfix: previously raised ZeroDivisionError when both strings
        normalized to empty.
        """
        max_len = max(len(pred), len(gt))
        if max_len == 0:
            return 1.0
        return 1 - nltk.edit_distance(pred, gt) / max_len

    def eval_doc(self, response_info, gt_info):
        """Average edit-distance similarity over gt samples (missing pred -> 0)."""
        results = []
        for img_name, gt in tqdm(gt_info.items()):
            if img_name not in response_info:
                results.append(0)
                continue

            pred = response_info[img_name]
            for pattern in patterns:
                pred = re.sub(pattern, '', pred)

            try:
                pred = pred.split('```')[1]
            except IndexError:
                # no fenced code block: keep the whole response
                pass

            pred = pred.replace('```latex', '')
            pred = pred.replace('```', '')

            pred = pred.replace(' ', '').replace('\n', '')
            gt = gt.replace(' ', '').replace('\n', '')

            results.append(self._edit_similarity(pred, gt))

        # guard against an empty gt set
        return sum(results) / len(results) if results else 0.0

    def eval_table(self, response_info, gt_info):
        """Average TEDS score over gt samples (missing pred -> 0)."""
        teds = TEDS(structure_only=False, n_jobs=1)
        results = []
        for img_name, gt in tqdm(gt_info.items()):
            if img_name not in response_info:
                results.append(0)
                continue

            pred = response_info[img_name]
            for pattern in patterns:
                pred = re.sub(pattern, '', pred)

            try:
                pred = pred.split('```html')[1]
            except IndexError:
                # no fenced html block: keep the whole response
                pass

            pred = pred.replace('```', '')
            # strip whitespace and normalize full-width commas
            pred = pred.replace(' ', '').replace('\n', '').replace('，', ',')
            gt = gt.replace(' ', '').replace('\n', '')

            pred_html = '<html><body>{}</body></html>'.format(pred)
            gt_html = '<html><body>{}</body></html>'.format(gt)
            results.append(teds.evaluate(pred_html, gt_html))

        # guard against an empty gt set
        return sum(results) / len(results) if results else 0.0

    def eval_formula(self, response_info, gt_info, op_name='formula'):
        """Average edit-distance similarity for formula/molecular parsing."""
        results = []
        for img_name, gt in tqdm(gt_info.items()):
            if img_name not in response_info:
                results.append(0)
                continue

            pred = response_info[img_name]

            if op_name == 'formula':
                pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "")  # noqa: E501
                gt = gt.replace(" ", "")
            elif op_name == 'molecular':
                pred = pred.replace("\n", "").replace(" ", "").replace("<smiles>", "").replace("</smiles>", "")
                gt = gt.replace(" ", "")
            results.append(self._edit_similarity(pred, gt))

        # guard against an empty gt set
        return sum(results) / len(results) if results else 0.0
253
+
254
+
255
if __name__ == '__main__':
    # module is import-only; no standalone CLI entry point
    pass
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Donut
4
+ Copyright (c) 2022-present NAVER Corp.
5
+ MIT License
6
+ """
7
+ import json
8
+ import os
9
+ import sys
10
+ import re
11
+ import time
12
+ from typing import Any, Dict, List, Tuple, Union
13
+
14
+ import zss
15
+ from zss import Node
16
+ from collections import Counter
17
+ from nltk import edit_distance
18
+
19
+ # local import
20
+ from .common import BaseMetric
21
+
22
+
23
def flatten(data: dict):
    """Flatten a nested dict into a list of (dotted_key, leaf_value) pairs.

    Example:
        input(dict)
            {"menu": [{"name": ["cake"], "count": ["2"]},
                      {"name": ["juice"], "count": ["1"]}]}
        output(list)
            [("menu.name", "cake"), ("menu.count", "2"),
             ("menu.name", "juice"), ("menu.count", "1")]
    """
    pairs = list()

    def _walk(node, prefix=""):
        # dict: extend the dotted path; list: fan out; scalar: record pair
        if type(node) is dict:
            for child_key, child_value in node.items():
                _walk(child_value, f"{prefix}.{child_key}" if prefix else child_key)
        elif type(node) is list:
            for element in node:
                _walk(element, prefix)
        else:
            pairs.append((prefix, node))

    _walk(data)
    return pairs
56
+
57
+
58
def update_cost(node1: Node, node2: Node):
    """
    Update cost for tree edit distance.
    If both are leaf nodes, cost is the string edit distance between the two
    labels (the special token '<leaf>' is ignored).
    If exactly one is a leaf node, cost is the leaf's text length + 1.
    If neither is a leaf node, cost is 0 when the labels match, otherwise 1.
    """
    text1 = node1.label.replace("<leaf>", "")
    text2 = node2.label.replace("<leaf>", "")
    is_leaf1 = "<leaf>" in node1.label
    is_leaf2 = "<leaf>" in node2.label

    if is_leaf1 and is_leaf2:
        return edit_distance(text1, text2)
    if is_leaf1 != is_leaf2:
        # replacing a leaf by an inner node (or vice versa): leaf text + 1
        return 1 + len(text1 if is_leaf1 else text2)
    return int(node1.label != node2.label)
77
+
78
+
79
def insert_and_remove_cost(node: Node):
    """
    Insert and remove cost for tree edit distance.
    Leaf nodes cost the length of their text; inner nodes cost 1.
    """
    if "<leaf>" in node.label:
        return len(node.label.replace("<leaf>", ""))
    return 1
90
+
91
+
92
def normalize_dict(data: Union[Dict, List, Any]):
    """
    Recursively normalize nested data for comparison.

    Dicts are rebuilt with keys sorted by (length, lexicographic), falsy
    values dropped and scalar values wrapped into single-element lists;
    lists of dicts are normalized element-wise; mixed lists keep only
    non-empty str/int/float items as stripped strings.
    """
    if isinstance(data, dict):
        normalized = dict()
        for key in sorted(data, key=lambda k: (len(k), k)):
            child = normalize_dict(data[key])
            if not child:
                continue
            normalized[key] = child if isinstance(child, list) else [child]
        return normalized

    if isinstance(data, list):
        if all(isinstance(entry, dict) for entry in data):
            return [item for item in map(normalize_dict, data) if item]
        return [str(entry).strip() for entry in data if type(entry) in {str, int, float} and str(entry).strip()]

    return [str(data).strip()]
120
+
121
+
122
def cal_f1_all(preds, answers):
    """
    Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives,
    false negatives and false positives.

    Returns:
        (micro_f1, per-field metric_info, per-sample error_info)
    """
    metric_info, error_info = {}, {}
    total_tp, total_fn_or_fp = 0, 0
    for file_name, answer in answers.items():
        sample_error_info = {"fp": [], "fn": [], "tp": []}
        pred = preds.get(file_name, {})
        # flatten both sides to (dotted_field, value) pairs for exact matching
        pred, answer = flatten(normalize_dict(pred)), flatten(normalize_dict(answer))
        for field in pred:
            field_name = field[0]
            if field_name not in metric_info:
                metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0}
            if field in answer:
                total_tp += 1
                metric_info[field_name]["total_tp"] += 1
                sample_error_info["tp"].append(field)
                # consume the matched gt pair so duplicates are counted once
                answer.remove(field)
            else:
                total_fn_or_fp += 1
                metric_info[field_name]["total_fn_or_fp"] += 1
                sample_error_info["fp"].append(field)

        # gt pairs never predicted -> false negatives
        total_fn_or_fp += len(answer)
        for field in answer:
            field_name = field[0]
            if field_name not in metric_info:
                metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0}
            metric_info[field_name]["total_fn_or_fp"] += 1
            sample_error_info["fn"].append(field)

        # keep only samples that have at least one fp/fn, with error stats
        sample_error_num = sum([len(v) for k, v in sample_error_info.items() if k != "tp"])
        if sample_error_num > 0:
            sample_error_info["error_num"] = sample_error_num
            error_class_list = ["counter_" + x[0] for x in (sample_error_info["fn"] + sample_error_info["fp"])]
            counter = Counter(error_class_list)
            sample_error_info["error_info"] = dict(counter)
            error_info[file_name] = sample_error_info

    # summary: per-field F1 (tp / (tp + (fn+fp)/2))
    for field_name, field_info in metric_info.items():
        field_tp, field_fn_or_fp = field_info["total_tp"], field_info["total_fn_or_fp"]
        metric_info[field_name]["acc"] = field_tp / (field_tp + field_fn_or_fp / 2 + 1e-6)

    print("donut_evaluator: total_tp: {}, total_fn_or_fp: {}, ptd_num: {}, gt_num: {}".format(total_tp, total_fn_or_fp,
                                                                                              len(preds), len(answers)))
    # sort for readability: worst samples / worst fields first
    error_info = {k: v for k, v in
                  sorted(error_info.items(), key=lambda item: item[1].get("error_num", 0), reverse=True)}
    metric_info = {k: v for k, v in
                   sorted(metric_info.items(), key=lambda item: item[1].get("total_fn_or_fp", 0), reverse=True)}
    return total_tp / (total_tp + total_fn_or_fp / 2 + 1e-6), metric_info, error_info
175
+
176
+
177
def construct_tree_from_dict(data: Union[Dict, List], node_name: str = None):
    """
    Convert a (normalized) dictionary into a zss tree.

    Dict keys become child nodes, lists of dicts become "<subtree>" nodes,
    scalar list items become "<leaf>..." nodes; the root is named "<root>".

    Example:
        {"menu": [{"name": ["cake"], "count": ["2"]},
                  {"name": ["juice"], "count": ["1"]}]}
        becomes
        <root> -> menu -> two <subtree> nodes, each with name/count children
        holding <leaf>cake / <leaf>2 / <leaf>juice / <leaf>1 leaves.
    """
    label = "<root>" if node_name is None else node_name
    node = Node(label)

    if isinstance(data, dict):
        for key, child_value in data.items():
            node.addkid(construct_tree_from_dict(child_value, key))
    elif isinstance(data, list):
        if all(isinstance(entry, dict) for entry in data):
            for entry in data:
                node.addkid(construct_tree_from_dict(entry, "<subtree>"))
        else:
            for entry in data:
                node.addkid(Node(f"<leaf>{entry}"))
    else:
        raise Exception(data, node_name)
    return node
+ return node
225
+
226
+
227
def cal_acc(pred: dict, answer: dict):
    """Normalized tree-edit-distance (nTED) accuracy between two annotations.

    Steps: build zss trees from both dicts, compute the edit distance between
    them, normalize by the distance from an empty tree to the GT tree (the GT
    tree "size"), and return max(1 - nTED, 0).
    """
    def tree_distance(source_tree, target_tree):
        # Shared cost configuration for both distance computations.
        return zss.distance(
            source_tree,
            target_tree,
            get_children=zss.Node.get_children,
            insert_cost=insert_and_remove_cost,
            remove_cost=insert_and_remove_cost,
            update_cost=update_cost,
            return_operations=False,
        )

    pred_tree = construct_tree_from_dict(normalize_dict(pred))
    answer_tree = construct_tree_from_dict(normalize_dict(answer))
    empty_tree = construct_tree_from_dict(normalize_dict({}))

    val1 = tree_distance(pred_tree, answer_tree)
    val2 = tree_distance(empty_tree, answer_tree)
    return max(0, 1 - val1 / val2)
256
+
257
+
258
def cal_acc_all(pred_info, answer_info):
    """Average nTED accuracy over all samples keyed by file name.

    Samples missing from ``pred_info`` are scored against an empty prediction.
    Returns (mean_accuracy, error_info), where error_info maps imperfect
    samples to {"acc", "pred", "answer"}, sorted by ascending accuracy.
    """
    acc_info = {}
    error_info = {}
    for file_name, answer in answer_info.items():
        pred = pred_info.get(file_name, {})
        acc = cal_acc(pred, answer)
        acc_info[file_name] = acc
        if acc < 1.0:
            error_info[file_name] = {"acc": acc, "pred": pred, "answer": answer}

    error_info = dict(sorted(error_info.items(), key=lambda item: item[1].get("acc", 0)))
    mean_acc = sum(acc_info.values()) / (len(acc_info) + 1e-6)
    return mean_acc, error_info
273
+
274
+
275
def normalize_values_of_nested_dict(d, normalize_func):
    """Apply ``normalize_func`` to every string leaf of a nested structure.

    Dicts and lists are traversed recursively (dict keys are left untouched);
    non-string scalars pass through unchanged.

    Note: the previous implementation only recursed into dict elements of a
    list, so string values nested inside lists escaped normalization while
    strings everywhere else were normalized. They are now treated uniformly.
    """
    if isinstance(d, dict):
        return {k: normalize_values_of_nested_dict(v, normalize_func) for k, v in d.items()}
    elif isinstance(d, list):
        return [normalize_values_of_nested_dict(x, normalize_func) for x in d]
    elif isinstance(d, str):
        return normalize_func(d)
    else:
        return d
286
+
287
+
288
def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None):
    """Run the donut-style evaluation (field F1 + nTED accuracy) on a dataset.

    Args:
        pdt_info: {file_name: prediction_dict}
        gt_info: {file_name: ground_truth_dict}
        normalize_func: optional text normalizer applied to both sides first.
        data_name: dataset label, used only for logging.

    Returns:
        dict with f1_score, acc, per-class F1, and per-sample error details.
    """
    if normalize_func is not None:
        print("--> info: normalize_func executed.")
        pdt_info = normalize_values_of_nested_dict(pdt_info, normalize_func)
        gt_info = normalize_values_of_nested_dict(gt_info, normalize_func)

    f1_score, class_eval_info, error_info = cal_f1_all(pdt_info, gt_info)
    acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info)
    eval_info = {
        "f1_score": f1_score,
        "acc": acc_average,
        "class_f1_score": class_eval_info,
        "f1_error_info": error_info,
        "acc_error_info": acc_error_info,
    }
    print(data_name, "f1_score", f1_score, "acc", acc_average)
    return eval_info
302
+
303
+
304
def post_process_to_json(qwen_info_str, file_name=None):
    """Parse a model response into JSON, handling ```json fenced blocks.

    Returns the parsed object, or None when no valid JSON can be extracted.
    ``file_name`` is accepted for interface compatibility and is unused.
    """
    try:
        if "```json" in qwen_info_str:
            # Close an unterminated fence so the regex below can match.
            # (The old guard `"```" not in qwen_info_str` could never be true
            # once "```json" was present, so truncated fences always failed
            # to parse and the function returned None.)
            fence_start = qwen_info_str.index("```json") + len("```json")
            if "```" not in qwen_info_str[fence_start:]:
                qwen_info_str += "```"
            qwen_info_group = re.search(r'```json(.*?)```', qwen_info_str, re.DOTALL)
            json_str = qwen_info_group.group(1).strip().replace("\n", "")
        else:
            json_str = qwen_info_str.strip().replace("\n", "")
        return json.loads(json_str)
    except Exception:  # best-effort parse; callers treat None as failure
        return None
317
+
318
+
319
def fullwidth_to_halfwidth(text):
    """Convert fullwidth (zenkaku) characters to their halfwidth equivalents.

    The ideographic space (U+3000) maps to an ASCII space; fullwidth forms
    U+FF01..U+FF5E map to ASCII 0x21..0x7E. The Chinese enumeration comma
    "、" is additionally normalized to ",". All other characters pass through.
    """
    def convert(char):
        code_point = ord(char)
        if code_point == 0x3000:  # fullwidth space -> ASCII space
            return ' '
        if 0xFF01 <= code_point <= 0xFF5E:  # fullwidth ASCII block
            return chr(code_point - 0xFEE0)
        return char

    # str.join instead of repeated `+=` (quadratic on long inputs).
    result = ''.join(map(convert, text))
    return result.replace("、", ",")
333
+
334
+
335
def remove_unnecessary_spaces(text):
    """Normalize spacing in mixed Chinese/English text.

    Removes spaces between CJK characters and between CJK and ASCII
    alphanumerics, tidies spacing around punctuation, inserts a space at
    digit/letter boundaries, and collapses whitespace runs to single spaces.
    """
    rules = [
        # drop spaces between two CJK characters
        (r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', ''),
        # drop spaces between CJK and ASCII alphanumerics (both directions)
        (r'(?<=[\u4e00-\u9fff])\s+(?=[a-zA-Z0-9])', ''),
        (r'(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fff])', ''),
        # no space before punctuation, one space after
        # (digit-preceded punctuation excluded, e.g. decimal points)
        (r'(?<![0-9])\s*([,.!?:;])\s*', r'\1 '),
        # separate digit/letter boundaries with a space
        (r'(?<=[0-9])(?=[a-zA-Z])', ' '),
        (r'(?<=[a-zA-Z])(?=[0-9])', ' '),
        # collapse any remaining whitespace runs
        (r'\s+', ' '),
    ]
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text
348
+
349
+
350
class KieEvaluator(BaseMetric):
    """Key-information-extraction evaluator: field-level F1 plus nTED accuracy."""

    def response_post_func(self, response_text, **kwargs):
        """Parse the raw model response into a JSON object (None on failure)."""
        return post_process_to_json(response_text, file_name=kwargs.get('file_name', None))

    def normalize_func(self, text, **kwargs):
        """Canonicalize text: fullwidth -> halfwidth, then spacing cleanup."""
        return remove_unnecessary_spaces(fullwidth_to_halfwidth(str(text)))

    def evaluate(self, response_info, gt_info, **kwargs):
        """
        response_info: dict: {"file_name_1": response, "file_name_2": gt}
        gt_info: dict: {"file_name_1": gt, "file_name_2": gt}
        kwargs: dataset index config: {'dataset': 'kie_benchmark_POIE', 'group': 'kie', 'op': 'poie', 'num': 250}
        """
        # gt should be a dict for the kie task; VLMEvalKit may hand us JSON
        # strings, so decode those in place first.
        for image_name, label_content in gt_info.items():
            if isinstance(label_content, str):
                gt_info[image_name] = json.loads(label_content)

        response_info = normalize_values_of_nested_dict(response_info, self.normalize_func)
        gt_info = normalize_values_of_nested_dict(gt_info, self.normalize_func)

        f1_score, class_eval_info, error_info = cal_f1_all(response_info, gt_info)
        acc_average, acc_error_info = cal_acc_all(response_info, gt_info)

        # Summary plus full per-class / per-sample diagnostics.
        summary_info = {"f1_score": f1_score, "acc": acc_average}
        return {
            "summary": summary_info,
            "class_f1_score": class_eval_info,
            "f1_error_info": error_info,
            "acc_error_info": acc_error_info,
        }
382
+
383
+
384
# Module is import-only; no CLI entry point is provided.
if __name__ == '__main__':
    pass
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+
3
+ meta_prompt = """
4
+ You are an assistant skilled at evaluating the quality of creative text.
5
+ Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to \
6
+ the user question displayed below. You'll need to assess the response on the following dimensions: \
7
+ Creativity, Richness, Visual Perception, Logical Coherence, Answer Accuracy and Image Relationship Understanding. \
8
+ We will provide you with a creative question and the AI model's response and a reference answer for your evaluation. \
9
+ As you begin your assessment, follow this process:
10
+ 1. Evaluate the AI model's answers on different dimensions, pointing out its strengths or weaknesses \
11
+ in each dimension and assigning a score of 1 to 10 for each.
12
+ 2. Finally, based on the assessments across dimensions, \
13
+ provide an overall score of 1 to 10 for the AI model's response.
14
+ 3. Your scoring should be as stringent as possible and follow the scoring rules below:
15
+ In general, the higher the quality of the model's response and its strict adherence to user needs, \
16
+ the higher the score. Responses that do not meet user needs will receive lower scores.
17
+ Scoring rules:
18
+ Creativity:
19
+ Scores 1-2 when there is no innovation or uniqueness in the content.
20
+ Scores 3-4 when providing partially original content but with low creative quality.
21
+ Scores 5-6 when mostly creative but lacks significant novelty, with moderate quality.
22
+ Scores 7-8 when having novelty and high-quality content.
23
+ Scores 9-10 when highly novel and of exceptional quality compared to the reference answer.
24
+ Richness:
25
+ Scores 1-2 when lacking depth and breadth, with very limited information.
26
+ Scores 3-4 when limited in depth and breadth, with fewer explanations and examples, showing low diversity.
27
+ Scores 5-6 when limited in depth and breadth but provides basic necessary information.
28
+ Scores 7-8 when providing depth and useful additional information.
29
+ Scores 9-10 when providing exceptional depth, breadth, and high diversity compared to the reference answer.
30
+ Visual Perception:
31
+ Scores 1-2 when the description of the visual information in the image contains errors or \
32
+ is significantly inconsistent with the content of the image.
33
+ Scores 3-4 When the description of the visual information in the image reflects only a small amount \
34
+ of the image's information and contains some errors.
35
+ Scores 5-6 when the description of the visual information in the image includes the basic information \
36
+ of the image but contains minimal information.
37
+ Scores 7-8 when the description of the visual information in the image matches the image well and is rich in content, \
38
+ providing a substantial amount of information about the image.
39
+ Scores 9-10 when the description of the visual information in the image not only matches the image \
40
+ but also is more detailed and informative compared to the reference answer, providing more information about the image.
41
+ Logical Coherence:
42
+ Scores 1-2 when entirely incoherent, lacking any logic, and not matching the question or known information.
43
+ Scores 3-4 when somewhat coherent but with many logical errors or inconsistencies.
44
+ Scores 5-6 when mostly coherent, with few errors, but may struggle to maintain complete coherence in complex situations.
45
+ Scores 7-8 when excellent logical handling, very few errors.
46
+ Scores 9-10 when flawless logic, impeccable in handling complexity, \
47
+ and significantly higher logical coherence compared to the reference answer.
48
+ Answer Accuracy:
49
+ Scores 1-2 when the answer is significantly inconsistent with the question or contains obvious errors.
50
+ Scores 3-4 when the answer is partially correct but contains some errors or is incomplete.
51
+ Scores 5-6 when the answer is basically correct but lacks details or is not sufficiently detailed.
52
+ Scores 7-8 when the answer is accurate and detailed, fully corresponding to the question.
53
+ Scores 9-10 when the answer is not only accurate and detailed but also provides additional useful information, \
54
+ exceeding expectations.
55
+ Image Relationship Understanding:
56
+ Scores 1-2 when there are significant errors or confusion in distinguishing and describing different images, \
57
+ unable to correctly identify and relate the content of the images.
58
+ Scores 3-4 when the description of different images reflects only minimal distinguishing information, \
59
+ contains some errors and confusion, and fails to clearly differentiate and relate the images.
60
+ Scores 5-6 when the description of different images includes basic distinguishing information, \
61
+ is able to correctly identify and relate the images in a basic manner, \
62
+ but the information provided is minimal and lacks detail.
63
+ Scores 7-8 when the description of different images is accurate and detailed, \
64
+ clearly distinguishing and relating the images, \
65
+ with rich content that points out the main commonalities and differences between the images.
66
+ Scores 9-10 when the description of different images is not only accurate and detailed but also \
67
+ provides richer information and analysis, clearly distinguishing and relating the images, \
68
+ more comprehensively pointing out the commonalities and differences \
69
+ between the images compared to the reference answer.
70
+ Overall Score:
71
+ Scores 1-2 when irrelevant to the question, factually incorrect, or generates harmful content.
72
+ Scores 3-4 when no serious errors, mostly harmless, but of low quality and does not meet requirements.
73
+ Scores 5-6 when basically meeting requirements but performing poorly in some dimensions, with moderate quality.
74
+ Scores 7-8 when performing well in all dimensions.
75
+ Scores 9-10 when fully addressing user questions and all requirements, significantly surpassing the reference answer.
76
+ Please remember, you must evaluate and explain before scoring. After your explanation for each dimension, \
77
+ add the score for that dimension. Finally, at the end of your response, \
78
+ in the format of the dictionary (including brackets), return all your scoring results, \
79
+ ensuring your scores are integers:
80
+ {'Dimension One': Score, 'Dimension Two': Score, ..., 'Overall Score': Score}, \
81
+ for example: {'Creativity': 9, 'Richness': 6, ..., 'Overall Score': 7}.\n
82
+ """
83
+ question_begin_prompt = '[Question]'
84
+ reference_begin_prompt = '[The Start of Reference Answer]'
85
+ reference_end_prompt = '[The End of Reference Answer]'
86
+ answers_begin_prompt = '[The Start of Assistant’s Answer]'
87
+ answers_end_prompt = '[The End of Assistant’s Answer]'
88
+
89
+
90
def mmdu_score(model, line):
    """Score one MMDU record with an LLM judge, one judge call per dialogue turn.

    Args:
        model: judge model exposing ``generate(prompt) -> str``.
        line: record whose 'question', 'answer' and 'prediction' fields hold
            Python-list literals (one entry per conversation turn).

    Returns:
        dict with 'res' (DataFrame of per-turn dimension scores; None for
        failed turns) and 'log' (one status line per turn).
    """
    # NOTE(review): eval() on dataset fields executes arbitrary code if the
    # input TSV is untrusted; ast.literal_eval would be safer — confirm inputs.
    question = eval(line['question'])
    gt = eval(line['answer'])
    prediction = eval(line['prediction'])

    # Dimensions the judge is required to return (see meta_prompt rubric).
    DIMS = [
        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
    ]

    all_result_dict = []
    logs = []
    for j in range(len(question)):
        try:
            # Assemble judge prompt: rubric + question + reference + answer.
            prompt = meta_prompt + question_begin_prompt + '\n' + question[j] + '\n\n' + \
                reference_begin_prompt + '\n' + gt[j] + '\n' + reference_end_prompt + '\n\n' + \
                answers_begin_prompt + '\n' + prediction[j] + '\n' + answers_end_prompt
            response = model.generate(prompt)
            # Extract the trailing score-dictionary literal from the reply.
            start_index = response.find('{')
            end_index = response.rfind('}') + 1
            dictionary_str = response[start_index: end_index]
            # NOTE(review): eval() of model output — same safety caveat as above.
            result_dict = eval(dictionary_str)
            all_result_dict.append(result_dict)
            if all([x in result_dict for x in DIMS]):
                logs.append('Succeed')
            else:
                logs.append(
                    f'Following Dims are not in results of turn {j}: '
                    f'{",".join([x for x in DIMS if x not in result_dict])}'
                )
        except Exception as e:
            logging.warning(str(e))
            # A failed turn contributes a row of None scores.
            all_result_dict.append({d: None for d in DIMS})
            logs.append(str(e))

    df = pd.DataFrame(all_result_dict)
    return dict(res=df, log='\n'.join(logs))
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmniah.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+
4
+
5
def has_word(sentence, word):
    """Return True iff ``word`` occurs in ``sentence`` as a whole word."""
    pattern = r'\b' + re.escape(word) + r'\b'
    return re.search(pattern, sentence) is not None
12
+
13
+
14
class VQAEval:
    """Lightweight VQA-style answer matcher.

    Normalizes free-form answers (punctuation, number words, articles,
    contractions — following the official VQA evaluation) and checks whether
    a ground-truth answer appears in the prediction as a whole word.
    """

    def __init__(self):
        # Typo -> canonical contraction map (from the official VQA evaluation).
        self.contractions = {
            'aint': "ain't", 'arent': "aren't", 'cant': "can't",
            'couldve': "could've", 'couldnt': "couldn't",
            "couldn'tve": "couldn't've", "couldnt've": "couldn't've",
            'didnt': "didn't", 'doesnt': "doesn't", 'dont': "don't",
            'hadnt': "hadn't", "hadnt've": "hadn't've", "hadn'tve": "hadn't've",
            'hasnt': "hasn't", 'havent': "haven't", 'hed': "he'd",
            "hed've": "he'd've", "he'dve": "he'd've", 'hes': "he's",
            'howd': "how'd", 'howll': "how'll", 'hows': "how's",
            "Id've": "I'd've", "I'dve": "I'd've", 'Im': "I'm", 'Ive': "I've",
            'isnt': "isn't", 'itd': "it'd", "itd've": "it'd've",
            "it'dve": "it'd've", 'itll': "it'll", "let's": "let's",
            'maam': "ma'am", 'mightnt': "mightn't",
            "mightnt've": "mightn't've", "mightn'tve": "mightn't've",
            'mightve': "might've", 'mustnt': "mustn't", 'mustve': "must've",
            'neednt': "needn't", 'notve': "not've", 'oclock': "o'clock",
            'oughtnt': "oughtn't", "ow's'at": "'ow's'at", "'ows'at": "'ow's'at",
            "'ow'sat": "'ow's'at", 'shant': "shan't", "shed've": "she'd've",
            "she'dve": "she'd've", "she's": "she's", 'shouldve': "should've",
            'shouldnt': "shouldn't", "shouldnt've": "shouldn't've",
            "shouldn'tve": "shouldn't've",
            # Fixed: this entry was inverted (key/value swapped).
            'somebodyd': "somebody'd",
            "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've",
            'somebodyll': "somebody'll", 'somebodys': "somebody's",
            'someoned': "someone'd", "someoned've": "someone'd've",
            "someone'dve": "someone'd've", 'someonell': "someone'll",
            'someones': "someone's", 'somethingd': "something'd",
            "somethingd've": "something'd've", "something'dve": "something'd've",
            'somethingll': "something'll", 'thats': "that's",
            'thered': "there'd", "thered've": "there'd've",
            "there'dve": "there'd've", 'therere': "there're",
            'theres': "there's", 'theyd': "they'd", "theyd've": "they'd've",
            "they'dve": "they'd've", 'theyll': "they'll", 'theyre': "they're",
            'theyve': "they've", 'twas': "'twas", 'wasnt': "wasn't",
            "wed've": "we'd've", "we'dve": "we'd've", 'weve': "we've",
            'werent': "weren't", 'whatll': "what'll", 'whatre': "what're",
            'whats': "what's", 'whatve': "what've", 'whens': "when's",
            'whered': "where'd", 'wheres': "where's", 'whereve': "where've",
            'whod': "who'd", "whod've": "who'd've", "who'dve": "who'd've",
            'wholl': "who'll", 'whos': "who's", 'whove': "who've",
            'whyll': "why'll", 'whyre': "why're", 'whys': "why's",
            'wont': "won't", 'wouldve': "would've", 'wouldnt': "wouldn't",
            "wouldnt've": "wouldn't've", "wouldn'tve": "wouldn't've",
            'yall': "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll",
            "yall'd've": "y'all'd've", "y'alld've": "y'all'd've",
            "y'all'dve": "y'all'd've", 'youd': "you'd", "youd've": "you'd've",
            "you'dve": "you'd've", 'youll': "you'll", 'youre': "you're",
            'youve': "you've",
        }
        # Number words -> digits.
        self.manualMap = {
            'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
            'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
            'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
            'fourteen': 14, 'fifteen': 15, 'sixteen': 16,
            'seventeen': 17, 'eighteen': 18, 'nineteen': 19,
            'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
            'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90}
        self.articles = ['a', 'an', 'the']

        # NOTE(review): '(?!<=\\d)' looks like a typo for the lookbehind
        # '(?<!\\d)', but it is kept as-is to match the official VQA eval.
        self.periodStrip = re.compile('(?!<=\\d)(\\.)(?!\\d)')
        self.commaStrip = re.compile('(\\d)(\\,)(\\d)')
        self.punct = [
            ';', '/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\',
            '_', '-', '>', '<', '@', '`', ',', '?', '!',
        ]

    def _normalize(self, text):
        """Shared normalization pipeline for predictions and ground truths."""
        text = str(text).replace('\n', ' ').replace('\t', ' ').strip()
        text = self.processPunctuation(text)
        return self.processDigitArticle(text)

    def evaluate(self, answer, gt_answers):
        """Return 1 if any normalized GT answer occurs in the prediction, else 0.

        ``gt_answers`` may be a single answer or a list. Inputs are no longer
        mutated in place (the original normalized the caller's list elements).
        """
        answer = self._normalize(answer)
        if isinstance(gt_answers, list):
            for gt in gt_answers:
                if has_word(answer, self._normalize(gt)):
                    return 1
            return 0
        return 1 if has_word(answer, self._normalize(gt_answers)) else 0

    def evaluate_MRR(self, answer, gt_answers):
        """Reciprocal-rank score: 1/(1-based index of the first matching GT)."""
        answer = self._normalize(answer)
        assert isinstance(gt_answers, list)
        for i, gt in enumerate(gt_answers):
            if has_word(answer, self._normalize(gt)):
                return 1 / (i + 1)
        return 0.0

    def processPunctuation(self, inText):
        """Drop or space out punctuation, mirroring the official VQA cleanup."""
        outText = inText
        for p in self.punct:
            if (p + ' ' in inText or ' ' + p in inText) or (
                re.search(self.commaStrip, inText) is not None
            ):
                outText = outText.replace(p, '')
            else:
                outText = outText.replace(p, ' ')
        # Fixed: re.UNICODE (==32) was being passed as sub()'s *count*
        # argument, silently capping replacements at 32 occurrences.
        outText = self.periodStrip.sub('', outText)
        return outText

    def processDigitArticle(self, inText):
        """Map number words to digits, drop articles, expand contractions."""
        outText = []
        for word in inText.lower().split():
            # Fixed: was manualMap.setdefault(word, word), which permanently
            # inserted every word ever seen into the map; get() is a pure lookup.
            word = self.manualMap.get(word, word)
            if word not in self.articles:
                outText.append(word)
        for wordId, word in enumerate(outText):
            if word in self.contractions:
                outText[wordId] = self.contractions[word]
        return ' '.join(str(text) for text in outText)
+ return outText
247
+
248
+
249
def is_correct(answer, response):
    """Grade ``response`` against ``answer`` for MM-NIAH style tasks.

    Supports three answer types:
      * int   — exact number match, or a single choice letter (a/b/c/...
                mapped to 0/1/2/...); returns 0/1 (bool for the letter path).
      * list  — response is parsed as JSON and graded element-wise; returns
                the fraction of positions that match (0 on parse failure).
      * other — falls back to VQAEval whole-word matching.
    """
    response = response.strip('.')
    if isinstance(answer, int):
        if response.isdigit():
            return int(int(response) == answer)

        # Normalize the free-form reply down to a bare choice letter.
        response = response.lower()
        response = response.replace('the answer is', '')
        response = response.replace('*', '')  # parse **A**
        if response.find('.') != -1:
            response = response.split('.')[0]
        response = response.replace(',', '')
        response = response.strip()

        if response == 'none':
            return 0

        # Camera-motion questions phrase the choice as a full sentence.
        if 'the camera is moving left' in response:
            response = 'a'
        elif 'the camera is moving right' in response:
            response = 'b'

        if len(response) != 1:
            # Could not reduce the reply to one option letter.
            return 0

        return (ord(response) - ord('a')) == answer

    if isinstance(answer, list):
        try:
            response = response.replace('json', '').replace('```', '').strip()
            response = json.loads(response)
            if isinstance(response, dict):
                response = sum(list(response.values()), start=[])
        except Exception:  # was a bare `except:` — don't swallow SystemExit etc.
            return 0

        if not isinstance(response, (list, tuple)):
            return 0

        match = 0
        for res, ans in zip(response, answer):
            match += res == ans
        return match / len(answer)

    return VQAEval().evaluate(response, answer)
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmvet.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+
3
+
4
def build_mmvet_gpt4_prompt(line):
    """Build the MM-Vet GPT-4 judge prompt: few-shot scoring table + this sample.

    Args:
        line: record with 'question', 'answer' (ground truth, may contain
            <AND>/<OR> markers) and 'prediction' fields.

    Returns:
        str: prompt whose final table row ends with an empty Correctness cell
        for the judge model to fill in.
    """
    question = line['question']
    gt = str(line['answer'])
    prediction = str(line['prediction'])
    prompt = """
    Compare the ground truth and prediction from AI models, to give a correctness score for the prediction.
    <AND> in the ground truth means it is totally right
    only when all elements in the ground truth are present in the prediction,
    and <OR> means it is totally right when any one element in the ground truth is present in the prediction.
    The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
    Just complete the last space of the correctness score.

    Question | Ground truth | Prediction | Correctness
    --- | --- | --- | ---
    What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0
    What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5
    What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5
    What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5
    What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0
    Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
    Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
    while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
    because the names of these countries do not accurately represent their landscapes. |
    The meme talks about Iceland and Greenland. It's pointing out that despite their names,
    Iceland is not very icy and Greenland isn't very green. | 0.4
    Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
    Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
    while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
    because the names of these countries do not accurately represent their landscapes. |
    The meme is using humor to point out the misleading nature of Iceland's and Greenland's names.
    Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow.
    The text 'This is why I have trust issues' is a playful way to suggest
    that these contradictions can lead to distrust or confusion.
    The humor in this meme is derived from the unexpected contrast between the names of the countries
    and their actual physical characteristics. | 1.0
    """
    # Pad the markers with spaces so the judge sees them as separate tokens.
    gpt4_prompt = prompt + '\n' + ' | '.join(
        [question, gt.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> '), prediction, ''])
    return gpt4_prompt
43
+
44
+
45
def MMVet_auxeval(model, line):
    """Ask the judge model for a correctness score in [0, 1], with retries.

    Up to five attempts are made; the sampling temperature rises with each
    retry (0.0, 0.5, 1.0, ...) to escape degenerate outputs.

    Returns:
        dict(log=..., score=...); score is 0.0 when all five parses fail.
    """
    def parse_score(text):
        # None when the judge's reply is not a bare float.
        try:
            return float(text)
        except ValueError:
            return None

    prompt = build_mmvet_gpt4_prompt(line)
    log = ''
    for attempt in range(5):
        output = model.generate(prompt, temperature=attempt * 0.5)
        score = parse_score(output)
        if score is None:
            log += f'Try {attempt}: output is {output}, failed to parse.\n'
            continue
        if score < 0 or score > 1:
            log += f'Try {attempt}: output is {output}, invalid score: {score}.\n'
            continue
        log += 'Succeed'
        return dict(log=log, score=score)
    log += 'All 5 retries failed.\n'
    return dict(log=log, score=0.0)
67
+
68
+
69
def MMVet_acc(result_file):
    """Aggregate MM-Vet scores into per-capability and per-category accuracy.

    Args:
        result_file: path to the scored results (readable by ``load``); each
            row has a comma-joined 'category' string of capabilities and a
            float 'score' in [0, 1].

    Returns:
        (res, res2): DataFrames with columns Category/tot/acc, where ``res``
        is keyed by the six base capabilities (+ Overall) and ``res2`` by the
        exact capability combinations seen in the data (+ Overall).
    """
    data = load(result_file)
    tot = defaultdict(lambda: 0)
    score = defaultdict(lambda: 0)
    # The six base capabilities; hoisted out of the loop (was rebuilt per row).
    cate_list = ['rec', 'ocr', 'know', 'gen', 'spat', 'math']
    lt = len(data)
    cate2_list = []
    for i in range(lt):
        item = data.iloc[i]
        cate = item['category']
        cate2 = cate.replace(',', '_')
        if cate2 not in cate2_list:
            cate2_list.append(cate2)
        grade = float(item['score'])
        for capa in cate_list:
            if capa in cate:
                tot[capa] += 1
                score[capa] += grade
        tot['Overall'] += 1
        tot[cate2] += 1
        score['Overall'] += grade
        score[cate2] += grade

    res = defaultdict(list)
    res2 = defaultdict(list)
    cate_list.append('Overall')
    cate2_list.append('Overall')
    for k in cate_list:
        res['Category'].append(k)
        res['tot'].append(tot[k])
        res['acc'].append(score[k] / tot[k] * 100)
    for v in cate2_list:
        res2['Category'].append(v)
        res2['tot'].append(tot[v])
        res2['acc'].append(score[v] / tot[v] * 100)
    res = pd.DataFrame(res)
    res2 = pd.DataFrame(res2)
    return res, res2
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from ...utils import can_infer, track_progress_rich
3
+ from ...smp import *
4
+ import numpy as np
5
+ import re
6
+
7
# MMBench ability -> short column-name abbreviation, applied by report_acc().
MMB_abbrs = {
    'coarse_perception': 'CP',
    'finegrained_perception (instance-level)': 'FP-S',
    'finegrained_perception (cross-instance)': 'FP-C',
    'logic_reasoning': 'LR',
    'relation_reasoning': 'RR',
    'attribute_reasoning': 'AR'
}

# MMT-Bench task -> abbreviation map (used by report_acc_MMT()).
MMT_abbrs = {
    'visual_recognition': 'VR',
    'localization': 'Loc',
    'ocr': 'OCR',
    'counting': 'Count',
    'hallucination': 'HLN',
    'image_retrieval': 'IR',
    'threed': '3D',
    'visual_captioning': 'VC',
    'visual_grounding': 'VG',
    'doc_understanding': 'DU',
    'action_recognition': 'AR',
    'pixel_level_perception': 'PLP',
    'image-to-image_translation': 'I2IT',
    'relation_reasoning': 'RR',
    'intelligence_quotient_test': 'IQT',
    'emotion': 'Emo',
    'visual_illusion': 'VI',
    'meme_understanding': 'MemU',
    'visual_prompt_understanding': 'VPU',
    'anomaly_detection': 'AND',
    'keypoint_detection': 'KD',
    'visual_commonsense_reasoning': 'VCR',
    'image_evaluation_judgement': 'IEJ',
    'multiple_image_analysis': 'MIA',
    'cross_image_matching': 'CIM',
    'temporal_understanding': 'TU',
    'visual_code': 'VP',
    'medical_understanding': 'MedU',
    'autonomous_driving': 'AUD',
    'discipline_knowledge_reasoning': 'DKR',
    'embodied_ai': 'EA',
    'gui_navigation': 'GN'
}
50
+
51
+
52
def MMMU_preproc(data):
    """Reformulate open-ended MMMU rows as two-choice ones.

    Rows whose 'A' cell is NaN get the ground-truth answer as option A and the
    placeholder 'Other Answers' as option B. Mutates and returns *data*.
    """
    logger = get_logger('Evaluation')
    option_a = list(data['A'])
    option_b = list(data['B'])
    answers = list(data['answer'])
    converted = 0
    for row, val in enumerate(option_a):
        if pd.isna(val):
            option_a[row] = answers[row]
            option_b[row] = 'Other Answers'
            converted += 1
    logger.info(f'During MMMU_preproc in Evaluation, {converted} open questions are re-formulated to multi-choice ones. ')
    data['A'] = option_a
    data['B'] = option_b
    return data
66
+
67
+
68
def report_acc(df):
    """Build a per-split accuracy table.

    Columns: 'split', 'Overall', then one column per ability found in the
    'l2-category' and 'category' columns (names abbreviated via MMB_abbrs).
    Each cell is the mean of the 'hit' column over that split.
    """
    res = defaultdict(list)

    if 'split' in df:
        res['split'] = list(set(df['split']))
    else:
        # No split info: treat the whole frame as a single 'none' split.
        df['split'] = ['none'] * len(df)
        res['split'] = ['none']

    def _split_means(frame):
        # Mean hit rate of `frame`, one value per split, in column order.
        return [np.mean(frame[frame['split'] == sp]['hit']) for sp in res['split']]

    res['Overall'] = _split_means(df)
    for group in ['l2-category', 'category']:
        if group not in df:
            continue
        for ability in sorted(set(df[group])):
            col = MMB_abbrs.get(ability, ability)
            res[col] = _split_means(df[df[group] == ability])
    return pd.DataFrame(res)
92
+
93
+
94
def report_acc_MMT(df):
    """Build the MMT-Bench accuracy table.

    Rows are the splits plus a final 'ALL' row; columns are 'Overall', each
    fine-grained 'category', and each 'l2-category' (abbreviated via
    MMT_abbrs). l2-category cells are the *unweighted mean* of their
    sub-categories' accuracies, not the pooled hit rate.
    """
    # assert group in [None, 'category', 'l2-category']
    res = defaultdict(list)
    # Pre-register columns so the output column order is fixed regardless of
    # which abilities actually occur in df.
    res['split'] = list()
    res['Overall'] = list()
    for _, name in MMT_abbrs.items():
        res[name] = list()

    if 'split' in df:
        splits = list(set(df['split']))
        res['split'] = splits

    else:
        df['split'] = ['none'] * len(df)
        res['split'] = ['none']

    for group in [None, 'category', 'l2-category']:
        if group is None:
            # Overall: one mean per split, then one more for all rows pooled
            # (this extra entry lines up with the final 'ALL' split row).
            res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
            res['Overall'].extend([np.mean(df['hit'])])
        elif group not in df:
            continue
        elif group == 'category':
            abilities = list(set(df[group]))
            abilities.sort()
            for ab in abilities:
                ab_name = ab
                sub_df = df[df[group] == ab]
                res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
                res[ab_name].extend([np.mean(sub_df['hit'])])
        else:
            abilities = list(set(df[group]))
            abilities.sort()
            for ab in abilities:
                # Average the per-split accuracies of each sub-category of
                # this l2-category (each sub-category weighted equally).
                sub_task_name_list = df[df['l2-category'] == ab]['category'].unique()
                sub_task_acc = []
                for sub_task_name in sub_task_name_list:
                    sub_df = df[df['category'] == sub_task_name]
                    sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']])

                new_acc = []
                for i in range(len(sub_task_acc[0])):
                    new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))
                ab_name = MMT_abbrs[ab] if ab in MMT_abbrs else ab
                res[ab_name] = new_acc

                # Same unweighted mean over all rows, for the 'ALL' row.
                sub_task_acc = []
                for sub_task_name in sub_task_name_list:
                    sub_df = df[df['category'] == sub_task_name]
                    sub_task_acc.append([np.mean(sub_df['hit'])])
                new_acc = []
                for i in range(len(sub_task_acc[0])):
                    new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))

                res[ab_name].extend(new_acc)

    res['split'].append('ALL')
    return pd.DataFrame(res)
152
+
153
+
154
def build_prompt(question, options, prediction):
    """Render the English judge prompt that asks an LLM to map a free-form
    prediction onto one of the listed options (or Z when none fit)."""
    instructions = (
        'You are an AI assistant who will help me to match '
        'an answer with several options of a single-choice question. '
        'You are provided with a question, several options, and an answer, '
        'and you need to find which option is most similar to the answer. '
        'If the meaning of all options are significantly different from the answer, output Z. '
        'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
    )
    examples = (
        'Example 1: \n'
        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
        'Answer: a cute teddy bear\nYour output: A\n'
        'Example 2: \n'
        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
        'Answer: Spider\nYour output: Z\n'
        'Example 3: \n'
        'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
    )
    return (instructions + examples).format(question, options, prediction)
172
+
173
+
174
def build_prompt_wemath(question, prediction):
    """Render the WeMath judge prompt (options A-G, <start>/<end> delimited),
    stripping WeMath's answer-format boilerplate from the question first."""
    template = (
        'You are an AI assistant who will help me to match '
        'an answer with several options of a single-choice question. '
        'You are provided with a question, several options, and an answer, '
        'and you need to find which option is most similar to the answer. '
        'If the meaning of all options are significantly different from the answer, output Z. '
        'Your should output a single uppercase character in A, B, C, D, E, F, G (if they are valid options), and Z. \n'
        'Example 1: \n'
        'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
        'Answer: <start>\na cute teddy bear\n<end>\nYour output: A\n'
        'Example 2: \n'
        'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
        'Answer: <start>\nSpider\n<end>\nYour output: Z\n'
        'Example 3: \n'
        'Question: <start>\n{}\n<end>\nAnswer: <start>\n{}\n<end>\nYour output: '
    )
    boilerplate = (
        "Regarding the format, please answer following the template below, and be sure to include two <> symbols:\n"
        "<Thought process>: <<your thought process>> <Answer>: <<your option>>"
    )
    question = question.replace(boilerplate, '')
    return template.format(question, prediction)
197
+
198
+
199
def build_prompt_blink(question, options, prediction):
    """Render the BLINK judge prompt.

    Same option-matching task as build_prompt, but with BLINK-specific
    few-shot examples covering refusals and "no option selected" responses,
    both of which must map to Z.
    """
    tmpl = (
        'You are an AI assistant who will help me to match an answer with several options of a single-choice question. '
        'You are provided with a question, several options, and an answer, '
        'and you need to find which option is most similar to the answer. '
        "If the answer says things like refuse to answer, I'm sorry cannot help, etc., output Z."
        'If the meaning of all options are significantly different from the answer, '
        'or the answer does not select any option, output Z. '
        'Your should output one of the choices, A, B, C, D (if they are valid options), or Z.\n'
        'Example 1: \n'
        'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
        'Options: A. Point A\nB. Point B\n(Z) Failed\n'
        'Answer: Point B, where the child is sitting, is closer to the camera.\nYour output: (B)\n'
        'Example 2: \n'
        'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
        'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
        "Answer: I'm sorry, but I can't assist with that request.\nYour output: (Z)\n"
        'Example 3: \n'
        'Question: Which point is corresponding to the reference point?\nSelect from the following choices.\n'
        'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
        'Answer:The reference point (REF) on the first image is at the tip of the pot, '
        'which is the part used to Poke if the pots were used for that action. Looking at the second image, '
        'we need to find the part of the object that would correspond to poking.\n'
        "(A) Point A is at the tip of the spoon's handle, which is not used for poking.\n"
        '(B) Point B is at the bottom of the spoon, which is not used for poking.\n'
        '(C) Point C is on the side of the pspoonot, which is not used for poking.\n'
        '(D) Point D is at the tip of the spoon, which is not used for poking.\n'
        '\nTherefore, there is no correct answer in the choices\nYour output: (Z)\n'
        'Example 4: \n'
        'Question: {}?\nOptions: {}\n(Z) Failed\nAnswer: {}\nYour output: '
    )
    return tmpl.format(question, options, prediction)
231
+
232
+
233
def build_prompt_cn(question, options, prediction):
    """Chinese variant of build_prompt: identical option-matching task and
    few-shot examples, rendered in Chinese for Chinese-language questions."""
    instructions = (
        '你是一个帮助我匹配答案与单选题中多个选项的 AI 助手。'
        '你会被提供:一个问题,多个选项,一个答案。你的任务是找到与答案意义最相近的选项。'
        '如果所有选项的意义都与答案显著不同,则输出 Z。'
        '你应该输出一个单个的大写字母,例如 A, B, C, D(如果它们是有效选项),或 Z。'
    )
    examples = (
        '例 1:'
        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 一只可爱的泰迪熊\n输出: A\n'
        '例 2: \n'
        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 蜘蛛\n输出: Z\n'
        '例 3: \n'
        '问题: {}?\n选项: {}\n答案: {}\n输出: '
    )
    return (instructions + examples).format(question, options, prediction)
247
+
248
+
249
def build_choices(item):
    """Collect the option letters present in *item*, skipping NaN cells.

    Returns a dict mapping each uppercase letter ('A'..'Z') found in the
    record to its option text.
    """
    return {
        letter: item[letter]
        for letter in string.ascii_uppercase
        if letter in item and not pd.isna(item[letter])
    }
255
+
256
+
257
def prefetch_answer(item):
    """Try to read an option letter straight from the raw prediction via
    exact matching (can_infer), without calling the judge LLM; falsy when no
    letter can be inferred."""
    choices = build_choices(item)
    return can_infer(item['prediction'], choices)
260
+
261
+
262
def extract_answer_from_item(model, item, dataset_name=None):
    """Extract the chosen option letter for one record.

    Strategy: (1) exact matching on the raw prediction via can_infer;
    (2) up to 3 rounds of judge-LLM matching with a dataset-appropriate
    prompt; (3) a random candidate as last resort.

    Returns a dict with keys 'opt' (option letter) and 'log' (rationale).
    """
    logger = get_logger('Evaluation')
    # It will return: (pred, raw, llm_time)
    choices = build_choices(item)
    option_str = build_option_str(choices)

    # Pick the judge prompt variant by dataset / question language.
    if dataset_name == 'BLINK':
        prompt = build_prompt_blink(item['question'], option_str, item['prediction'])
    elif dataset_name == 'WeMath':
        prompt = build_prompt_wemath(item['question'], item['prediction'])
    elif cn_string(item['question']):
        prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
    else:
        prompt = build_prompt(item['question'], option_str, item['prediction'])
    retry = 3

    ret = can_infer(item['prediction'], choices)
    if ret:
        return dict(opt=ret, log=item['prediction'])
    if model is None:
        return dict(opt='Z', log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')

    while retry:
        ans = model.generate(prompt)
        if 'Failed to obtain answer via API' in ans:
            logger.warning('GPT API failed to answer. ')
        else:
            ret = can_infer(ans, choices)
            if ret:
                return dict(opt=ret, log=ans)
            else:
                logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
        retry -= 1

    if retry == 0:
        # BUGFIX: the original `list(choices) + ['Z'] if 'Z' not in choices else []`
        # parsed as `(list(choices) + ['Z']) if ... else []`, so when 'Z' was
        # already a valid option the pool became empty and rd.choice([]) raised
        # IndexError. Parenthesize so 'Z' is merely not duplicated.
        options = list(choices) + (['Z'] if 'Z' not in choices else [])
        return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ')
299
+
300
+
301
+ # For Circular Evaluation
302
def prefetch_circular_group(sub_data, verbose=False):
    """Try to settle a circular-evaluation group by exact matching alone.

    Returns dict(hit=0, ...) as soon as any rolling variant prefetches to a
    wrong letter; dict(hit=1, ...) when every variant prefetches to its GT;
    otherwise None (the group still needs LLM-based matching). With
    verbose=True the non-early-return paths instead yield a tuple
    (res_or_None, GT, PRED).
    """
    lt = len(sub_data)
    GT, PRED = [], []
    for i in range(lt):
        item = sub_data.iloc[i]
        GT.append(item['GT'])
        PRED.append(prefetch_answer(item))
        # A non-empty prefetch disagreeing with GT settles the group as a miss.
        # NOTE(review): this early return ignores `verbose` and yields a bare
        # dict, so verbose=True callers that unpack three values must only be
        # used on groups already known to have no early mismatch (as
        # eval_circular_group does via the verbose=False pre-screen).
        if PRED[-1] and (GT[-1] != PRED[-1]):
            log = (
                f'Failed in Prefetching Rolling {i}: Answer is {GT[-1]}, '
                f"Prediction is {item['prediction']}, Pre-fetched is {PRED[-1]}. "
            )
            return dict(hit=0, log=log)
    # All prefetches were either correct or empty; a hit only if none were empty.
    flag = True
    for g, p in zip(GT, PRED):
        if g != p:
            flag = False
    ret = (dict(hit=1, log='Succeed During Pre-fetching'), ) if flag else (None, )
    ret = ret + (GT, PRED) if verbose else ret
    return ret if len(ret) > 1 else ret[0]
322
+
323
+
324
def eval_vanilla(model, item, dataset_name=None):
    """Judge one non-circular record: hit=1 iff the extracted option letter
    equals the ground truth in item['GT']."""
    res = extract_answer_from_item(model, item, dataset_name=dataset_name)
    hit = 1 if res['opt'] == item['GT'] else 0
    return dict(hit=hit, log=f"Match Log: {res['log']}. ")
331
+
332
+
333
+ # For Circular Evaluation
334
def eval_circular_group(model, sub_data, dataset_name=None):
    """Judge one circular group: hit=1 only if every rolling variant resolves
    to its ground-truth letter (either prefetched or via the judge model)."""
    res, GT, PRED = prefetch_circular_group(sub_data, verbose=True)
    # Prefetch settled the whole group (all variants matched exactly).
    if res is not None:
        return res

    lt = len(sub_data)
    log = ''
    for i in range(lt):
        if PRED[i]:
            # A truthy prefetch here is necessarily correct: a wrong prefetch
            # would have early-returned inside prefetch_circular_group.
            log += f'Rolling {i} Matched.\n'
        else:
            res = extract_answer_from_item(model, sub_data.iloc[i], dataset_name=dataset_name)
            opt, match_log = res['opt'], res['log']
            PRED[i] = opt
            if PRED[i] != GT[i]:
                # One wrong variant fails the whole circular group.
                log += (
                    f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; "
                    f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n'
                )
                return dict(hit=0, log=log)
            else:
                log += (
                    f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, "
                    f'Pre-fetched is {PRED[i]}.\n'
                )

    return dict(hit=1, log=log)
361
+
362
+
363
+ # data, meta are pd.DataFrame, result_file is a path
364
# data, meta are pd.DataFrame, result_file is a path
def mcq_vanilla_eval(model, data, meta, nproc, result_file, dataset_name=None):
    """Score every prediction in *data* as a plain (non-circular) MCQ.

    Resumes from *result_file* when it exists, judges the remaining items in
    parallel via track_progress_rich/eval_vanilla (persisting as it goes), and
    returns *data* with added 'hit' and 'log' columns.
    """
    result = {}
    if osp.exists(result_file):
        result = load(result_file)
    answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}

    if 'MMMU' in dataset_name:
        # Open MMMU questions become two-choice rows; any non-letter ground
        # truth is then by construction option 'A'.
        data = MMMU_preproc(data)
        answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}

    # Keep only predictions with a known ground truth.
    data = data[data['index'].isin(answer_map)]
    data['GT'] = [answer_map[idx] for idx in data['index']]
    items = []

    for i in range(len(data)):
        # Dealing with the normal part: skip items already scored in a
        # previous (interrupted) run.
        item = data.iloc[i]
        if item['index'] not in result:
            items.append(item)

    tups = [dict(model=model, item=x, dataset_name=dataset_name) for x in items]
    keys = [x['index'] for x in items]
    if len(tups):
        res = track_progress_rich(eval_vanilla, tups, nproc=nproc, chunksize=nproc, save=result_file, keys=keys)
        result = load(result_file)
        # Merge in-memory results for keys the persisted file may have missed.
        for k, v in zip(keys, res):
            if k not in result:
                result[k] = v
    data['hit'] = [result[i]['hit'] for i in data['index']]
    data['log'] = [result[i]['log'] for i in data['index']]
    # The GT column was only needed by the judge; drop it from the output.
    if 'GT' in data:
        data.pop('GT')
    return data
397
+
398
+
399
+ # data, meta are pd.DataFrame, result_file is a path
400
# data, meta are pd.DataFrame, result_file is a path
def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
    """Score predictions under circular evaluation.

    Rows sharing the same `index % 1e6` form one group (the main row has
    index < 1e6); a group scores hit=1 only if every rotation is answered
    correctly. Resumes from *result_file*, prefetches exact matches first,
    judges the rest via eval_circular_group, and returns the main rows with
    'hit' and 'log' columns.
    """
    result = {}
    if osp.exists(result_file):
        result = load(result_file)
    # Build Answer Map
    answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}

    # Circular grouping relies on integer index arithmetic.
    for idx in list(meta['index']) + list(data['index']):
        assert istype(idx, int)

    # Only keep those lines in the meta data
    data = data[data['index'].isin(answer_map)]
    data['GT'] = [answer_map[idx] for idx in data['index']]
    data_main = data[data['index'] < int(1e6)]

    data_groups = []
    for i in range(len(data_main)):
        # Dealing with the normal part: collect the not-yet-scored groups
        # (main row plus all of its rotated variants).
        idx = data_main.iloc[i]['index']
        if idx not in result:
            sub_data = data[data['index'] % int(1e6) == idx]
            data_groups.append(sub_data)

    if len(data_groups):
        # Cheap pass: settle groups that exact matching alone can decide.
        prefetched = [prefetch_circular_group(g, verbose=False) for g in data_groups]
        remain = []
        for dg, pf in zip(data_groups, prefetched):
            if pf is not None:
                # NOTE(review): `% 1e6` yields a float key (e.g. 42.0) while
                # the final lookup uses int indices; this relies on Python
                # hashing numerically-equal int/float keys identically.
                result[dg.iloc[0]['index'] % 1e6] = pf
            else:
                remain.append(dg)
        dump(result, result_file)

        tups = [dict(model=model, sub_data=x, dataset_name=dataset_name) for x in remain]
        keys = [x.iloc[0]['index'] % 1e6 for x in remain]

        if len(tups) == 0:
            pass
        elif model is None:
            # exact_matching policy: unresolved groups are counted as misses.
            logger = get_logger('Evaluation')
            logger.warning('Exact Matching mode, will not do GPT-based answer matching. ')
            for k in keys:
                result[k] = dict(
                    hit=0, log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
        else:
            res = track_progress_rich(
                eval_circular_group,
                tups,
                nproc=nproc,
                chunksize=nproc,
                save=result_file,
                keys=keys)
            result = load(result_file)
            for k, v in zip(keys, res):
                if k not in result:
                    result[k] = v

    # Round-trip through a temp xlsx file — presumably to normalize dtypes
    # after the filtering above; TODO confirm this is still required.
    tmp_pth = f'/tmp/{timestr()}.xlsx'
    dump(data_main, tmp_pth)
    data_main = load(tmp_pth)
    indices = data_main['index']
    data_main['hit'] = [result[i]['hit'] for i in indices]
    data_main['log'] = [result[i]['log'] for i in indices]
    if 'GT' in data_main:
        data_main.pop('GT')

    return data_main
467
+
468
+
469
def extract_characters_regex(s, choices=['(A)', '(B)', '(C)', '(D)', '(E)']):
    """Pull a single option letter (A-E) out of a free-form model response.

    Known answer prefixes are stripped first so that e.g. 'Best answer: C'
    resolves to 'C' rather than to the stray capital 'B' of 'Best'. Returns
    '' when no letter can be recovered from a long answer.
    """
    # Some failed generations arrive as dicts; treat them as empty output.
    if type(s) is dict:
        s = ''
    s = s.strip()
    # BUGFIX: the original list was missing two commas, so adjacent literals
    # concatenated into 'The best option isThe correct option is' and
    # 'Best answer:Best option:' — prefixes that could never match and were
    # therefore never stripped.
    answer_prefixes = [
        'The best answer is',
        'The correct answer is',
        'The answer is',
        'The answer',
        'The best option is',
        'The correct option is',
        'Best answer:',
        'Best option:',
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, '')

    # A long response without any candidate letter is treated as unanswered.
    if len(s.split()) > 10 and not re.search('[ABCDE]', s):
        return ''
    matches = re.search(r'[ABCDE]', s)
    if matches is None:
        # Fall back to matching the whole (lowercased) response against a
        # parenthesized choice, e.g. '(c)' -> 'C'.
        for choice in choices:
            if s.lower() in choice.lower():
                return choice[1]
        return ''
    return matches[0]
495
+
496
+
497
def get_dimension_rating(data_path):
    """Aggregate per-category accuracy for a Task/Subtask/Category benchmark.

    Pass 1 accumulates soft counts per leaf category ('true' = summed score,
    'false' = its complement); pass 2 rewrites those count dicts in place into
    accuracies and attaches 'Avg' roll-ups per subtask/task plus a global
    'Overall' ratio.
    """
    TASKS = [
        'Reasoning',
        'Perception',
    ]

    SUBTASKS = [
        'Monitoring',
        'Autonomous_Driving',
        'OCR with Complex Context',
        'Diagram and Table',
        'Remote Sensing',
    ]
    data = load(data_path)
    results = {}
    results['Overall'] = {}
    for task in TASKS:
        results[f'{task}'] = {}
        for subtask in SUBTASKS:
            results[f'{task}'][f'{subtask}'] = {}

    for i in range(len(data)):
        question = data.iloc[i]
        # 'category' encodes 'Task/Subtask'; 'l2-category' is the leaf bucket.
        Task = question['category'].split('/')[0]
        Subtask = question['category'].split('/')[1]
        Category = question['l2-category'].lower()
        # Collapse all attribute subtypes into a single '<x>/attribute' bucket.
        if 'attribute' in Category.lower():
            Category = Category.split('/')[0] + '/attribute'
        # Negative scores mark invalid/unscored items and are skipped.
        if question['score'] >= 0:
            cnt = question['score']
            if Category not in results[Task][Subtask].keys():
                results[Task][Subtask][f'{Category}'] = {'true': cnt, 'false': 1 - cnt}
            else:
                results[Task][Subtask][f'{Category}']['true'] += cnt
                results[Task][Subtask][f'{Category}']['false'] += 1 - cnt

    sum_all, succ_all = 0, 0
    for task, tasks_values in results.items():
        cnt_task, sum_task = 0, 0
        for substask, subtask_value in tasks_values.items():
            cnt_subtask, sum_subtask = 0, 0
            for category, category_dict in subtask_value.items():
                cnt_subtask += category_dict['true']
                sum_subtask += category_dict['false'] + category_dict['true']
                acc = category_dict['true'] / (category_dict['false'] + category_dict['true'])
                # Replace the count dict with the final accuracy scalar.
                results[task][substask][category] = acc
            if sum_subtask == 0:
                acc_subtasks = 0
            else:
                acc_subtasks = cnt_subtask / sum_subtask
            cnt_task += cnt_subtask
            sum_task += sum_subtask
            results[task][substask]['Avg'] = acc_subtasks
        if sum_task == 0:
            acc_task = 0
        else:
            acc_task = cnt_task / sum_task
        succ_all += cnt_task
        sum_all += sum_task
        results[task]['Avg'] = acc_task
    results['Overall'] = succ_all / sum_all
    return results
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+ from .multiple_choice import extract_answer_from_item
3
+ from PIL import Image, ImageOps
4
+ import torchvision
5
+ import random
6
+ import numbers
7
+ import math
8
+ import torch
9
+
10
+
11
def get_dimension_rating(data_path):
    """Tally per-task-type accuracy for MVBench-style results.

    Returns {task_type: [n_correct, n_total, 'xx.xx%'], ...,
    'overall': [n_correct, n_total, 'xx.xx%']}.
    """
    data = load(data_path)
    result_board = {}
    for _, row in data.iterrows():
        stats = result_board.setdefault(row['task_type'], [0, 0])
        stats[1] += 1
        if row['score']:
            stats[0] += 1

    correct = 0
    total = 0
    for stats in result_board.values():
        correct += stats[0]
        total += stats[1]
        stats.append(f'{stats[0] / stats[1] * 100 :.2f}%')

    result_board['overall'] = [correct, total, f'{correct / total * 100 :.2f}%']

    return result_board
31
+
32
+
33
def check_ans(pred, gt):
    """Loosely compare a predicted option against the ground truth.

    Both strings are lowercased and split on the first space; a hit is
    declared when the prediction's leading token (minus any '.') appears in
    the GT's leading token, or vice versa.
    """
    pred_list = pred.lower().strip().split(' ')
    pred_option = pred_list[0]
    gt_list = gt.lower().strip().split(' ')
    gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
    # BUGFIX: a single-word GT leaves gt_content empty, and the original
    # `gt_content[-1] == '.'` indexing raised IndexError; endswith is safe.
    if gt_content.endswith('.'):
        gt_content = gt_content[:-1]

    if pred_option.replace('.', '') in gt_option:
        return True
    if gt_option in pred_option:
        return True
    return False
49
+
50
+
51
def check_ans_with_model(pred, gt, model, item, dataset_name='MVBench'):
    """check_ans with a judge-LLM fallback: when the leading-token comparison
    fails, extract_answer_from_item is asked to map the prediction onto an
    option and the result is compared against item['answer']."""
    flag = False

    pred_list = pred.lower().strip().split(' ')
    pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
    gt_list = gt.lower().strip().split(' ')
    gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
    # NOTE(review): a single-word gt leaves gt_content == '' and this indexing
    # raises IndexError — confirm gt always carries option text here.
    if gt_content[-1] == '.':
        gt_content = gt_content[:-1]

    if pred_option.replace('.', '') in gt_option:
        flag = True
    elif gt_option in pred_option:
        flag = True
    elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
        flag = True

    return flag
69
+
70
+
71
def check_ans_advanced(pred, gt):
    """check_ans with two extras: a purely numeric GT answer is spelled out
    ('3' -> 'three'), and the GT answer text may match anywhere in the
    prediction (not just the leading token)."""
    number_table = {
        0: 'zero',
        1: 'one',
        2: 'two',
        3: 'three',
        4: 'four',
        5: 'five',
        6: 'six',
        7: 'seven',
        8: 'eight',
        9: 'nine',
    }

    pred_list = pred.lower().strip().split(' ')
    pred_option = pred_list[0]
    gt_list = gt.lower().strip().split(' ')
    gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
    # BUGFIX: a single-word GT leaves gt_content empty; the original
    # `gt_content[-1]` indexing raised IndexError.
    if gt_content.endswith('.'):
        gt_content = gt_content[:-1]

    # Spell out a purely numeric GT so e.g. 'three' in the prediction counts.
    # BUGFIX: the bare `except` (which also hid a leftover debug print) is
    # narrowed to the two conversions that can actually fail here.
    try:
        gt_content = number_table[int(gt_content.strip('. \n'))]
    except (ValueError, KeyError):
        pass

    if pred_option.replace('.', '') in gt_option:
        return True
    if gt_option in pred_option:
        return True
    # BUGFIX: require a non-empty gt_content — '' is a substring of anything
    # and would count every single-word GT as a hit.
    if gt_content and gt_content.lower().strip('. \n') in pred.lower().strip('. \n'):
        return True
    return False
107
+
108
+
109
class GroupRandomCrop(object):
    """Apply one shared random crop of `size` to every frame of a clip, so all
    frames are cropped at the same window."""

    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, img_group):
        w, h = img_group[0].size
        th, tw = self.size

        # Draw the crop window once; every frame reuses it.
        x1 = random.randint(0, w - tw)
        y1 = random.randint(0, h - th)

        cropped = []
        for img in img_group:
            assert img.size[0] == w and img.size[1] == h
            if w == tw and h == th:
                cropped.append(img)
            else:
                cropped.append(img.crop((x1, y1, x1 + tw, y1 + th)))

        return cropped
134
+
135
+
136
class MultiGroupRandomCrop(object):
    """Like GroupRandomCrop, but draws `groups` independent crop windows and
    applies each one to the whole clip, concatenating the results."""

    def __init__(self, size, groups=1):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
        self.groups = groups

    def __call__(self, img_group):
        w, h = img_group[0].size
        th, tw = self.size

        out_images = []
        # One fresh random window per group, drawn in the same RNG order as
        # the original implementation.
        for _ in range(self.groups):
            x1 = random.randint(0, w - tw)
            y1 = random.randint(0, h - th)
            for img in img_group:
                assert img.size[0] == w and img.size[1] == h
                if w == tw and h == th:
                    out_images.append(img)
                else:
                    out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))

        return out_images
163
+
164
+
165
class GroupCenterCrop(object):
    """Center-crop every frame of a clip with one shared torchvision
    CenterCrop worker."""

    def __init__(self, size):
        self.worker = torchvision.transforms.CenterCrop(size)

    def __call__(self, img_group):
        return [self.worker(frame) for frame in img_group]
171
+
172
+
173
class GroupRandomHorizontalFlip(object):
    """Randomly horizontally flips the given PIL.Image with a probability of 0.5.

    For optical-flow groups (is_flow=True) every even-indexed flipped frame is
    additionally inverted, since mirroring negates the x-flow component.
    """

    def __init__(self, is_flow=False):
        self.is_flow = is_flow

    def __call__(self, img_group, is_flow=False):
        # A single random draw decides the fate of the whole group.
        if random.random() >= 0.5:
            return img_group
        flipped = [frame.transpose(Image.FLIP_LEFT_RIGHT) for frame in img_group]
        if self.is_flow:
            # invert flow pixel values when flipping
            flipped = [
                ImageOps.invert(frame) if i % 2 == 0 else frame
                for i, frame in enumerate(flipped)
            ]
        return flipped
191
+
192
+
193
class GroupNormalize(object):
    """Normalize a stacked clip tensor in place along dim 0, tiling the given
    per-channel mean/std across the stacked frames."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        repeats_mean = tensor.size()[0] // len(self.mean)
        repeats_std = tensor.size()[0] // len(self.std)
        tiled_mean = self.mean * repeats_mean
        tiled_std = self.std * repeats_std

        # TODO: make efficient
        for channel, m, s in zip(tensor, tiled_mean, tiled_std):
            channel.sub_(m).div_(s)

        return tensor
207
+
208
+
209
class GroupScale(object):
    """ Rescales the input PIL.Image to the given 'size'.
    'size' will be the size of the smaller edge.
    For example, if height > width, then image will be
    rescaled to (size * height / width, size)
    size: size of the smaller edge
    interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        # One shared torchvision Resize worker for every frame.
        self.worker = torchvision.transforms.Resize(size, interpolation)

    def __call__(self, img_group):
        return [self.worker(frame) for frame in img_group]
223
+
224
+
225
class GroupOverSample(object):
    """10-crop-style oversampling for clips: the five fixed offsets from
    GroupMultiScaleCrop.fill_fix_offset (corners + center), each optionally
    followed by its horizontally flipped copy."""

    def __init__(self, crop_size, scale_size=None, flip=True):
        self.crop_size = crop_size if not isinstance(
            crop_size, int) else (crop_size, crop_size)

        # Optional rescale of the short side before cropping.
        if scale_size is not None:
            self.scale_worker = GroupScale(scale_size)
        else:
            self.scale_worker = None
        self.flip = flip

    def __call__(self, img_group):

        if self.scale_worker is not None:
            img_group = self.scale_worker(img_group)

        image_w, image_h = img_group[0].size
        crop_w, crop_h = self.crop_size

        # more_fix_crop=False -> exactly 5 offsets (4 corners + center).
        offsets = GroupMultiScaleCrop.fill_fix_offset(
            False, image_w, image_h, crop_w, crop_h)
        oversample_group = list()
        for o_w, o_h in offsets:
            normal_group = list()
            flip_group = list()
            for i, img in enumerate(img_group):
                crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
                normal_group.append(crop)
                flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)

                # Grayscale frames at even indices are treated as the x-flow
                # component, whose sign must be inverted when mirrored.
                if img.mode == 'L' and i % 2 == 0:
                    flip_group.append(ImageOps.invert(flip_crop))
                else:
                    flip_group.append(flip_crop)

            # Emit all normal crops for this offset, then (optionally) all flips.
            oversample_group.extend(normal_group)
            if self.flip:
                oversample_group.extend(flip_group)
        return oversample_group
264
+
265
+
266
class GroupFullResSample(object):
    """Full-resolution 3-crop sampling: left / right / center crops (plus
    their horizontal flips when `flip` is set), after an optional rescale."""

    def __init__(self, crop_size, scale_size=None, flip=True):
        self.crop_size = crop_size if not isinstance(
            crop_size, int) else (crop_size, crop_size)

        # Optional rescale of the short side before cropping.
        if scale_size is not None:
            self.scale_worker = GroupScale(scale_size)
        else:
            self.scale_worker = None
        self.flip = flip

    def __call__(self, img_group):

        if self.scale_worker is not None:
            img_group = self.scale_worker(img_group)

        image_w, image_h = img_group[0].size
        crop_w, crop_h = self.crop_size

        # Quarter-steps of the slack define the three crop positions.
        w_step = (image_w - crop_w) // 4
        h_step = (image_h - crop_h) // 4

        offsets = list()
        offsets.append((0 * w_step, 2 * h_step))  # left
        offsets.append((4 * w_step, 2 * h_step))  # right
        offsets.append((2 * w_step, 2 * h_step))  # center

        oversample_group = list()
        for o_w, o_h in offsets:
            normal_group = list()
            flip_group = list()
            for i, img in enumerate(img_group):
                crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
                normal_group.append(crop)
                if self.flip:
                    flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)

                    # Even-indexed grayscale frames are x-flow components:
                    # invert their values when mirrored.
                    if img.mode == 'L' and i % 2 == 0:
                        flip_group.append(ImageOps.invert(flip_crop))
                    else:
                        flip_group.append(flip_crop)

            oversample_group.extend(normal_group)
            oversample_group.extend(flip_group)
        return oversample_group
311
+
312
+
313
class GroupMultiScaleCrop(object):
    """Multi-scale cropping for clips: pick one crop size from `scales` of the
    short side (with limited aspect distortion) and one crop position, apply
    the same window to every frame, then resize to `input_size`.

    The RNG call order (random.choice on size pairs, then on offsets or two
    random.randint draws) is part of the reproducibility contract.
    """

    def __init__(self, input_size, scales=None, max_distort=1,
                 fix_crop=True, more_fix_crop=True):
        self.scales = scales if scales is not None else [1, .875, .75, .66]
        # Max allowed index distance between the chosen w-scale and h-scale.
        self.max_distort = max_distort
        # fix_crop: sample from a fixed offset grid instead of fully random.
        self.fix_crop = fix_crop
        self.more_fix_crop = more_fix_crop
        self.input_size = input_size if not isinstance(input_size, int) else [
            input_size, input_size]
        self.interpolation = Image.BILINEAR

    def __call__(self, img_group):

        im_size = img_group[0].size

        crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
        # Same window for every frame, then resize to the network input size.
        crop_img_group = [
            img.crop(
                (offset_w,
                 offset_h,
                 offset_w + crop_w,
                 offset_h + crop_h)) for img in img_group]
        ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation)
                         for img in crop_img_group]
        return ret_img_group

    def _sample_crop_size(self, im_size):
        """Randomly pick a (crop_w, crop_h, offset_w, offset_h) tuple."""
        image_w, image_h = im_size[0], im_size[1]

        # find a crop size
        base_size = min(image_w, image_h)
        crop_sizes = [int(base_size * x) for x in self.scales]
        # Snap sizes within 3px of the target input size onto it exactly.
        crop_h = [
            self.input_size[1] if abs(
                x - self.input_size[1]) < 3 else x for x in crop_sizes]
        crop_w = [
            self.input_size[0] if abs(
                x - self.input_size[0]) < 3 else x for x in crop_sizes]

        # Allow (w, h) scale pairs whose indices differ by <= max_distort.
        pairs = []
        for i, h in enumerate(crop_h):
            for j, w in enumerate(crop_w):
                if abs(i - j) <= self.max_distort:
                    pairs.append((w, h))

        crop_pair = random.choice(pairs)
        if not self.fix_crop:
            w_offset = random.randint(0, image_w - crop_pair[0])
            h_offset = random.randint(0, image_h - crop_pair[1])
        else:
            w_offset, h_offset = self._sample_fix_offset(
                image_w, image_h, crop_pair[0], crop_pair[1])

        return crop_pair[0], crop_pair[1], w_offset, h_offset

    def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
        """Pick one offset from the fixed grid built by fill_fix_offset."""
        offsets = self.fill_fix_offset(
            self.more_fix_crop, image_w, image_h, crop_w, crop_h)
        return random.choice(offsets)

    @staticmethod
    def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
        """Enumerate candidate crop offsets on a quarter-step grid: 5 basic
        positions (corners + center), extended to 13 when more_fix_crop."""
        w_step = (image_w - crop_w) // 4
        h_step = (image_h - crop_h) // 4

        ret = list()
        ret.append((0, 0))  # upper left
        ret.append((4 * w_step, 0))  # upper right
        ret.append((0, 4 * h_step))  # lower left
        ret.append((4 * w_step, 4 * h_step))  # lower right
        ret.append((2 * w_step, 2 * h_step))  # center

        if more_fix_crop:
            ret.append((0, 2 * h_step))  # center left
            ret.append((4 * w_step, 2 * h_step))  # center right
            ret.append((2 * w_step, 4 * h_step))  # lower center
            ret.append((2 * w_step, 0 * h_step))  # upper center

            ret.append((1 * w_step, 1 * h_step))  # upper left quarter
            ret.append((3 * w_step, 1 * h_step))  # upper right quarter
            ret.append((1 * w_step, 3 * h_step))  # lower left quarter
            ret.append((3 * w_step, 3 * h_step))  # lower righ quarter

        return ret
398
+
399
+
400
+ class GroupRandomSizedCrop(object):
401
+ """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size
402
+ and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio
403
+ This is popularly used to train the Inception networks
404
+ size: size of the smaller edge
405
+ interpolation: Default: PIL.Image.BILINEAR
406
+ """
407
+
408
+ def __init__(self, size, interpolation=Image.BILINEAR):
409
+ self.size = size
410
+ self.interpolation = interpolation
411
+
412
+ def __call__(self, img_group):
413
+ for attempt in range(10):
414
+ area = img_group[0].size[0] * img_group[0].size[1]
415
+ target_area = random.uniform(0.08, 1.0) * area
416
+ aspect_ratio = random.uniform(3. / 4, 4. / 3)
417
+
418
+ w = int(round(math.sqrt(target_area * aspect_ratio)))
419
+ h = int(round(math.sqrt(target_area / aspect_ratio)))
420
+
421
+ if random.random() < 0.5:
422
+ w, h = h, w
423
+
424
+ if w <= img_group[0].size[0] and h <= img_group[0].size[1]:
425
+ x1 = random.randint(0, img_group[0].size[0] - w)
426
+ y1 = random.randint(0, img_group[0].size[1] - h)
427
+ found = True
428
+ break
429
+ else:
430
+ found = False
431
+ x1 = 0
432
+ y1 = 0
433
+
434
+ if found:
435
+ out_group = list()
436
+ for img in img_group:
437
+ img = img.crop((x1, y1, x1 + w, y1 + h))
438
+ assert (img.size == (w, h))
439
+ out_group.append(
440
+ img.resize(
441
+ (self.size, self.size), self.interpolation))
442
+ return out_group
443
+ else:
444
+ # Fallback
445
+ scale = GroupScale(self.size, interpolation=self.interpolation)
446
+ crop = GroupRandomCrop(self.size)
447
+ return crop(scale(img_group))
448
+
449
+
450
+ class ConvertDataFormat(object):
451
+ def __init__(self, model_type):
452
+ self.model_type = model_type
453
+
454
+ def __call__(self, images):
455
+ if self.model_type == '2D':
456
+ return images
457
+ tc, h, w = images.size()
458
+ t = tc // 3
459
+ images = images.view(t, 3, h, w)
460
+ images = images.permute(1, 0, 2, 3)
461
+ return images
462
+
463
+
464
+ class Stack(object):
465
+
466
+ def __init__(self, roll=False):
467
+ self.roll = roll
468
+
469
+ def __call__(self, img_group):
470
+ if img_group[0].mode == 'L':
471
+ return np.concatenate([np.expand_dims(x, 2)
472
+ for x in img_group], axis=2)
473
+ elif img_group[0].mode == 'RGB':
474
+ if self.roll:
475
+ return np.concatenate([np.array(x)[:, :, ::-1]
476
+ for x in img_group], axis=2)
477
+ else:
478
+ # print(np.concatenate(img_group, axis=2).shape)
479
+ # print(img_group[0].shape)
480
+ return np.concatenate(img_group, axis=2)
481
+
482
+
483
+ class ToTorchFormatTensor(object):
484
+ """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
485
+ to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
486
+
487
+ def __init__(self, div=True):
488
+ self.div = div
489
+
490
+ def __call__(self, pic):
491
+ if isinstance(pic, np.ndarray):
492
+ # handle numpy array
493
+ img = torch.from_numpy(pic).permute(2, 0, 1).contiguous()
494
+ else:
495
+ # handle PIL Image
496
+ img = torch.ByteTensor(
497
+ torch.ByteStorage.from_buffer(
498
+ pic.tobytes()))
499
+ img = img.view(pic.size[1], pic.size[0], len(pic.mode))
500
+ # put it from HWC to CHW format
501
+ # yikes, this transpose takes 80% of the loading time/CPU
502
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
503
+ return img.float().div(255) if self.div else img.float()
504
+
505
+
506
+ class IdentityTransform(object):
507
+
508
+ def __call__(self, data):
509
+ return data
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/naturalbench.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
+ def extract_answer(output_string, task_type="yes_no"):
5
+ """
6
+ Extracts the answer from the output string based on the task type.
7
+
8
+ Parameters:
9
+ output_string (str): The output string.
10
+ task_type (str): The type of task. Must be either "yes_no" or "multiple_choice".
11
+
12
+ Returns:
13
+ int:
14
+ 1 if "yes" or "A"
15
+ 0 if "no" or "B"
16
+ -1 if no relevant answer is found.
17
+ Raises a ValueError if an unsupported task_type is provided.
18
+ """
19
+
20
+ def find_word_position(string, word):
21
+ pattern = r'\b' + re.escape(word) + r'\b'
22
+ match = re.search(pattern, string, re.IGNORECASE)
23
+ if match:
24
+ return match.start()
25
+ return -1
26
+
27
+ if task_type not in ["yes_no", "multiple_choice"]:
28
+ raise ValueError(f"Task type {task_type} not supported. Must be 'yes_no' or 'multiple_choice'.")
29
+
30
+ if task_type == "yes_no":
31
+ position_yes_and_a = find_word_position(output_string, "yes")
32
+ position_no_and_b = find_word_position(output_string, "no")
33
+ elif task_type == "multiple_choice":
34
+ position_yes_and_a = find_word_position(output_string, "A")
35
+ position_no_and_b = find_word_position(output_string, "B")
36
+
37
+ if position_yes_and_a == -1 and position_no_and_b == -1:
38
+ print(f"No answer found in the output string: {output_string}.")
39
+ return -1
40
+ elif position_yes_and_a != -1 and position_no_and_b != -1:
41
+ return 1 if position_yes_and_a < position_no_and_b else 0
42
+ else:
43
+ return 0 if position_yes_and_a == -1 else 1
44
+
45
+
46
+ def get_scores(scores):
47
+ """
48
+ Calculate various scores based on the given results.
49
+
50
+ Args:
51
+ scores (dict or list): A dictionary or list containing results where each result can be:
52
+ - dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...}
53
+ - list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...]
54
+
55
+ The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of questions and images:
56
+ - "q0_i0" means question_0 on image_0
57
+ - "q0_i1" means question_0 on image_1
58
+ - "q1_i0" means question_1 on image_0
59
+ - "q1_i1" means question_1 on image_1
60
+
61
+ Returns:
62
+ dict: A dictionary containing the calculated scores:
63
+ - 'Q_Acc': Average question score
64
+ - 'I_Acc': Average image score
65
+ - 'Acc': Average binary VQA score
66
+ - 'G_Acc': Average group score
67
+ """
68
+ Q_Acc = 0.0
69
+ I_Acc = 0.0
70
+ Acc = 0.0
71
+ G_Acc = 0.0
72
+
73
+ num_samples = len(scores)
74
+
75
+ def calculate_image_score(result):
76
+ image_correct = 0
77
+ if isinstance(result, dict):
78
+ if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0:
79
+ image_correct += 1
80
+ if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0:
81
+ image_correct += 1
82
+ elif isinstance(result, list):
83
+ if result[0] == 1.0 and result[2] == 0.0:
84
+ image_correct += 1
85
+ if result[3] == 1.0 and result[1] == 0.0:
86
+ image_correct += 1
87
+ return image_correct
88
+
89
+ def calculate_question_score(result):
90
+ text_correct = 0
91
+ if isinstance(result, dict):
92
+ if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0:
93
+ text_correct += 1
94
+ if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0:
95
+ text_correct += 1
96
+ else:
97
+ if result[0] == 1.0 and result[1] == 0.0:
98
+ text_correct += 1
99
+ if result[3] == 1.0 and result[2] == 0.0:
100
+ text_correct += 1
101
+ return text_correct
102
+
103
+ def calculate_binary_score(result):
104
+ binary_score_correct = 0
105
+ if isinstance(result, dict):
106
+ binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0
107
+ binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0
108
+ binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0
109
+ binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0
110
+ else:
111
+ binary_score_correct += 1 if result[0] == 1.0 else 0
112
+ binary_score_correct += 1 if result[1] == 0.0 else 0
113
+ binary_score_correct += 1 if result[2] == 0.0 else 0
114
+ binary_score_correct += 1 if result[3] == 1.0 else 0
115
+
116
+ return binary_score_correct
117
+
118
+ def calculate_group(result):
119
+ group_correct = 0
120
+ if calculate_question_score(result) == 2 and calculate_image_score(result) == 2:
121
+ group_correct += 1
122
+
123
+ return group_correct
124
+
125
+ if isinstance(scores, dict):
126
+ for _, result in scores.items():
127
+ Q_Acc += calculate_question_score(result)
128
+ I_Acc += calculate_image_score(result)
129
+ Acc += calculate_binary_score(result)
130
+ G_Acc += calculate_group(result)
131
+ else:
132
+ for result in scores:
133
+ Q_Acc += calculate_question_score(result)
134
+ I_Acc += calculate_image_score(result)
135
+ Acc += calculate_binary_score(result)
136
+ G_Acc += calculate_group(result)
137
+
138
+ results = {
139
+ 'Q_Acc': Q_Acc / float(num_samples * 2),
140
+ 'I_Acc': I_Acc / float(num_samples * 2),
141
+ 'Acc': Acc / float(num_samples * 4),
142
+ 'G_Acc': G_Acc / num_samples
143
+ }
144
+
145
+ return results
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/ocrbench.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+
3
+
4
+ def OCRBench_eval(eval_file):
5
+ OCRBench_score = {
6
+ 'Regular Text Recognition': 0,
7
+ 'Irregular Text Recognition': 0,
8
+ 'Artistic Text Recognition': 0,
9
+ 'Handwriting Recognition': 0,
10
+ 'Digit String Recognition': 0,
11
+ 'Non-Semantic Text Recognition': 0,
12
+ 'Scene Text-centric VQA': 0,
13
+ 'Doc-oriented VQA': 0,
14
+ 'Key Information Extraction': 0,
15
+ 'Handwritten Mathematical Expression Recognition': 0
16
+ }
17
+
18
+ logger = get_logger('Evaluation')
19
+
20
+ data = load(eval_file)
21
+ lt = len(data)
22
+ lines = [data.iloc[i] for i in range(lt)]
23
+ for i in tqdm(range(len(lines))):
24
+ line = lines[i]
25
+ predict = str(line['prediction'])
26
+ answers = eval(line['answer'])
27
+ category = line['category']
28
+ if category == 'Handwritten Mathematical Expression Recognition':
29
+ for j in range(len(answers)):
30
+ answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
31
+ predict = predict.strip().replace('\n', ' ').replace(' ', '')
32
+ if answer in predict:
33
+ OCRBench_score[category] += 1
34
+ break
35
+ else:
36
+ for j in range(len(answers)):
37
+ answer = answers[j].lower().strip().replace('\n', ' ')
38
+ predict = predict.lower().strip().replace('\n', ' ')
39
+ if answer in predict:
40
+ OCRBench_score[category] += 1
41
+ break
42
+
43
+ final_score_dict = {}
44
+ final_score_dict['Text Recognition'] = (
45
+ OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
46
+ + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
47
+ + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition']
48
+ )
49
+ final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
50
+ final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
51
+ final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
52
+ final_score_dict['Handwritten Mathematical Expression Recognition'] = \
53
+ OCRBench_score['Handwritten Mathematical Expression Recognition']
54
+ final_score_dict['Final Score'] = (
55
+ final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
56
+ + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
57
+ + final_score_dict['Handwritten Mathematical Expression Recognition']
58
+ )
59
+ final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10
60
+ score_pth = eval_file.replace('.xlsx', '_score.json')
61
+ dump(final_score_dict, score_pth)
62
+ logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
63
+ logger.info('Score: ')
64
+ for key, value in final_score_dict.items():
65
+ logger.info('{}:{}'.format(key, value))
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/olympiadbench.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ from math import isclose
4
+ import sympy as sp
5
+ from sympy import simplify, Eq, sympify, evalf, Pow
6
+ from sympy.parsing.latex import parse_latex
7
+ import antlr4
8
+ from decimal import Decimal, getcontext
9
+ from fractions import Fraction
10
+ import sys
11
+ import math
12
+
13
+
14
+ chinese_answer_type_dict = {
15
+ 'Numerical': '数值',
16
+ 'Expression': '表达式',
17
+ 'Equation': '方程',
18
+ 'Interval': '区间'
19
+ }
20
+ english_answer_type_dict = {
21
+ 'Numerical': 'a numerical value',
22
+ 'Expression': 'an expression',
23
+ 'Equation': 'an equation',
24
+ 'Interval': 'an interval'
25
+ }
26
+
27
+
28
+ def get_single_answer_type_text(answer_type, is_chinese):
29
+ if '-' in answer_type: # No need now
30
+ answer_type = answer_type[:answer_type.find('-')]
31
+ for t in ['Numerical', 'Expression', 'Equation', 'Interval']:
32
+ if t in answer_type:
33
+ if is_chinese:
34
+ return chinese_answer_type_dict[t]
35
+ else:
36
+ return english_answer_type_dict[t]
37
+ exit(f'Error parsing answer type {answer_type}!')
38
+
39
+
40
+ def get_answer_type_text(answer_type, is_chinese, multiple_answer):
41
+ # 'Tuple' has various meanings in different context, such as position or values of a series of variable,
42
+ # so it may lead to confusion to directly use 'tuple' in the prompt.
43
+ if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type):
44
+ full_answer_text = ''
45
+ else:
46
+ if not multiple_answer:
47
+ answer_text = get_single_answer_type_text(answer_type, is_chinese)
48
+ if is_chinese:
49
+ full_answer_text = f',答案类型为{answer_text}'
50
+ else:
51
+ full_answer_text = f"The answer of The problem should be {answer_text}. "
52
+ else:
53
+ if ',' not in answer_type: # Same answer type for all answers
54
+ answer_text = get_single_answer_type_text(answer_type, is_chinese)
55
+ if is_chinese:
56
+ full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
57
+ else:
58
+ full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
59
+ else:
60
+ answer_types = answer_type.split(',')
61
+ answer_types = [get_single_answer_type_text(t, is_chinese) for t in answer_types]
62
+ if len(set(answer_types)) == 1:
63
+ answer_text = answer_types[0]
64
+ if is_chinese:
65
+ full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
66
+ else:
67
+ full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
68
+ else:
69
+ if is_chinese:
70
+ answer_text = '、'.join(answer_types)
71
+ full_answer_text = f',题目有多个答案,答案类型分别为{answer_text}'
72
+ else:
73
+ answer_text = ', '.join(answer_types)
74
+ full_answer_text = (
75
+ f'The problem has multiple answers, with the answers in order being {answer_text}. '
76
+ )
77
+ return full_answer_text
78
+
79
+
80
+ def make_input(prompt, question_content):
81
+ # diversified based on the vllm, which is not implemented temporarily
82
+ input = prompt + '\n' + question_content
83
+ return input
84
+
85
+
86
+ sys.set_int_max_str_digits(1000000)
87
+ # 设置decimal的精度
88
+ getcontext().prec = 50
89
+
90
+
91
+ class MathJudger:
92
+ def __init__(self):
93
+ self.special_signal_map = {
94
+ "\\left": "",
95
+ "\\right": "",
96
+ "∶": ":",
97
+ ",": ",",
98
+ "$": "",
99
+ "\\approx": "=",
100
+ "\\simeq": "=",
101
+ "\\sim": "=",
102
+ "^\\prime": "'",
103
+ "^{\\prime}": "'",
104
+ "^\\circ": "",
105
+ "%": "",
106
+ }
107
+ self.pi = parse_latex("\\pi")
108
+ self.precision = 1e-8
109
+
110
+ def split_by_comma(self, expr: str):
111
+ in_bracket_num = 0
112
+ splitted_expr = []
113
+ start_idx = 0
114
+ for i, char in enumerate(expr):
115
+ if char == "(" or char == "[":
116
+ in_bracket_num += 1
117
+ elif char == ")" or char == "]":
118
+ in_bracket_num -= 1
119
+ elif char == "," and in_bracket_num == 0:
120
+ splitted_expr.append(expr[start_idx:i].strip())
121
+ start_idx = i + 1
122
+
123
+ if start_idx < len(expr):
124
+ splitted_expr.append(expr[start_idx:].strip())
125
+
126
+ return splitted_expr
127
+
128
+ def trans_plus_minus_sign(self, expr_list: list):
129
+ new_expr_list = []
130
+ for expr in expr_list:
131
+ if "\\pm" in expr:
132
+ new_expr_list.append(expr.replace("\\pm", "+"))
133
+ new_expr_list.append(expr.replace("\\pm", "-"))
134
+ else:
135
+ new_expr_list.append(expr)
136
+
137
+ return new_expr_list
138
+
139
+ def judge(self, expression1, expression2, precision=1e-8):
140
+ # (默认 expression1 为 Ground_Truth)
141
+ precision = precision if isinstance(precision, list) else [precision]
142
+
143
+ try:
144
+ expression1, expression2 = self.preprocess(expression1, expression2)
145
+ except:
146
+ return False
147
+ if expression1 == expression2:
148
+ # print("原生相等")
149
+ return True
150
+
151
+ # 去除字符串中的中文字符,因为上面已经判断过了类似回答为"能"或"不能"的含有中文字符的回答情况
152
+ expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1)
153
+ expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2)
154
+
155
+ expression1 = self.split_by_comma(expression1)
156
+ expression2 = self.split_by_comma(expression2)
157
+
158
+ temp_list1 = self.trans_plus_minus_sign(expression1)
159
+ temp_list2 = self.trans_plus_minus_sign(expression2)
160
+
161
+ # 设计误差值列表
162
+ if len(precision) <= 1:
163
+ precision = precision * len(temp_list1)
164
+
165
+ if len(temp_list1) != len(temp_list2):
166
+ return False
167
+
168
+ # 判断两个列表中的元素是否可以两两配对,并且两两相等,由此支持多个回答的比较
169
+ idx = -1
170
+ while len(temp_list1) != 0:
171
+ idx = (idx + 1) % len(temp_list1)
172
+
173
+ item1 = temp_list1[idx]
174
+ self.precision = precision[idx]
175
+ # print(self.precision)
176
+
177
+ for item2 in temp_list2:
178
+ if self.is_equal(item1, item2):
179
+ temp_list1.remove(item1)
180
+ temp_list2.remove(item2)
181
+ precision.remove(self.precision)
182
+ break
183
+ else:
184
+ # If we didn't break from the inner loop, it means no match was found
185
+ return False
186
+
187
+ # If all elements are matched and removed, the lists can be paired
188
+ return True
189
+
190
+ def is_interval(self, epr):
191
+ return epr.startswith(("(", "[")) and epr.endswith((")", "]"))
192
+
193
+ # 在进行数值计算前,需要将sympy中的pi符号替换为pi的近似数值
194
+ # def sympy_sub_pi(self, expression_sympy):
195
+ # return expression_sympy.subs(self.pi, math.pi)
196
+
197
+ # 默认第一个表达式是 ground_truth
198
+ def is_equal(self, expression1, expression2):
199
+ if expression1 == expression2 and expression1 != "" and expression2 != "":
200
+ # print("原生等价")
201
+ return True
202
+
203
+ # 先判断是否是两个区间,是的话进行判断相等,不相等则返回 False
204
+ if self.is_interval(expression1) and self.is_interval(expression2):
205
+ try:
206
+ if self.interval_equal(expression1, expression2):
207
+ # print("区间等价")
208
+ return True
209
+ except:
210
+ return False
211
+
212
+ # 再判断是否在数值上相等
213
+ try:
214
+ if self.numerical_equal(expression1, expression2):
215
+ # print("数值等价")
216
+ return True
217
+ except:
218
+ pass
219
+
220
+ # 再判断是否是表达式相等
221
+ try:
222
+ if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
223
+ # print("表达式等价")
224
+ return True
225
+ except:
226
+ pass
227
+
228
+ # 再判断是否是等式相等
229
+ try:
230
+ if self.equation_equal(expression1, expression2):
231
+ # print("等式等价")
232
+ return True
233
+ except:
234
+ pass
235
+
236
+ return False
237
+
238
+ # 判断两个数值在误差允许范围内是否相等
239
+ def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
240
+ """
241
+ (默认 expression1 为 Ground_Truth)
242
+ 函数: 判读两个数值是否在误差允许范围内相等
243
+ 步骤1: 将可能出现的百分号的情况包含进来
244
+ 步骤2: 使用 math.isclose 函数判断是否相等
245
+ """
246
+ reference = float(expression1)
247
+ prediction = float(expression2)
248
+
249
+ if include_percentage:
250
+ gt_result = [reference / 100, reference, reference * 100]
251
+ else:
252
+ gt_result = [reference]
253
+
254
+ for item in gt_result:
255
+ # if isclose(item, prediction, abs_tol=self.precision, rel_tol=0):
256
+ if abs(item - prediction) <= self.precision * 1.01:
257
+ return True
258
+ return False
259
+
260
+ def expression_equal(self, exp1, exp2):
261
+ """
262
+ (默认 expression1 为 Ground_Truth)
263
+ 函数: 判断两个表达式是否在数学意义上等价
264
+ 步骤1: 提取表达式, 防止有的模型会给出"x=1"而不是"1"
265
+ 步骤2: 使用 sympy 库进行等价判断
266
+ """
267
+
268
+ # 只提取等号右边的表达式,一般左边是所求的量
269
+ def extract_expression(expression):
270
+ if "=" in expression:
271
+ expression = expression.split("=")[1]
272
+ return expression.strip()
273
+
274
+ exp1 = extract_expression(exp1)
275
+ exp2 = extract_expression(exp2)
276
+
277
+ exp_too_long = len(exp1) > 300 or len(exp2) > 300
278
+
279
+ # 将表达式转换为 sympy 中能够进行处理的格式
280
+ expr1_sym = sympify(parse_latex(exp1))
281
+ expr2_sym = sympify(parse_latex(exp2))
282
+
283
+ if expr1_sym == expr2_sym:
284
+ return True
285
+ else:
286
+ expr1_sym = self.sympy_sub_pi(expr1_sym)
287
+ expr2_sym = self.sympy_sub_pi(expr2_sym)
288
+ # 如果输入的表达式可以计算出具体数值的话,则将其进行数值计算的比较
289
+
290
+ if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (
291
+ not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
292
+ return False
293
+ elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
294
+ try:
295
+ if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
296
+ print(
297
+ "These two number can not be calculated by current computer for: "
298
+ f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\""
299
+ )
300
+ return False
301
+ if exp_too_long:
302
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
303
+ return False
304
+
305
+ if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
306
+ return True
307
+ else:
308
+ return False
309
+ except:
310
+ return False
311
+ elif exp_too_long:
312
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
313
+ return False
314
+ else:
315
+ try:
316
+ simplified_expr = simplify(expr1_sym - expr2_sym)
317
+
318
+ num_value = simplified_expr.evalf()
319
+
320
+ return abs(num_value) < 1e-3
321
+ except:
322
+ return False
323
+
324
+ def equation_equal(self, expression1, expression2):
325
+ """
326
+ (默认 expression1 为 Ground_Truth)
327
+ 函数: 判断两个方程是否在数学意义上等价
328
+ 步骤1: 将一个方程/等式化简为标准方程, 即等式的右边严格等于0, 接下来只需要判断两个等式的左边是否"等价"
329
+ 步骤2: 使用 sympy 库计算两个等式左边的商, 如果这个商或者这个商的倒数为整数, 那么数学意义上我们可以推导出这两个方程等价👌
330
+ """
331
+
332
+ # 将等式的右边都移到左边,并返回一个 sympy 格式的表达式
333
+ def simplify_equation(latex_eq):
334
+ # 分割等式的左边和右边
335
+ lhs, rhs = latex_eq.split('=')
336
+
337
+ # 使用 parse_latex 解析 LaTeX 表达式
338
+ lhs_expr = parse_latex(lhs)
339
+ rhs_expr = parse_latex(rhs)
340
+
341
+ # 创建等式对象
342
+ equation = Eq(lhs_expr, rhs_expr)
343
+
344
+ # 化简等式:将等式右边移到左边
345
+ simplified_eq = simplify(equation.lhs - equation.rhs)
346
+
347
+ return simplified_eq
348
+
349
+ expr1_sym = simplify_equation(expression1)
350
+ expr2_sym = simplify_equation(expression2)
351
+
352
+ division_result_1 = simplify(expr1_sym / expr2_sym)
353
+ division_result_2 = simplify(expr2_sym / expr1_sym)
354
+
355
+ # 如果两个方程转换后的式子相除为整数 且非零,则根据推导可知这两个方程等价
356
+ if (division_result_1.is_Integer and division_result_1 != 0) or (
357
+ division_result_2.is_Integer and division_result_2 != 0):
358
+ return True
359
+ else:
360
+ return False
361
+
362
+ def interval_equal(self, expression1, expression2):
363
+ # 函数: 判断两个区间是否在数学意义上等价
364
+ # 步骤1: 简化区间的表达式, 去除无关的符号比如"\left", "\right", 同时将可能出现的"x \in"删去
365
+ # 步骤2: 对比两个区间的左右符号、中间出现的数学表达式等是否一致
366
+
367
+ def compare_two_interval(inter1, inter2):
368
+
369
+ # 首先比较两边的括号是否一致,一致的话再进行下一步比较
370
+ if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
371
+ return False
372
+
373
+ inter1 = inter1.strip('[]()')
374
+ inter2 = inter2.strip('[]()')
375
+
376
+ # 分割区间的左右部分
377
+ items_1 = inter1.split(',')
378
+ items_2 = inter2.split(',')
379
+
380
+ for item_1, item_2 in zip(items_1, items_2):
381
+ if not self.expression_equal(item_1, item_2):
382
+ return False
383
+ return True
384
+
385
+ interval1 = expression1
386
+ interval2 = expression2
387
+
388
+ if interval1 == interval2:
389
+ return True
390
+ else:
391
+ inter_list1 = interval1.split("\\cup")
392
+ inter_list2 = interval2.split("\\cup")
393
+
394
+ if len(inter_list1) != len(inter_list2):
395
+ return False
396
+ else:
397
+ for inter1, inter2 in zip(inter_list1, inter_list2):
398
+ if not compare_two_interval(inter1, inter2):
399
+ return False
400
+ return True
401
+
402
+ def preprocess(self, expression1, expression2):
403
+
404
+ # 尝试捕获box中的内容,如果有多个则以逗号相连返回,如果一个都没有,则报错
405
+ def extract_boxed_content(latex_str):
406
+ # 查找所有的 \boxed{...} 结构
407
+ boxed_matches = re.finditer(r'\\boxed{', latex_str)
408
+ results = ""
409
+
410
+ for match in boxed_matches:
411
+ start_index = match.end()
412
+ end_index = start_index
413
+ stack = 1
414
+
415
+ # 从 \boxed{ 之后开始搜索,直到找到对应的闭合括号
416
+ while stack > 0 and end_index < len(latex_str):
417
+ if latex_str[end_index] == '{':
418
+ stack += 1
419
+ elif latex_str[end_index] == '}':
420
+ stack -= 1
421
+ end_index += 1
422
+
423
+ if stack == 0:
424
+ # 提取 \boxed{} 内部的内容
425
+ content = latex_str[start_index:end_index - 1]
426
+ results += content + ","
427
+ else:
428
+ # 如果括号没有正确闭合,则返回错误信息
429
+ raise ValueError("Mismatched braces in LaTeX string.")
430
+
431
+ # 如果没有匹配到'\boxed{}'字符,则默认提取有内容的文字最后一行中的所有公式部分
432
+ if results == "":
433
+ last_line_ans = latex_str.strip().split("\n")[-1]
434
+ dollar_pattern = r"\$(.*?)\$"
435
+ answers = re.findall(dollar_pattern, last_line_ans)
436
+
437
+ if answers:
438
+ for ans in answers:
439
+ results += ans + ","
440
+ else:
441
+ results = latex_str
442
+
443
+ return results
444
+
445
+ def sepcial_symbol_replace(expression):
446
+ if "\\in " in expression:
447
+ expression = expression.split("\\in ")[1]
448
+
449
+ # 进行特殊字符的替换,这些字符都不影响latex的解析,属于美观/修饰性字符
450
+ for signal in self.special_signal_map:
451
+ expression = expression.replace(signal, self.special_signal_map[signal])
452
+
453
+ expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。")
454
+
455
+ pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
456
+ expression = re.sub(pattern, r'\1', expression)
457
+
458
+ return expression
459
+
460
+ exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
461
+ exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)
462
+
463
+ return exp1, exp2
464
+
465
+ def can_compute_power(self, expr):
466
+ """
467
+ Check if the power expression can be computed.
468
+
469
+ Parameters:
470
+ expr (sympy expression): The expression to check.
471
+
472
+ Returns:
473
+ bool: True if the expression can be computed, False otherwise.
474
+ """
475
+ # Check if the expression is a power expression
476
+ if isinstance(expr, Pow):
477
+ # Extract the base and the exponent
478
+ base, exp = expr.as_base_exp()
479
+
480
+ # Check if the base and the exponent are numbers
481
+ if base.is_number and exp.is_number:
482
+ # Set a threshold for the maximum size of the exponent
483
+ MAX_EXP = 1000 # This threshold can be adjusted based on the computing environment
484
+
485
+ # Check if the exponent is greater than the threshold
486
+ if abs(exp.evalf()) > MAX_EXP:
487
+ return False
488
+ else:
489
+ return True
490
+ else:
491
+ # If the base or the exponent is not a number, we cannot compute the power
492
+ return False
493
+ else:
494
+ # If the expression is not a power expression, return True as it is not the case we are checking for
495
+ return True
496
+
497
+
498
+ def extract_answer(is_chinese, model_output, is_deepseek=False):
499
+ # deepseekmath has special answering format
500
+ if str(model_output) == 'nan':
501
+ model_output = 'nan'
502
+
503
+ if is_deepseek:
504
+ if is_chinese:
505
+ matches = re.findall('## 解题答案(.*)', model_output)
506
+ else:
507
+ matches = re.findall('The answer is: (.*)', model_output)
508
+
509
+ # 检测是否至少找到一个匹配,如果没有就直接整个送进去找\boxed{}
510
+ if matches:
511
+ # 如果找到多个匹配,取最后一个
512
+ model_answer = matches[-1].strip()
513
+ return model_answer
514
+ else:
515
+ return model_output
516
+
517
+ if is_chinese:
518
+ matches = re.findall('所以最终答案是(.*)', model_output)
519
+ else:
520
+ matches = re.findall('So the final answer is (.*)', model_output)
521
+
522
+ # 检测是否至少找到一个匹配,如果没有就直接整个送进去找\boxed{}
523
+ if matches:
524
+ # 如果找到多个匹配,取最后一个
525
+ model_answer = matches[-1].strip()
526
+ return model_answer
527
+ else:
528
+ return model_output
529
+
530
+
531
+ def calculate_merged_accuracy(reference_dir, text_only):
532
+ pass
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/qspatial.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+ from ...utils import can_infer
3
+
4
+
5
# Sentinel string the judge-API wrapper returns when a request fails;
# QSpatial_auxeval compares against it in its retry loop.
FAIL_MSG = 'Failed to obtain answer via API.'
6
+
7
+
8
def get_gpt4_ICE_for_qspatial():
    """Return five in-context examples that teach the judge LLM to extract
    a ``(value, unit)`` tuple from a free-form QSpatial model response.

    NOTE(review): the example text below is part of the runtime judge
    prompt and is reproduced verbatim — do not edit its wording.
    """
    example_1 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, m), (2.2, cm), (3.12, meter), at the end.\n
Model response: **Object Identification**

* The object in question is a chair.
* The chair is not visible in the image.

**Conclusion**

The height of the chair cannot be determined from the provided image.\n
Extracted answer: (0, cm)
"""

    example_2 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, inch), (1.2, cm), (3.0, feet), at the end.\n
Model response: **Step 1: Identify the stapler and the recycle bin in the image.**

The stapler is located on the wooden table, and the recycle bin is located on the floor.

**Step 2: Determine the distance between the stapler and the recycle bin.**

The stapler is 0.5 meters from the edge of the table, and the recycle bin is 1.5 meters from the edge of the table.
Therefore, the minimum distance between the stapler and the recycle bin is 1.5 - 0.5 = 1 meter.

**Answer:** 1 m\n
Extracted answer: (1, m)
"""
    example_3 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, foot), (2, cm), (4.3, meter), at the end.\n
Model response: The mirror in the image is approximately 5 feet 4 inches tall.\n
Extracted answer: (64, inch)
"""
    example_4 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (0.1, cm), (2.9, cm), (0.3, meter), at the end.\n
Model response: The minimum distance between the wooden chair and the chair near the camera in the image is 1.7 feet.\n
Extracted answer: (1.7, feet)
"""
    example_5 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (5.1, cm), (0.9, cm), (55, mm), at the end.\n
Model response: The height of the painting's bottom edge from the floor is approximately 4.5 feet.\n
Extracted answer: (4.5, feet)
"""
    return [example_1, example_2, example_3, example_4, example_5]
57
+
58
+
59
def list_to_dict(lst):
    """Map option letters ('A', 'B', ...) onto the entries of *lst*."""
    letters = (chr(ord('A') + idx) for idx in range(len(lst)))
    return dict(zip(letters, lst))
61
+
62
+
63
def post_check(line, prefetch=False):
    """Check one record's extracted answer against the ground truth.

    Args:
        line: mapping with 'question_type', 'answer', and either
            'prediction' (prefetch mode) or 'res' (judged extraction);
            multi-choice records also carry 'answer_option' and 'choices'.
        prefetch: when True, return the inferred/parsed answer itself
            instead of a boolean.

    Returns:
        prefetch=True  -> the extracted answer on success, else False.
        prefetch=False -> True when the answer matches, else False.
    """
    res = None
    ans = line['answer']
    response = line['prediction'] if prefetch else line['res']
    try:
        if line['question_type'] == 'multi_choice':
            ans = line['answer_option']
            choices = list_to_dict(eval(line['choices']))
            res = can_infer(response, choices)
            if prefetch:
                return res
        else:
            if line['answer_type'] == 'integer':
                res = int(response)
                ans = int(line['answer'])
            elif line['answer_type'] == 'float':
                res = float(response)
                ans = float(line['answer'])
            else:
                # BUGFIX: was `res = str(res)`, which stringified the
                # initial None instead of the model's response, so every
                # free-text answer compared as 'None'.
                res = str(response)
                ans = str(ans)
    except ValueError:
        # Unparseable numeric response: fall through with res unset/partial.
        pass

    if res == ans:
        return res if prefetch else True
    else:
        return False
91
+
92
+
93
def build_qspatial_gpt4_prompt(line):
    """Assemble the few-shot judge prompt that asks GPT to extract a
    ``(value, unit)`` tuple from the raw prediction in *line*."""
    task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
    prediction = str(line['prediction'])
    prompt = task_description
    for example in get_gpt4_ICE_for_qspatial():
        prompt += example + '\n'
    # BUGFIX: the trailing cue previously read 'Model respone:', which did
    # not match the 'Model response:' label used in every ICE example and
    # weakened the extraction pattern.
    prompt += 'Model response: ' + prediction
    prompt += '\nExtracted answer:'
    return prompt
106
+
107
+
108
def QSpatial_auxeval(model, line):
    """Query the judge model to extract the answer tuple, retrying up to
    five times with increasing temperature when the API call fails.

    Returns dict(log=..., res=...) where res is '' after five failures.
    """
    prompt = build_qspatial_gpt4_prompt(line)
    prediction = line['prediction']
    log = ''
    for attempt in range(5):
        res = model.generate(prompt, temperature=attempt * 0.5)
        if FAIL_MSG not in res:
            log += 'Succeed'
            return dict(log=log, res=res)
        log += f'Try {attempt}: output is {prediction}, failed to parse.\n'
    log += 'All 5 retries failed.\n'
    return dict(log=log, res='')
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/tablevqabench.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copied from https://github.com/allenai/allennlp-semparse
3
+ Modified from https://github.com/naver-ai/tablevqabench
4
+ """
5
+
6
+ import re
7
+ import unicodedata
8
+ import time
9
+
10
+ from abc import ABCMeta, abstractmethod
11
+ from math import isinf, isnan
12
+
13
+
14
# Vision Prompts
# Few-shot instruction templates sent to the vision-language model; the
# '{question}' placeholder is filled per sample. These are runtime strings:
# keep the wording byte-stable (including 'on a image' in
# FINTABNETQA_PROMPT) unless the benchmark protocol itself changes.
VWTQ_PROMPT = (
    'You are asked to answer questions asked on an image.\n'
    'You should answer the question with a single word.\n'
    'Example: \n'
    'Question: what was the only year mr. wu competed in the olympic games?\n'
    'Answer: 2004\n'
    'Question: which township in pope county, arkansas has the least amount of water area?\n'
    'Answer: Freeman\n'
    'If you have multiple answers, please separate them with || marks. Example: Apple||Banana||Tomato\n\n'
    'Question: {question}\n'
    'Answer:'
)

VTABFACT_PROMPT = (
    'You are asked to answer whether the statement is True or False based on given image\n'
    'You should only answer True or False.\n'
    'Example: \n'
    'Statement: the milwaukee buck win 6 game in the 2010 - 11 season\n'
    'Answer: True\n'
    'Statement: only the top team score above the average of 8.8\n'
    'Answer: False\n\n'
    'Statement: {question}\n'
    'Answer:'
)

FINTABNETQA_PROMPT = (
    'You are asked to answer questions asked on a image.\n'
    'You should answer the question within a single word or few words.\n'
    'If units can be known, the answer should include units such as $, %, million and etc.\n'
    'Example: \n'
    'Question: What were the total financing originations for the fiscal year ended October 31, 2004?\n'
    'Answer: $3,852 million\n'
    'Question: What is the time period represented in the table?\n'
    'Answer: October 31\n'
    'Question: What was the percentage of net sales for selling, general and administrative expenses in 2006?\n'
    'Answer: 34.2%\n'
    'Question: {question}\n'
    'Answer:'
)
54
+
55
+
56
def evaluate_tabfact(data, score_keys):
    """Score TabFact-style True/False predictions in place.

    Each instance gets instance['scores'] = {score_keys[0]: 0/1/None};
    None marks predictions containing both 'true' and 'false', which are
    left for manual review. Returns a metadata dict whose
    'average_scores' holds the overall accuracy as a percentage.
    """
    started = time.time()
    total = correct = ambiguous = 0
    for item in data:
        if item['prediction'] is None:
            item['prediction'] = 'none'
        guess = item['prediction'].lower()
        label = item['answer']
        total += 1
        if 'true' in guess and 'false' in guess:
            # Both polarities present: leave unscored for manual check.
            ambiguous += 1
            verdict = None
        elif ('true' in guess and label == '1') or ('false' in guess and label == '0'):
            correct += 1
            verdict = 1
        else:
            verdict = 0
        item['scores'] = {score_keys[0]: verdict}
    if ambiguous > 0:
        print(f'the number of not properly parsed samples: {ambiguous}')
    # Epsilons guard against 0/0 on an empty dataset.
    accuracy = round((correct + 1e-9) / (total + 1e-9), 8) * 100
    return {
        'evaluators': 'correctness',
        'score_info': [score_keys[0]],
        'evaluated_time': time.time() - started,
        'total_num_sample': len(data),
        'average_scores': [accuracy],
    }
92
+
93
+
94
def evaluate_wtq(data, score_keys):
    """Score WTQ denotation predictions in place and return summary metadata.

    Predictions use '||' as the multi-answer separator; it is folded into
    the single '|' TSV convention before unescaping. Each instance gets
    instance['scores'] = {score_keys[0]: 0/1}.
    """
    started = time.time()
    total = correct = 0
    for item in data:
        gold_values = to_value_list(tsv_unescape_list(item['answer']))
        pred_values = to_value_list(tsv_unescape_list(item['prediction'].replace('||', '|')))
        hit = 1 if check_denotation(gold_values, pred_values) else 0
        total += 1
        correct += hit
        item['scores'] = {score_keys[0]: hit}
    # Epsilons guard against 0/0 on an empty dataset.
    accuracy = round((correct + 1e-9) / (total + 1e-9), 8) * 100
    return {
        'evaluators': 'correctness',
        'score_info': [score_keys[0]],
        'evaluated_time': time.time() - started,
        'total_num_sample': len(data),
        'average_scores': [accuracy],
    }
127
+
128
+
129
def evaluate_fintabnet(data, score_keys):
    """Score FinTabNetQA predictions in place.

    Each instance gets two scores: a strict exact match ('exact_score')
    and a relieved match (score_keys[0]) that also accepts unit-free
    normalised variants. 'average_scores' lists [relieved, exact].
    """
    started = time.time()
    total = strict_hits = relieved_hits = 0
    for item in data:
        pred_main, pred_variants = fintabnet_normalize(item['prediction'])
        gold_main, gold_variants = fintabnet_normalize(item['answer'])
        strict = 1 if gold_main == pred_main else 0
        relieved = 1 if any(p == g for p in pred_variants for g in gold_variants) else 0
        total += 1
        strict_hits += strict
        relieved_hits += relieved
        item['scores'] = {score_keys[0]: relieved, 'exact_score': strict}
    # Epsilons guard against 0/0 on an empty dataset.
    exact_acc = round((strict_hits + 1e-9) / (total + 1e-9), 8) * 100
    relieved_acc = round((relieved_hits + 1e-9) / (total + 1e-9), 8) * 100
    return {
        'evaluators': 'correctness',
        'score_info': ['relieved_accuracy', score_keys[0]],
        'evaluated_time': time.time() - started,
        'total_num_sample': len(data),
        'average_scores': [relieved_acc, exact_acc],
    }
160
+
161
+
162
def fintabnet_normalize(s):
    """Normalize a FinTabNetQA answer string for numeric comparison.

    Returns a (primary, variants) pair: *primary* is the fully converted
    value (a float when conversion succeeds, otherwise the cleaned
    string) and *variants* additionally includes a unit-free reading so
    callers can perform a relieved match.

    NOTE(review): the order of entries in `unit_conversion` matters —
    the space-prefixed patterns must run before the bare ones so the
    leading space is consumed together with the unit word.
    """
    s = normalize(s)
    remove_words = [
        'dollar', 'gallons', 'square feet', 'shares', 'mbtu',
        'mbpd', 'mbbls', 'mmbtu', 'unit', 'gwh', 'year', 'mmcf', 'mile', 'mboe'
    ]

    # Data specific filtering using regular expressions
    # Remove special characters like $, (, and )
    s = re.sub(r'[\$\(\),]', '', s)

    # Replace "dollar" with empty string if it's not part of another word
    pattern = r'\b(' + '|'.join(remove_words) + r')s?\b'
    s = re.sub(pattern, '', s, flags=re.IGNORECASE)

    # Unit conversion dictionary with regex patterns for flexibility
    unit_conversion = {
        r' \bthousand\b': 'e3',
        r' \bmillion\b': 'e6',
        r' \bbillion\b': 'e9',
        r'\bthousand\b': 'e3',
        r'\bmillion\b': 'e6',
        r'\bbillion\b': 'e9',
        r' ?%': 'e-2',
    }

    # Convert percentages to their decimal representation.
    # Applying this after unit_conversion prevents "percent" from being processed
    # in cases like "million %", which would be incorrect.
    # s = re.sub(r' ?%', 'e-2', s)
    # s_percent = re.sub(r' ?%', '', s_percent)

    s_unit_free = s

    # Iterate over unit_conversion and apply transformations
    for pattern, value in unit_conversion.items():
        s = re.sub(pattern, value, s)
        s_unit_free = re.sub(pattern, '', s_unit_free)

    # Attempt to convert to float
    try:
        return float(s), [float(s), float(s_unit_free)]
    except ValueError:
        # Return the original string and the error for debugging purposes
        return s, [s, s_unit_free]
207
+
208
+
209
def normalize(x):
    """Canonicalise a cell/answer string for comparison.

    Strips diacritics, unifies typographic quote and dash variants,
    repeatedly peels trailing citations / parentheticals / outermost
    quotes until stable, drops a final period, and collapses whitespace
    to single spaces in lower case.
    """
    if not isinstance(x, str):
        x = x.decode('utf8', errors='ignore')
    # Decompose, then drop combining marks (Unicode category 'Mn').
    decomposed = unicodedata.normalize('NFKD', x)
    x = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Unify quote and dash look-alikes.
    for pattern, repl in ((r'[‘’´`]', "'"), (r'[“”]', '"'), (r'[‐‑‒–—−]', '-')):
        x = re.sub(pattern, repl, x)
    # Iterate until stable: each pass may expose another removable suffix.
    previous = None
    while x != previous:
        previous = x
        # Trailing citations / footnote markers.
        x = re.sub(r'((?<!^)\[[^\]]*\]|\[\d+\]|[•♦†‡*#+])*$', '', x.strip())
        # Trailing parenthesised details.
        x = re.sub(r'(?<!^)( \([^)]*\))*$', '', x.strip())
        # Outermost quotation marks.
        x = re.sub(r'^"([^"]*)"$', r'\1', x.strip())
    if x.endswith('.'):
        x = x[:-1]
    return re.sub(r'\s+', ' ', x, flags=re.U).lower().strip()
236
+
237
+
238
+ # Value Types
239
class Value(metaclass=ABCMeta):
    """Abstract denotation value (string / number / date).

    Py3 FIX: the original set ``__metaclass__ = ABCMeta``, which is
    Python-2 syntax and has no effect in Python 3, so ``match`` was never
    enforced as abstract. Declaring the metaclass properly restores that
    enforcement; subclasses that implement ``match`` are unaffected.
    """

    # Subclass constructors populate this with the normalized string form.
    _normalized = None

    @abstractmethod
    def match(self, other):
        """Return True if the value matches the other value.

        Args:
            other (Value)
        Returns:
            a boolean
        """

    @property
    def normalized(self):
        return self._normalized
259
+
260
+
261
class StringValue(Value):
    """A denotation value compared purely by normalized string equality."""

    def __init__(self, content):
        assert isinstance(content, str)
        self._normalized = normalize(content)
        # Hash is precomputed: these objects live in sets during scoring.
        self._hash = hash(self._normalized)

    def match(self, other):
        """A string matches anything whose normalized form is equal."""
        assert isinstance(other, Value)
        return self.normalized == other.normalized

    def __eq__(self, other):
        return isinstance(other, StringValue) and self.normalized == other.normalized

    def __hash__(self):
        return self._hash

    def __str__(self):
        return 'S' + str([self.normalized])

    __repr__ = __str__
282
+
283
+
284
class NumberValue(Value):
    """A numeric denotation value; near-integers are stored as int."""

    def __init__(self, amount, original_string=None):
        assert isinstance(amount, (int, float))
        # Snap values within 1e-6 of an integer to int so 42.0 and 42
        # normalize to the same string.
        if abs(amount - round(amount)) < 1e-6:
            self._amount = int(amount)
        else:
            self._amount = float(amount)
        if not original_string:
            self._normalized = str(self._amount)
        else:
            self._normalized = normalize(original_string)
        self._hash = hash(self._amount)

    @property
    def amount(self):
        return self._amount

    def __eq__(self, other):
        return isinstance(other, NumberValue) and self.amount == other.amount

    def __hash__(self):
        return self._hash

    def __str__(self):
        return 'N({})'.format(self.amount) + str([self.normalized])

    def __repr__(self):
        return self.__str__()

    def match(self, other):
        # Match on normalized string first, then numeric closeness (1e-6).
        assert isinstance(other, Value)
        if self.normalized == other.normalized:
            return True
        if isinstance(other, NumberValue):
            return abs(self.amount - other.amount) < 1e-6
        return False

    @staticmethod
    def parse(text):
        """Try to parse into a number.

        Return:
            the number (int or float) if successful; otherwise None.
        """
        try:
            return int(text)
        except ValueError:
            try:
                amount = float(text)
                # Reject NaN/inf so equality checks stay meaningful.
                # NOTE(review): an AssertionError here propagates (only
                # ValueError is caught) — confirm that is intended.
                assert not isnan(amount) and not isinf(amount)
                return amount
            except ValueError:
                return None
337
+
338
+
339
class DateValue(Value):
    """A (year, month, day) denotation value; -1 marks an unknown field."""

    def __init__(self, year, month, day, original_string=None):
        """Create a new DateValue. Placeholders are marked as -1."""
        assert isinstance(year, int)
        assert isinstance(month, int) and (month == -1 or 1 <= month <= 12)
        assert isinstance(day, int) and (day == -1 or 1 <= day <= 31)
        assert not (year == month == day == -1)
        self._year = year
        self._month = month
        self._day = day
        if not original_string:
            # BUGFIX: the day placeholder was compared against the string
            # '-1' (always unequal to an int), so an unknown day rendered
            # as '-1' instead of 'xx' in the normalized form.
            self._normalized = '{}-{}-{}'.format(
                year if year != -1 else 'xx',
                month if month != -1 else 'xx',
                day if day != -1 else 'xx',
            )
        else:
            self._normalized = normalize(original_string)
        self._hash = hash((self._year, self._month, self._day))

    @property
    def ymd(self):
        return (self._year, self._month, self._day)

    def __eq__(self, other):
        return isinstance(other, DateValue) and self.ymd == other.ymd

    def __hash__(self):
        return self._hash

    def __str__(self):
        return ('D(%d,%d,%d)' % (self._year, self._month, self._day)) + str(
            [self._normalized]
        )

    __repr__ = __str__

    def match(self, other):
        """Match on normalized string, or exact (year, month, day) equality."""
        assert isinstance(other, Value)
        if self.normalized == other.normalized:
            return True
        if isinstance(other, DateValue):
            return self.ymd == other.ymd
        return False

    @staticmethod
    def parse(text):
        """Try to parse 'y-m-d' (with 'xx'/'xxxx' placeholders) into a date.

        Return:
            tuple (year, month, date) if successful; otherwise None.
        """
        try:
            ymd = text.lower().split('-')
            assert len(ymd) == 3
            year = -1 if ymd[0] in ('xx', 'xxxx') else int(ymd[0])
            month = -1 if ymd[1] == 'xx' else int(ymd[1])
            day = -1 if ymd[2] == 'xx' else int(ymd[2])
            assert not (year == month == day == -1)
            assert month == -1 or 1 <= month <= 12
            assert day == -1 or 1 <= day <= 31
            return (year, month, day)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit still propagate; AssertionError/ValueError from
            # the validation above are still swallowed as "not a date".
            return None
403
+
404
+
405
+ # Value Instantiation
406
def to_value(original_string, corenlp_value=None):
    """Wrap a raw string in the most specific Value subtype.

    Tries number first, then date (a year-only date degrades to a
    number), falling back to a plain string value.

    Args:
        original_string (basestring): Original string, kept for display.
        corenlp_value (basestring): Optional value returned from CoreNLP;
            when given, it is the string actually parsed.
    Returns:
        Value
    """
    if isinstance(original_string, Value):
        return original_string  # already wrapped
    parse_target = corenlp_value or original_string
    number = NumberValue.parse(parse_target)
    if number is not None:
        return NumberValue(number, original_string)
    date_parts = DateValue.parse(parse_target)
    if date_parts is not None:
        year, month, day = date_parts
        if month == day == -1:
            # Only the year is known: treat it as a plain number.
            return NumberValue(year, original_string)
        return DateValue(year, month, day, original_string)
    return StringValue(original_string)
433
+
434
+
435
def to_value_list(original_strings, corenlp_values=None):
    """Convert parallel lists of strings into a deduplicated list of Values.

    Args:
        original_strings (list[basestring])
        corenlp_values (list[basestring or None]): optional, same length.
    Returns:
        list[Value]
    """
    assert isinstance(original_strings, (list, tuple, set))
    if corenlp_values is None:
        return list({to_value(s) for s in original_strings})
    assert isinstance(corenlp_values, (list, tuple, set))
    assert len(original_strings) == len(corenlp_values)
    pairs = zip(original_strings, corenlp_values)
    return list({to_value(orig, cv) for orig, cv in pairs})
453
+
454
+
455
+ # Check the Predicted Denotations
456
+ def check_denotation(target_values, predicted_values):
457
+ """Return True if the predicted denotation is correct.
458
+
459
+ Args:
460
+ target_values (list[Value])
461
+ predicted_values (list[Value])
462
+ Returns:
463
+ bool
464
+ """
465
+ # Check size
466
+ if len(target_values) != len(predicted_values):
467
+ return False
468
+ # Check items
469
+ for target in target_values:
470
+ if not any(target.match(pred) for pred in predicted_values):
471
+ return False
472
+ return True
473
+
474
+
475
+ # Batch Mode
476
+ def tsv_unescape(x):
477
+ """Unescape strings in the TSV file.
478
+ Escaped characters include:
479
+ newline (0x10) -> backslash + n
480
+ vertical bar (0x7C) -> backslash + p
481
+ backslash (0x5C) -> backslash + backslash
482
+
483
+ Args:
484
+ x (str or unicode)
485
+ Returns:
486
+ a unicode
487
+ """
488
+ return x.replace(r'\n', '\n').replace(r'\p', '|').replace('\\\\', '\\')
489
+
490
+
491
def tsv_unescape_list(x):
    """Split a '|'-delimited (0x7C) TSV list field and unescape each item.

    Args:
        x (str or unicode)
    Returns:
        a list of unicodes
    """
    items = x.split('|')
    return [tsv_unescape(item) for item in items]
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/tempcompass.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+ from .multiple_choice import extract_answer_from_item
3
+ from PIL import Image, ImageOps
4
+ import numpy as np
5
+
6
# Generic fallback identity for the judge LLM.
sys_prompt = "You are an AI assistant for question answering."

# Judge instructions for grading multi-choice predictions.
system_prompt_multi_choice = (
    "You will receive a multi-choice question, the ground-truth answer and the prediction from a question answering (QA) model. "  # noqa
    "Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. "
    "If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)

# Judge instructions for grading caption-matching predictions.
system_prompt_caption_matching = (
    "You will receive a caption matching question, the ground-truth answer and the prediction from a question answering (QA) model. "  # noqa
    "Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. "
    "If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)

# Judge instructions for the captioning task: the judge answers the MC
# question from the generated description alone.
# NOTE(review): the typos below ('palying', 'backwark', 'coices') are part
# of the shipped runtime prompt and are deliberately left untouched.
system_prompt_captioning = """
You will receive a video description and a multi-choice question. Your task is to choose the correct answer and briefly explain the reason why you choose the answer. \
If none of the choice candidates are correct or the video description lacks enough information to answer the question, just answer "None of the choices are correct". \
Please organize your response in this format:
```
Reasoning: [Your reason to obtain the answer]
Answer: [Your answer]
```

Here are some examples of video description, multi-choice question and the expected answer:
```
Video Description: A person is palying football.
Multi-Choice Question:
What is the person doing in the video?
A. cooking
B. palying football
C. playing basketball
D. reading book
Reasoning: The video description mentions that the person is playing football.
Answer: B. palying football

Video Description: A bird is flying clockwise.
Multi-Choice Question:
In which direction is the bird flying?
A. backwark
B. counter-clockwise
C. clockwise
D. downward
Reasoning: The video description mentions that the bird is flying clockwise
Answer: C. clockwise

Video Description: An air balloon is inflating.
Multi-Choice Question:
What is happening to the air balloon?
A. exploding
B. getting smaller
C. flying
Reasoning: The video description mentions that the air balloon is inflating, while none of the coices can be explained as inflating.
Answer: None of the choices are correct
```
"""  # noqa

# Judge instructions for grading Yes/No predictions.
system_prompt_YorN = """
You will receive a Yes/No question, the ground-truth answer and the prediction from a question answering (QA) model. \
Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. \
If the prediction is correct, respond "Correct". If the prediction is incorrect, respond "Incorrect".
"""  # noqa
67
+
68
+
69
def eval_rule_caption_matching(line):
    """Rule-based grading for caption-matching samples.

    Matches the raw model output against the full option strings, the
    option sentences, and the option indices — in that order, so a later
    match deliberately overwrites an earlier one — then compares the
    match with the ground-truth answer in several formats.

    Returns 1/0 on success, or the string "fail" when the output cannot
    be matched to any option (the caller then falls back to an LLM judge).
    """
    # Determine whether the video llm output is correct, based on word matching rules
    video_llm_output = line['prediction']
    answer = line['answer']
    # NOTE(review): 'candidates' is eval()'d — assumed to be a trusted,
    # dataset-produced list literal; verify upstream if the TSV source changes.
    option_strs = eval(line['candidates'])  # complete option strings
    option_sents = [opt.split(': ')[1] for opt in option_strs]  # option sentence
    # option index, e.g., Sentence A, Caption A, Option 1
    option_inds = [opt.split(': ')[0] for opt in option_strs] + [opt.split(': ')[0].replace('Sentence ', '').replace('Option ', '').replace('Caption ', '') for opt in option_strs]  # noqa
    video_llm_pred = None
    for option_str in option_strs:
        if option_str == video_llm_output:
            video_llm_pred = option_str
    for option_sent in option_sents:
        if option_sent == video_llm_output or (') ' in video_llm_output and option_sent == video_llm_output.split(') ')[1]):  # noqa
            video_llm_pred = option_sent
    for option_ind in option_inds:
        if option_ind == video_llm_output or option_ind == video_llm_output.replace('.', ''):  # noqa
            video_llm_pred = option_ind

    if video_llm_pred is None:
        return "fail"
    else:
        return 1 if video_llm_pred == answer or video_llm_pred == answer.split(":")[0] or video_llm_pred == answer.split(": ")[1] or video_llm_pred == answer.split(": ")[0].split()[1] else 0  # noqa
92
+
93
+
94
def eval_rule_multi_choice(line):
    """Rule-based grading for multi-choice samples.

    Returns 1/0 when the prediction looks like an option (exact answer,
    bare letter, 'X.' or 'X)' prefix); otherwise the string "fail" so the
    caller can defer to an LLM judge.
    """
    pred, gold = line['prediction'], line['answer']
    if pred == gold:
        return 1
    if pred in ('A', 'B', 'C', 'D'):
        return 1 if pred == gold[0] else 0
    # 'B. something' / 'B) something' forms: compare the leading letter.
    for sep in ('.', ')'):
        if any(pred.startswith(letter + sep) for letter in 'ABCD'):
            return 1 if pred.split(sep)[0] == gold[0] else 0
    return "fail"
105
+
106
+
107
def eval_rule_YorN(video_llm_output):
    """Extract a yes/no verdict from the raw model output.

    Returns 'yes' / 'no' when the (lower-cased) output starts with one of
    them, otherwise False so the caller can fall back to an LLM judge.
    """
    text = video_llm_output.lower()
    for verdict in ('yes', 'no'):
        if text.startswith(verdict):
            return verdict
    return False
116
+
117
+
118
def llm_output_to_rating(llm_output):
    """Map a judge reply to a rating: 1 for Correct, 0 for Incorrect.

    Replies that mention neither word are logged and scored 0.
    """
    has_correct = 'Correct' in llm_output
    has_incorrect = 'Incorrect' in llm_output
    if not (has_correct or has_incorrect):
        print(f"Warning: LLM output is not in the correct format: {llm_output}")
        return 0
    # A leading verdict word wins outright.
    if llm_output.startswith('Correct'):
        return 1
    if llm_output.startswith('Incorrect'):
        return 0
    # Otherwise 'Incorrect' anywhere dominates an embedded 'Correct'.
    if has_correct and not has_incorrect:
        return 1
    if has_incorrect:
        return 0
132
+
133
+
134
def parse_llm_output(llm_output, gt_answer):
    """Parse a judge reply shaped like 'Reasoning: ...\\nAnswer: ...'.

    Returns a dict with 'chatgpt-answer', 'chatgpt-reasoning' and a
    'rating' of 1/0 against *gt_answer*, or rating -1 when the judge
    request itself failed.
    """
    if llm_output == "invalid_request_error" or not llm_output:
        return {"rating": -1, "chatgpt-answer": None, "chatgpt-reasoning": None}

    parsed = {}
    # Later matching lines overwrite earlier ones, mirroring a judge that
    # restates its answer at the end.
    for raw_line in llm_output.split("\n"):
        stripped = raw_line.strip()
        if "Reasoning" in stripped:
            parsed['chatgpt-reasoning'] = stripped.replace("Reasoning:", "").strip()
        if "Answer" in stripped:
            parsed['chatgpt-answer'] = stripped.replace("Answer:", "").strip()

    parsed.setdefault('chatgpt-answer', llm_output)
    parsed.setdefault('chatgpt-reasoning', None)

    # Correct only when exactly one option tag appears and the option
    # letter agrees with the ground truth.
    option_mentions = sum(parsed['chatgpt-answer'].count(tag) for tag in ('A.', 'B.', 'C.', 'D.'))
    same_option = parsed['chatgpt-answer'].split(". ")[0] == gt_answer.split(". ")[0]
    parsed['rating'] = 1 if (same_option and option_mentions == 1) else 0
    return parsed
162
+
163
+
164
def evaluate_tempcompass_mcq(model, line):
    """Grade a caption-matching / multi-choice sample.

    Word-matching rules run first; when they fail, the judge *model* is
    consulted (or the sample scores 0 when no judge is available).
    """
    rule_fns = {
        'caption_matching': eval_rule_caption_matching,
        'multi-choice': eval_rule_multi_choice,
    }
    judge_prompts = {
        'multi-choice': '{}\nMulti-Choice Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}',
        'caption_matching': '{}\nCaption Matching Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}',
    }
    judge_sys = {
        'multi-choice': system_prompt_multi_choice,
        'caption_matching': system_prompt_caption_matching,
    }
    task = line['task_type']
    eval_result = {
        "question": line['question'],
        "answer": line['answer'],
        "prediction": line['prediction'],
        "task_type": task,
        "candidates": line['candidates'],
        "match_success": True,
    }
    verdict = rule_fns[task](line)
    if verdict != "fail":
        eval_result['rating'] = verdict
        return eval_result
    # Rule matching failed: defer to the judge model when present.
    eval_result['match_success'] = False
    if model is None:
        eval_result['rating'] = 0
        return eval_result
    prompt = judge_prompts[task].format(judge_sys[task], line['question'], line['answer'], line['prediction'])  # noqa
    reply = model.generate(prompt)
    eval_result['chatgpt-response'] = reply
    eval_result['rating'] = llm_output_to_rating(reply)
    return eval_result
201
+
202
+
203
def evaluate_tempcompass_captioning(model, line):
    """Judge a generated caption by asking *model* to answer the paired
    multi-choice question from the caption alone.

    Raises ValueError when no judge model is supplied — this task has no
    exact-matching fallback.
    """
    prompt = (
        f"{system_prompt_captioning}\n"
        f"Video Description:{line['prediction']}\n"
        f"Multi-Choice Question:\n{line['mc_question']}\n"
    )
    if model is None:
        raise ValueError("Model is None, TempCompass Captioning task not supported exact matching")  # noqa
    reply = model.generate(prompt)
    return parse_llm_output(reply, gt_answer=line['mc_answer'])
215
+
216
+
217
def evaluate_tempcompass_YorN(model, line):
    """Grade a yes/no sample: prefix rule first, then the judge *model*
    (rating 0 when the rule fails and no judge is available)."""
    prompt = (
        f"{system_prompt_YorN}\n"
        f"Yes/No Question:\n{line['question']}\n"
        f"Ground-Truth Answer: {line['answer']}\n"
        f"Model Prediction: {line['prediction']}"
    )
    eval_result = {
        "question": line['question'],
        "answer": line['answer'],
        "prediction": line['prediction'],
        "match_success": True,
    }
    verdict = eval_rule_YorN(line['prediction'])
    if verdict:
        eval_result['rating'] = int(verdict == line['answer'])
    elif model is None:
        eval_result['match_success'] = False
        eval_result['rating'] = 0
    else:
        eval_result['match_success'] = False
        reply = model.generate(prompt)
        eval_result['chatgpt-response'] = reply
        eval_result['rating'] = llm_output_to_rating(reply)
    return eval_result
243
+
244
+
245
def get_dimension_rating(score_file):
    """Aggregate scores from *score_file* into per-(dim, task_type) pairs.

    Returns {'<dim>. <task_type>': [score_sum, sample_count], ...}.
    """
    data = load(score_file)
    result_dict = {}
    for _, row in data.iterrows():
        key = row['dim'] + '. ' + row['task_type']
        bucket = result_dict.setdefault(key, [0, 0])
        bucket[0] += int(row['score'])
        bucket[1] += 1
    return result_dict
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+ from .multiple_choice import extract_answer_from_item
3
+ import numpy as np
4
+ import re
5
+
6
# Sentinel message stored when the judge API fails to return an answer.
FAIL_MSG = 'Failed to obtain answer via API.'

# Video length buckets used by Video-MME.
DURATIONS = [
    'short',
    'medium',
    'long',
]

# Top-level video domains in Video-MME.
DOMAINS = [
    'Knowledge',
    'Film & Television',
    'Sports Competition',
    'Artistic Performance',
    'Life Record',
    'Multilingual'
]

# Fine-grained sub-categories nested under the domains above.
SUB_CATEGORIES = [
    'Humanity & History',
    'Literature & Art',
    'Biology & Medicine',
    'Finance & Commerce',
    'Astronomy',
    'Geography',
    'Law',
    'Life Tip',
    'Technology',
    'Animation',
    'Movie & TV Show',
    'Documentary',
    'News Report',
    'Esports',
    'Basketball',
    'Football',
    'Athletics',
    'Other Sports',
    'Stage Play',
    'Magic Show',
    'Variety Show',
    'Acrobatics',
    'Handicraft',
    'Food',
    'Fashion',
    'Daily Life',
    'Travel',
    'Pet & Animal',
    'Exercise',
    'Multilingual'
]

# Question task types evaluated by Video-MME.
TASK_CATEGORIES = [
    'Temporal Perception',
    'Spatial Perception',
    'Attribute Perception',
    'Action Recognition',
    'Object Recognition',
    'OCR Problems',
    'Counting Problem',
    'Temporal Reasoning',
    'Spatial Reasoning',
    'Action Reasoning',
    'Object Reasoning',
    'Information Synopsis',
]
70
+
71
+
72
def get_dimension_rating(data_path):
    """Compute Video-MME accuracy tables split by duration.

    For every duration bucket (plus an 'overall' bucket) the per-item
    scores are grouped by domain, sub-category and task type, then
    averaged over items with a non-negative score and formatted to three
    decimals.
    """
    data = load(data_path)

    buckets = {
        dur: {
            'overall': '',
            'domain': {k: [] for k in DOMAINS},
            'sub_category': {k: [] for k in SUB_CATEGORIES},
            'task_type': {k: [] for k in TASK_CATEGORIES},
        }
        for dur in DURATIONS + ['overall']
    }

    for _, row in data.iterrows():
        score = row['score']
        # Every item counts toward its own duration and toward 'overall'.
        for dur in (row['duration'], 'overall'):
            buckets[dur]['domain'][row['domain']].append(score)
            buckets[dur]['sub_category'][row['sub_category']].append(score)
            buckets[dur]['task_type'][row['task_type']].append(score)

    def _mean_str(scores):
        # Items with a negative score (judge failures) are excluded.
        return f'{np.mean([x for x in scores if x >= 0]):.3f}'

    for dur in DURATIONS + ['overall']:
        # The per-duration overall mean is taken over all domain lists
        # (computed before the lists are replaced by formatted strings).
        buckets[dur]['overall'] = _mean_str(sum(buckets[dur]['domain'].values(), []))
        for dom in DOMAINS:
            buckets[dur]['domain'][dom] = _mean_str(buckets[dur]['domain'][dom])
        for sub in SUB_CATEGORIES:
            buckets[dur]['sub_category'][sub] = _mean_str(buckets[dur]['sub_category'][sub])
        for task in TASK_CATEGORIES:
            buckets[dur]['task_type'][task] = _mean_str(buckets[dur]['task_type'][task])

    return buckets
117
+
118
+
119
def extract_option(model, input_item, dataset_name):
    """Populate per-letter option fields from the question text and
    delegate extraction to `extract_answer_from_item`.

    Option lines are expected after the first line of `question`, each
    containing 'A.', 'B.', ... markers.
    """
    option_lines = input_item['question'].split('\n')[1:]
    for idx, text in enumerate(option_lines):
        letter = chr(ord('A') + idx)
        marker = letter + '.'
        pos = text.find(marker)
        if pos >= 0:
            input_item[letter] = text[pos + len(marker):].strip('. \n')
    return extract_answer_from_item(model, input_item, dataset_name)['opt']
126
+
127
+
128
def extract_characters_regex(s):
    """Extract a single choice letter (A-D) from a model response.

    Known answer prefixes are stripped first, so e.g. 'The best answer is
    B.' yields 'B'. Returns '' when the response is long free-form text
    without any of the letters, or when no A-D letter is present at all.
    """
    s = s.strip()
    # FIX: the original list was missing commas after 'The best option is'
    # and 'Best answer:', which silently fused adjacent string literals
    # into one and prevented those prefixes from ever being stripped.
    answer_prefixes = [
        'The best answer is',
        'The correct answer is',
        'The answer is',
        'The answer',
        'The best option is',
        'The correct option is',
        'Best answer:',
        'Best option:',
        'Answer:',
        'Option:',
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, '')

    # Long free-form responses without any choice letter are unanswerable.
    if len(s.split()) > 10 and not re.search('[ABCD]', s):
        return ''
    matches = re.search(r'[ABCD]', s)
    if matches is None:
        return ''
    return matches[0]
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/vqa_eval.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ # Partly adopted from https://github.com/GT-Vision-Lab/VQA
3
+ # Copyright (c) 2014, Aishwarya Agrawal
4
+
5
+ from ...smp import *
6
+ from typing import Optional
7
+
8
+
9
+ def _process_digit_article(inText):
10
+ outText = []
11
+ tempText = inText.lower().split()
12
+ articles = ['a', 'an', 'the']
13
+ manualMap = {
14
+ 'none': '0',
15
+ 'zero': '0',
16
+ 'one': '1',
17
+ 'two': '2',
18
+ 'three': '3',
19
+ 'four': '4',
20
+ 'five': '5',
21
+ 'six': '6',
22
+ 'seven': '7',
23
+ 'eight': '8',
24
+ 'nine': '9',
25
+ 'ten': '10',
26
+ }
27
+ contractions = {
28
+ 'aint': "ain't",
29
+ 'arent': "aren't",
30
+ 'cant': "can't",
31
+ 'couldve': "could've",
32
+ 'couldnt': "couldn't",
33
+ "couldn'tve": "couldn't've",
34
+ "couldnt've": "couldn't've",
35
+ 'didnt': "didn't",
36
+ 'doesnt': "doesn't",
37
+ 'dont': "don't",
38
+ 'hadnt': "hadn't",
39
+ "hadnt've": "hadn't've",
40
+ "hadn'tve": "hadn't've",
41
+ 'hasnt': "hasn't",
42
+ 'havent': "haven't",
43
+ 'hed': "he'd",
44
+ "hed've": "he'd've",
45
+ "he'dve": "he'd've",
46
+ 'hes': "he's",
47
+ 'howd': "how'd",
48
+ 'howll': "how'll",
49
+ 'hows': "how's",
50
+ "Id've": "I'd've",
51
+ "I'dve": "I'd've",
52
+ 'Im': "I'm",
53
+ 'Ive': "I've",
54
+ 'isnt': "isn't",
55
+ 'itd': "it'd",
56
+ "itd've": "it'd've",
57
+ "it'dve": "it'd've",
58
+ 'itll': "it'll",
59
+ "let's": "let's",
60
+ 'maam': "ma'am",
61
+ 'mightnt': "mightn't",
62
+ "mightnt've": "mightn't've",
63
+ "mightn'tve": "mightn't've",
64
+ 'mightve': "might've",
65
+ 'mustnt': "mustn't",
66
+ 'mustve': "must've",
67
+ 'neednt': "needn't",
68
+ 'notve': "not've",
69
+ 'oclock': "o'clock",
70
+ 'oughtnt': "oughtn't",
71
+ "ow's'at": "'ow's'at",
72
+ "'ows'at": "'ow's'at",
73
+ "'ow'sat": "'ow's'at",
74
+ 'shant': "shan't",
75
+ "shed've": "she'd've",
76
+ "she'dve": "she'd've",
77
+ "she's": "she's",
78
+ 'shouldve': "should've",
79
+ 'shouldnt': "shouldn't",
80
+ "shouldnt've": "shouldn't've",
81
+ "shouldn'tve": "shouldn't've",
82
+ "somebody'd": 'somebodyd',
83
+ "somebodyd've": "somebody'd've",
84
+ "somebody'dve": "somebody'd've",
85
+ 'somebodyll': "somebody'll",
86
+ 'somebodys': "somebody's",
87
+ 'someoned': "someone'd",
88
+ "someoned've": "someone'd've",
89
+ "someone'dve": "someone'd've",
90
+ 'someonell': "someone'll",
91
+ 'someones': "someone's",
92
+ 'somethingd': "something'd",
93
+ "somethingd've": "something'd've",
94
+ "something'dve": "something'd've",
95
+ 'somethingll': "something'll",
96
+ 'thats': "that's",
97
+ 'thered': "there'd",
98
+ "thered've": "there'd've",
99
+ "there'dve": "there'd've",
100
+ 'therere': "there're",
101
+ 'theres': "there's",
102
+ 'theyd': "they'd",
103
+ "theyd've": "they'd've",
104
+ "they'dve": "they'd've",
105
+ 'theyll': "they'll",
106
+ 'theyre': "they're",
107
+ 'theyve': "they've",
108
+ 'twas': "'twas",
109
+ 'wasnt': "wasn't",
110
+ "wed've": "we'd've",
111
+ "we'dve": "we'd've",
112
+ 'weve': "we've",
113
+ 'werent': "weren't",
114
+ 'whatll': "what'll",
115
+ 'whatre': "what're",
116
+ 'whats': "what's",
117
+ 'whatve': "what've",
118
+ 'whens': "when's",
119
+ 'whered': "where'd",
120
+ 'wheres': "where's",
121
+ 'whereve': "where've",
122
+ 'whod': "who'd",
123
+ "whod've": "who'd've",
124
+ "who'dve": "who'd've",
125
+ 'wholl': "who'll",
126
+ 'whos': "who's",
127
+ 'whove': "who've",
128
+ 'whyll': "why'll",
129
+ 'whyre': "why're",
130
+ 'whys': "why's",
131
+ 'wont': "won't",
132
+ 'wouldve': "would've",
133
+ 'wouldnt': "wouldn't",
134
+ "wouldnt've": "wouldn't've",
135
+ "wouldn'tve": "wouldn't've",
136
+ 'yall': "y'all",
137
+ "yall'll": "y'all'll",
138
+ "y'allll": "y'all'll",
139
+ "yall'd've": "y'all'd've",
140
+ "y'alld've": "y'all'd've",
141
+ "y'all'dve": "y'all'd've",
142
+ 'youd': "you'd",
143
+ "youd've": "you'd've",
144
+ "you'dve": "you'd've",
145
+ 'youll': "you'll",
146
+ 'youre': "you're",
147
+ 'youve': "you've",
148
+ }
149
+ for word in tempText:
150
+ word = manualMap.setdefault(word, word)
151
+ if word not in articles:
152
+ outText.append(word)
153
+ for wordId, word in enumerate(outText):
154
+ if word in contractions:
155
+ outText[wordId] = contractions[word]
156
+ outText = ' '.join(outText)
157
+ return outText
158
+
159
+
160
def hit_calculate(result, dataset_name, anls_threshold=0.5):
    """Turn per-item match records into per-item hit scores.

    The aggregation rule depends on the benchmark: mean match for
    TextVQA-style vqa_score, thresholded ANLS similarity for
    DocVQA/InfoVQA, and the best match over references for ChartQA/OCRVQA.
    """
    if listinstr(['TextVQA'], dataset_name):
        return [np.mean(item['match']) for item in result]
    if listinstr(['DocVQA', 'InfoVQA'], dataset_name):
        # ANLS: x['match'] stores edit distances, so similarity is 1 - min;
        # values below the threshold are zeroed out.
        scores = []
        for item in result:
            similarity = 1 - np.min(item['match'])
            scores.append(similarity if similarity >= anls_threshold else 0.0)
        return scores
    if listinstr(['ChartQA', 'OCRVQA'], dataset_name):
        return [np.max(item['match']) for item in result]
    # Default: vqa_score (mean over references).
    return [np.mean(item['match']) for item in result]
169
+
170
+
171
# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
def relaxed_correctness(target: str,
                        prediction: str,
                        max_relative_change: float = 0.05) -> bool:
    """Relaxed-accuracy comparison used by ChartQA (pix2struct metric).

    Numeric answers (including percentage strings such as '12%') match
    when the relative error is within `max_relative_change`; everything
    else falls back to a case-insensitive exact string match.
    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1.

    Args:
        target: Target string.
        prediction: Predicted string.
        max_relative_change: Maximum tolerated relative change.

    Returns:
        Whether the prediction is correct under the relaxed criterion.
    """

    def _as_number(text: str):
        try:
            if text.endswith('%'):
                # Scale percentage strings to the unit interval.
                return float(text.rstrip('%')) / 100.0
            return float(text)
        except ValueError:
            return None

    prediction = str(prediction)
    target = str(target)
    pred_num = _as_number(prediction)
    gt_num = _as_number(target)
    # A target of exactly 0 is falsy and is compared as a string, which
    # also sidesteps the division by zero below.
    if pred_num is None or not gt_num:
        return prediction.lower() == target.lower()
    return abs(pred_num - gt_num) / abs(gt_num) <= max_relative_change
212
+
213
+
214
def levenshtein_distance(s1, s2):
    """Compute the Levenshtein (edit) distance between two strings.

    Classic single-row dynamic program using O(min(len)) memory.
    """
    # Keep the shorter string along the DP row.
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    previous = list(range(len(s1) + 1))
    for row_idx, ch2 in enumerate(s2):
        current = [row_idx + 1]
        for col_idx, ch1 in enumerate(s1):
            if ch1 == ch2:
                current.append(previous[col_idx])
            else:
                current.append(1 + min(previous[col_idx],
                                       previous[col_idx + 1],
                                       current[-1]))
        previous = current
    return previous[-1]
228
+
229
+
230
def anls_compute(groundtruth, prediction):
    """Return the normalized Levenshtein distance (lower is better).

    Both strings are lower-cased and whitespace-normalized before the
    edit distance is taken; the result is divided by the longer original
    string length (0.0 when both strings are empty).
    """
    gt_norm = ' '.join(groundtruth.strip().lower().split())
    pred_norm = ' '.join(prediction.strip().lower().split())
    edit_dist = levenshtein_distance(gt_norm, pred_norm)
    denom = max(len(groundtruth.upper()), len(prediction.upper()))
    # Guard against two empty strings.
    return 0.0 if denom == 0 else float(edit_dist) / float(denom)
237
+
238
+
239
def process_answer(answer):
    """Canonicalize a VQA answer: collapse newlines/tabs to spaces, strip
    punctuation, map number words to digits and drop articles."""
    cleaned = answer.replace('\n', ' ').replace('\t', ' ').strip()
    cleaned = process_punctuation(cleaned)
    return _process_digit_article(cleaned)
246
+
247
+
248
def process_line(line, method='vqa_score'):
    """Score one VQA record against its reference answers.

    Args:
        line: Record with 'answer' (a single answer, or a stringified
            Python list of reference answers) and 'prediction' fields.
        method: 'vqa_score', 'anls', 'relaxed_accuracy' or 'accuracy';
            any other value falls back to exact match on processed answers.

    Returns:
        dict with 'gt' (references), 'pred' (prediction) and 'match'
        (per-reference scores whose meaning depends on `method`).
    """
    ret = {}
    # 'answer' may arrive as a stringified list of reference answers.
    if istype(line['answer'], list):
        answers = eval(line['answer'])
    else:
        answers = [line['answer']]
    if method == 'vqa_score':
        ret['gt'] = [process_answer(x) for x in answers]
        ret['pred'] = process_answer(line['prediction'])
        ret['match'] = []
        # Standard VQA accuracy: for each reference, count how many of the
        # OTHER references agree with the prediction; 3+ agreements = 1.0.
        for current_idx, gtAnsDatum in enumerate(ret['gt']):
            otherGTAns = [
                item for ret_gt_idx, item in enumerate(ret['gt'])
                if ret_gt_idx != current_idx
            ]
            matchingAns = [
                item for item in otherGTAns if item == ret['pred']
            ]
            acc = min(1, float(len(matchingAns)) / 3)
            ret['match'].append(acc)
    elif method == 'anls':
        # ANLS keeps raw normalized edit distances (lower is better).
        ret['gt'] = answers
        ret['pred'] = line['prediction']
        ret['match'] = [anls_compute(x, ret['pred']) for x in ret['gt']]
    elif method == 'relaxed_accuracy':
        ret['gt'] = answers
        ret['pred'] = line['prediction'].strip()
        ret['match'] = [relaxed_correctness(ret['pred'], x) for x in ret['gt']]
    elif method == 'accuracy':
        # Case-insensitive exact match against each reference.
        ret['gt'] = answers
        ret['pred'] = line['prediction'].strip()
        ret['match'] = [(1.0 if (x.strip().lower() == ret['pred'].strip().lower()) else 0.0) for x in ret['gt']]
    else: # default using vqa_score to calculate score
        ret['gt'] = [process_answer(x) for x in answers]
        ret['pred'] = process_answer(line['prediction'])
        ret['match'] = [x == ret['pred'] for x in ret['gt']]

    return ret
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/wemath.py ADDED
@@ -0,0 +1,896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pylint: skip-file
2
+
3
+ import pandas as pd
4
+ import json
5
+ import numpy as np
6
+ import os
7
+ import argparse
8
+
9
+ # four_dimensional_metrics.py
10
+
11
+
12
# Function to evaluate steps
def evaluate_evaluate_steps(json, steps):  # noqa
    """Stack the per-step (joker, knowledge concept) column pairs of a
    merged WeMath frame into one long frame with unified column names."""
    step_frames = []
    for step in range(1, steps + 1):
        frame = json[[f'joker_{step}', f'knowledge concept_{step}']].rename(
            columns={
                f'joker_{step}': 'joker',
                f'knowledge concept_{step}': 'knowledge_concept',
            }
        )
        step_frames.append(frame)
    return pd.concat(step_frames, axis=0)
22
+
23
+
24
# Function to load and process JSON data
def load_and_process_data(filepath):
    """Load a WeMath result sheet and derive the per-item 'joker' hit flag.

    When the sheet has no precomputed 'hit' column, the predicted option
    letter is parsed from the free-form 'prediction' text (text after the
    last 'Answer', cleaned of '>><<:.' characters) and compared to
    'answer'. Otherwise the existing 'hit' column is trusted.
    """
    df = pd.read_excel(filepath)
    if 'hit' not in df.columns:
        # Extract the predicted option letter from the raw model output.
        df['processed_answer'] = (
            df['prediction']
            .str.split('Answer')
            .str[-1]
            .str.strip()
            .str.replace(r'[>><<:.]', '', regex=True)
            .str.strip()
        )
        # Keep only a leading A-H letter; anything else counts as a miss.
        df['processed_answer'] = df['processed_answer'].apply(lambda x: x[0] if x and x[0] in 'ABCDEFGH' else None)
        df['joker'] = df['processed_answer'] == df['answer']
    else:
        df['joker'] = df['hit'].astype(bool)
    return df
41
+
42
+
43
# Function to process steps data and merge results
def evaluate_process_steps_data(df, steps):
    """Pivot one WeMath split into a wide frame with one row per problem.

    Rows keyed '{steps}steps_1' .. '{steps}steps_{steps}' (single-step
    sub-questions) and '{steps}steps_multi' (the multi-step question) are
    merged on their shared ID, with every column suffixed by its step id
    ('_1', '_2', ..., '_multi').
    """
    steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)}
    steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi']
    # Suffix every column with its step id so the merge keeps them apart.
    for key, data in steps_data.items():
        data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns]
    merged_data = steps_data[f'{steps}steps_1']
    for i in range(2, steps + 1):
        merged_data = pd.merge(
            merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left' # noqa
        )
    merged_data = pd.merge(
        merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left' # noqa
    )
    return merged_data
58
+
59
+
60
# Function to calculate evaluation metrics
def evaluate_calculate_metrics(merged_2steps, merged_3steps):
    """Bucket every WeMath problem into the four diagnostic quadrants.

    For each multi-step problem, the single-step results (joker_i) and the
    multi-step result (joker_multi) decide the bucket:
      * rows_1: rote memorization -- sub-steps wrong, multi-step right
      * rows_2: inadequate generalization -- sub-steps right, multi wrong
      * rows_3: insufficient knowledge -- some sub-step wrong, multi wrong
      * rows_4: complete mastery -- sub-steps right, multi right
    The 'loose'/'strict' variants differ in whether ANY or ALL sub-steps
    must be right/wrong. The `== True`/`== False` comparisons are
    intentional element-wise pandas operations.
    """
    metrics = {}
    metrics['steps2_filtered_rows_1_loose'] = merged_2steps[
        ((merged_2steps['joker_1'] == False) & (merged_2steps['joker_2'] == False)) # noqa
        & (merged_2steps['joker_multi'] == True) # noqa
    ]
    metrics['steps2_filtered_rows_1_strict'] = merged_2steps[
        ((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa
        & (merged_2steps['joker_multi'] == True) # noqa
    ]
    metrics['steps2_filtered_rows_2'] = merged_2steps[
        ((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True)) # noqa
        & (merged_2steps['joker_multi'] == False) # noqa
    ]
    metrics['steps2_filtered_rows_3'] = merged_2steps[
        ((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa
        & (merged_2steps['joker_multi'] == False) # noqa
    ]
    metrics['steps2_filtered_rows_4_loose'] = merged_2steps[
        ((merged_2steps['joker_1'] == True) | (merged_2steps['joker_2'] == True))
        & (merged_2steps['joker_multi'] == True)
    ]
    metrics['steps2_filtered_rows_4_strict'] = merged_2steps[
        ((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True))
        & (merged_2steps['joker_multi'] == True)
    ]
    metrics['steps3_filtered_rows_1_loose'] = merged_3steps[
        (
            (merged_3steps['joker_1'] == False)
            & (merged_3steps['joker_2'] == False)
            & (merged_3steps['joker_3'] == False)
        )
        & (merged_3steps['joker_multi'] == True)
    ]
    metrics['steps3_filtered_rows_1_strict'] = merged_3steps[
        (
            (merged_3steps['joker_1'] == False)
            | (merged_3steps['joker_2'] == False)
            | (merged_3steps['joker_3'] == False)
        )
        & (merged_3steps['joker_multi'] == True)
    ]
    metrics['steps3_filtered_rows_2'] = merged_3steps[
        ((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True))
        & (merged_3steps['joker_multi'] == False)
    ]
    metrics['steps3_filtered_rows_3'] = merged_3steps[
        (
            (merged_3steps['joker_1'] == False)
            | (merged_3steps['joker_2'] == False)
            | (merged_3steps['joker_3'] == False)
        )
        & (merged_3steps['joker_multi'] == False)
    ]
    metrics['steps3_filtered_rows_4_loose'] = merged_3steps[
        ((merged_3steps['joker_1'] == True) | (merged_3steps['joker_2'] == True) | (merged_3steps['joker_3'] == True))
        & (merged_3steps['joker_multi'] == True)
    ]
    metrics['steps3_filtered_rows_4_strict'] = merged_3steps[
        ((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True))
        & (merged_3steps['joker_multi'] == True)
    ]
    # (debug leftover) metrics.to_csv("<local path>/gpt4o-0626.csv", index=False)
    return metrics
125
+
126
+
127
# Function to compute evaluation rates and final scores
def evaluate_compute_final_scores(metrics, total_count):
    """Aggregate the four-quadrant WeMath counts and percentage rates.

    Counts from the 2-step and 3-step splits are summed per category.
    Rote-memorization rates are relative to (complete mastery + rote
    memorization); the other rates are relative to `total_count`.
    """
    def _pair_total(name):
        # Sum the bucket sizes of the 2-step and 3-step splits.
        return len(metrics[f'steps2_{name}']) + len(metrics[f'steps3_{name}'])

    total_counts = {
        'InadequateGeneralization': _pair_total('filtered_rows_2'),
        'InsufficientKnowledge': _pair_total('filtered_rows_3'),
        'CompleteMastery_loose': _pair_total('filtered_rows_4_loose'),
        'CompleteMastery_strict': _pair_total('filtered_rows_4_strict'),
        'RoteMemorization_loose': _pair_total('filtered_rows_1_loose'),
        'RoteMemorization_strict': _pair_total('filtered_rows_1_strict'),
    }

    def _pct(numerator, denominator):
        return "{:.2%}".format(numerator / denominator)

    rote_loose_denom = total_counts['CompleteMastery_loose'] + total_counts['RoteMemorization_loose']
    rote_strict_denom = total_counts['CompleteMastery_strict'] + total_counts['RoteMemorization_strict']
    rates = {
        'InadequateGeneralization_rate': _pct(total_counts['InadequateGeneralization'], total_count),
        'InsufficientKnowledge_rate': _pct(total_counts['InsufficientKnowledge'], total_count),
        'CompleteMastery_loose_rate': _pct(total_counts['CompleteMastery_loose'], total_count),
        'CompleteMastery_strict_rate': _pct(total_counts['CompleteMastery_strict'], total_count),
        'RoteMemorization_loose_rate': _pct(total_counts['RoteMemorization_loose'], rote_loose_denom),
        'RoteMemorization_strict_rate': _pct(total_counts['RoteMemorization_strict'], rote_strict_denom),
    }
    return total_counts, rates
156
+
157
+
158
# Function to update main results DataFrame
def evaluate_update_main_results_df(main_results_df, total_counts, rates):
    """Append one row of WeMath headline scores to `main_results_df`.

    The final score starts from the 525 question groups and penalizes
    inadequate generalization (half weight), rote memorization and
    insufficient knowledge.

    Returns a new DataFrame; the input frame is not modified in place.
    """
    def _score(rote_key):
        # (525 - 0.5*IG - RM - IK) / 525, formatted as a percentage.
        penalized = (
            525
            - 0.5 * total_counts['InadequateGeneralization']
            - total_counts[rote_key]
            - total_counts['InsufficientKnowledge']
        )
        return "{:.2%}".format(penalized / 525)

    final_score_loose = _score('RoteMemorization_loose')
    final_score_strict = _score('RoteMemorization_strict')

    new_row = {
        # 'Model': model,
        'Score (Strict)': final_score_strict,
        'InsufficientKnowledge (Strict)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})",
        'InadequateGeneralization (Strict)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})",
        'CompleteMastery (Strict)': f"{rates['CompleteMastery_strict_rate']} ({total_counts['CompleteMastery_strict']})",
        'RoteMemorization (Strict)': f"{rates['RoteMemorization_strict_rate']} ({total_counts['RoteMemorization_strict']})",
        'Score (Loose)': final_score_loose,
        'InsufficientKnowledge (Loose)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})",
        'InadequateGeneralization (Loose)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})",
        'CompleteMastery (Loose)': f"{rates['CompleteMastery_loose_rate']} ({total_counts['CompleteMastery_loose']})",
        'RoteMemorization (Loose)': f"{rates['RoteMemorization_loose_rate']} ({total_counts['RoteMemorization_loose']})",
    }
    # FIX: `DataFrame._append` is a private pandas API (the public `append`
    # was removed in pandas 2.0); use the supported `pd.concat` instead.
    main_results_df = pd.concat([main_results_df, pd.DataFrame([new_row])], ignore_index=True)
    return main_results_df
195
+
196
+
197
# Main function to evaluate models
def wemath_evaluate_models(output_json, main_results_csv_path=None):
    """Compute the WeMath four-dimensional metrics for one result file.

    Args:
        output_json: Path to the model's result sheet (read via
            `load_and_process_data`, i.e. an Excel file).
        main_results_csv_path: Optional path; when given, the one-row
            summary table is also written there as CSV.

    Returns:
        The summary table as a dict (DataFrame.to_dict()).
    """

    main_results_df = pd.DataFrame(
        columns=[
            'Model',
            'Score (Strict)',
            'InsufficientKnowledge (Strict)',
            'InadequateGeneralization (Strict)',
            'CompleteMastery (Strict)',
            'RoteMemorization (Strict)',
            'Score (Loose)',
            'InsufficientKnowledge (Loose)',
            'InadequateGeneralization (Loose)',
            'CompleteMastery (Loose)',
            'RoteMemorization (Loose)',
        ]
    )

    # Split items into the 2-step and 3-step problem families, then pivot
    # each family into one wide row per problem.
    data = load_and_process_data(output_json)
    data_2steps = data[data['key'].str.contains('2steps')]
    data_3steps = data[data['key'].str.contains('3steps')]
    merged_2steps = evaluate_process_steps_data(data_2steps, 2)
    merged_3steps = evaluate_process_steps_data(data_3steps, 3)

    # 525 is the fixed number of WeMath question groups.
    metrics = evaluate_calculate_metrics(merged_2steps, merged_3steps)
    total_counts, rates = evaluate_compute_final_scores(metrics, total_count=525)

    main_results_df = evaluate_update_main_results_df(main_results_df, total_counts, rates)

    print(main_results_df.to_string(index=False))
    if main_results_csv_path is not None:
        main_results_df.to_csv(main_results_csv_path, index=False)
        print("Evaluation completed and results saved to CSV.")
    return main_results_df.to_dict()
233
+
234
+
235
### Accuracy.py
# Function to load knowledge structure nodes
def load_knowledge_structure_nodes(filepath):
    """Build the WeMath knowledge-structure node table.

    `filepath` is kept for interface compatibility but unused: the nodes
    come from the in-module `knowledge_structure_nodes` constant. Adds
    'final_key' (leaf concept) and 'root_2' (second-level category)
    helper columns derived from the underscore-joined 'full node' path.
    """
    # with open(filepath, "r") as file:
    #     nodes = json.load(file)
    nodes = knowledge_structure_nodes
    nodes = pd.DataFrame(nodes)
    nodes['final_key'] = nodes['full node'].str.split('_').str[-1]
    nodes['root_2'] = nodes['full node'].str.split('_').str[1]
    return nodes
245
+
246
+
247
# Function to evaluate steps
def accuracy_evaluate_steps(json, steps, nodes):
    """Stack per-step (joker, knowledge concept) pairs and annotate each
    row with its knowledge-structure node.

    For every step, the leaf concept is joined against `nodes` (on
    'final_key') to attach the full node path and 'root_2' category, then
    all steps are concatenated into one long frame.
    """
    jokers = [json[[f'joker_{i}', f'knowledge concept_{i}']] for i in range(1, steps + 1)]
    for i in range(steps):
        # Attach node metadata for this step's knowledge concept.
        jokers[i] = pd.merge(
            jokers[i],
            nodes[['final_key', 'full node', 'root_2']],
            left_on=f'knowledge concept_{i + 1}',
            right_on='final_key',
            how='left',
        )
        # Unify the per-step column names.
        jokers[i].rename(
            columns={f'joker_{i + 1}': 'joker', f'knowledge concept_{i + 1}': 'knowledge_concept'},
            inplace=True,
        )
    concatenated_steps = pd.concat(jokers, axis=0)
    return concatenated_steps
264
+
265
+
266
# Function to process steps data and merge results
def accuracy_process_steps_data(df, steps):
    """Pivot one WeMath split into a wide frame with one row per problem.

    Same layout as `evaluate_process_steps_data`: single-step rows
    ('{steps}steps_1' .. '{steps}steps_{steps}') and the multi-step row
    ('{steps}steps_multi') are merged on their shared ID, with every
    column suffixed by its step id.
    """
    steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)}
    steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi']
    # Suffix every column with its step id so the merge keeps them apart.
    for key, data in steps_data.items():
        data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns]
    merged_data = steps_data[f'{steps}steps_1']
    for i in range(2, steps + 1):
        merged_data = pd.merge(
            merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left'
        )
    merged_data = pd.merge(
        merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left'
    )
    return merged_data
281
+
282
+
283
# Function to update main results DataFrame
def accuracy_update_main_results_df(nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps):
    """Append one row of WeMath accuracy numbers to `main_results_df`.

    One/two/three-step accuracies come from the stacked single-step frame
    and the multi-step merges; per-category accuracies are obtained by
    joining single-step results onto the knowledge-structure nodes and
    averaging per second-level root ('root2').

    Note: `nodes` gains a 'final_rode' helper column as a side effect.
    Returns a new DataFrame; the input frame is not modified in place.
    """
    new_row = {
        # 'Model': model_name,
        'One-step(S1)': "{:.2%}".format(concatenated_data['joker'].mean()),
        'Two-step(S2)': "{:.2%}".format(merged_2steps['joker_multi'].mean()),
        'Three-step(S3)': "{:.2%}".format(merged_3steps['joker_multi'].mean()),
    }
    # Map each knowledge node to its mean single-step accuracy.
    nodes['final_rode'] = nodes['full node'].str.split('_').str[-1]
    per_concept = concatenated_data.groupby('final_key')['joker'].mean().reset_index()
    per_node = pd.merge(nodes, per_concept, left_on='final_rode', right_on='final_key', how='left')

    new_row.update(per_node.groupby('root2')['joker'].mean().apply(lambda x: "{:.2%}".format(x)).to_dict())
    # FIX: `DataFrame._append` is a private pandas API (the public `append`
    # was removed in pandas 2.0); use the supported `pd.concat` instead.
    main_results_df = pd.concat([main_results_df, pd.DataFrame([new_row])], ignore_index=True)

    return main_results_df
304
+
305
+
306
# Main function to evaluate models
def wemath_accuracy(output_json, main_results_csv_path=None):
    """Compute WeMath step-wise and per-category accuracies.

    Args:
        output_json: Path to the model's result sheet (read via
            `load_and_process_data`, i.e. an Excel file).
        main_results_csv_path: Optional path; when given, the one-row
            summary table is also written there as CSV.

    Returns:
        The summary table as a dict (DataFrame.to_dict()).
    """

    # Build the knowledge-structure node table from the module constant
    # (see `load_knowledge_structure_nodes` for the file-based variant).
    # nodes = load_knowledge_structure_nodes(knowledge_structure_nodes_path)
    nodes = knowledge_structure_nodes
    nodes = pd.DataFrame(nodes)
    nodes['final_key'] = nodes['full node'].str.split('_').str[-1]
    nodes['root_2'] = nodes['full node'].str.split('_').str[1]

    main_results_df = pd.DataFrame(
        columns=[
            'Model',
            'One-step(S1)',
            'Two-step(S2)',
            'Three-step(S3)',
            'Understanding and Conversion of Units',
            'Angles and Length',
            'Calculation of Plane Figures',
            'Understanding of Plane Figures',
            'Calculation of Solid Figures',
            'Understanding of Solid Figures',
            'Basic Transformations of Figures',
            'Cutting and Combining of Figures',
            'Direction',
            'Position',
            'Route Map',
            'Correspondence of Coordinates and Positions',
        ]
    )

    # Split items into the 2-step and 3-step problem families, then pivot
    # each family into one wide row per problem.
    data = load_and_process_data(output_json)
    data_2steps = data[data['key'].str.contains('2steps')]
    data_3steps = data[data['key'].str.contains('3steps')]
    merged_2steps = accuracy_process_steps_data(data_2steps, 2)
    merged_3steps = accuracy_process_steps_data(data_3steps, 3)

    # Stack all single-step results (annotated with their nodes) so that
    # per-concept accuracies can be aggregated.
    concatenated_data = pd.concat(
        [accuracy_evaluate_steps(merged_2steps, 2, nodes), accuracy_evaluate_steps(merged_3steps, 3, nodes)],
        axis=0,
    )
    main_results_df = accuracy_update_main_results_df(
        nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps
    )

    print(main_results_df.to_string(index=False))
    if main_results_csv_path is not None:
        main_results_df.to_csv(main_results_csv_path, index=False)
        print("Evaluation completed and results saved to CSV.")

    return main_results_df.to_dict()
357
+
358
+
359
# Knowledge-point taxonomy for the "Geometry and Figures" domain.
# Every node stores its path root0 > root1 > root2 > root3 (> root4), plus a
# "full node" key that joins levels 1..3 (and 4 when present) with underscores.


def _kp_node(root1, root2, root3, root4=None):
    """Build one taxonomy-node dict; root0 is always 'Geometry and Figures'."""
    levels = [root1, root2, root3]
    if root4 is not None:
        levels.append(root4)
    return {
        "root0": "Geometry and Figures",
        "root1": root1,
        "root2": root2,
        "root3": root3,
        "root4": root4,
        "full node": "_".join(levels),
    }


knowledge_structure_nodes = [
    # Measurement
    _kp_node("Measurement", "Understanding and Conversion of Units",
             "Conversion Rates and Calculations Between Area Units"),
    _kp_node("Measurement", "Understanding and Conversion of Units",
             "Conversion Rates and Calculations Between Volume Units (Including Liters and Milliliters)"),
    _kp_node("Measurement", "Understanding and Conversion of Units",
             "Conversion Rates and Calculations Between Length Units"),
    _kp_node("Measurement", "Angles and Length", "Understanding Angles (Using a Protractor)"),
    _kp_node("Measurement", "Angles and Length", "Understanding Length (Using a Ruler)"),
    # Solid figures: surface area / volume calculations
    _kp_node("Solid Figures", "Calculation of Solid Figures",
             "Calculation of Surface Area of Solid Figures", "Surface Area of Cylinders"),
    _kp_node("Solid Figures", "Calculation of Solid Figures",
             "Calculation of Surface Area of Solid Figures", "Surface Area of Rectangular Cuboids"),
    _kp_node("Solid Figures", "Calculation of Solid Figures",
             "Calculation of Surface Area of Solid Figures", "Surface Area of Cubes"),
    _kp_node("Solid Figures", "Calculation of Solid Figures",
             "Calculation of Volume of Solid Figures", "Volume and Capacity of Cylinders"),
    _kp_node("Solid Figures", "Calculation of Solid Figures",
             "Calculation of Volume of Solid Figures", "Volume and Capacity of Cones"),
    _kp_node("Solid Figures", "Calculation of Solid Figures",
             "Calculation of Volume of Solid Figures", "Volume and Capacity of Rectangular Cuboids"),
    _kp_node("Solid Figures", "Calculation of Solid Figures",
             "Calculation of Volume of Solid Figures", "Volume and Capacity of Cubes"),
    # Solid figures: understanding
    _kp_node("Solid Figures", "Understanding of Solid Figures",
             "Expanded View of Solids", "Expanded View of Cylinders"),
    _kp_node("Solid Figures", "Understanding of Solid Figures",
             "Expanded View of Solids", "Expanded View of Rectangular Cuboids"),
    _kp_node("Solid Figures", "Understanding of Solid Figures",
             "Expanded View of Solids", "Expanded View of Cubes"),
    _kp_node("Solid Figures", "Understanding of Solid Figures",
             "Cylinders and Cones", "Properties of Cylinders"),
    _kp_node("Solid Figures", "Understanding of Solid Figures",
             "Cylinders and Cones", "Properties of Cones"),
    _kp_node("Solid Figures", "Understanding of Solid Figures",
             "Rectangular Cuboids and Cubes", "Properties and Understanding of Rectangular Cuboids"),
    _kp_node("Solid Figures", "Understanding of Solid Figures",
             "Rectangular Cuboids and Cubes", "Properties and Understanding of Cubes"),
    _kp_node("Solid Figures", "Understanding of Solid Figures", "Observing Objects"),
    # Plane figures: calculations
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Sum of Interior Angles of Polygons", "Sum of Interior Angles of Other Polygons"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Sum of Interior Angles of Polygons", "Sum of Interior Angles of Triangles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures", "Calculation and Comparison of Angles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Areas", "Area of Parallelograms"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Areas", "Area of Triangles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Areas", "Area of Sectors"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Areas", "Area of Trapezoids"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Areas", "Area of Circles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Areas", "Area of Rectangles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Areas", "Area of Squares"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Perimeters", "Perimeter of Parallelograms"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Perimeters", "Perimeter of Triangles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Perimeters", "Perimeter of Trapezoids"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Perimeters", "Circumference of Circles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Perimeters", "Perimeter of Rectangles"),
    _kp_node("Plane Figures", "Calculation of Plane Figures",
             "Calculation of Perimeters", "Perimeter of Squares"),
    # Plane figures: understanding
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Polygons", "Properties and Understanding of Parallelograms"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Polygons", "Properties and Understanding of Triangles"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Polygons", "Properties and Understanding of Trapezoids"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Polygons", "Properties and Understanding of Rectangles"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Polygons", "Properties and Understanding of Squares"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Classification and Understanding of Angles", "Understanding Triangular Rulers"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Classification and Understanding of Angles", "Understanding and Representing Angles"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Properties and Understanding of Line Segments", "Distance Between Two Points"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Properties and Understanding of Line Segments", "Understanding Line Segments, Lines, and Rays"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Positional Relationships Between Line Segments", "perpendicularity"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Positional Relationships Between Line Segments", "Parallel"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Circles and Sectors", "Understanding Sectors"),
    _kp_node("Plane Figures", "Understanding of Plane Figures",
             "Circles and Sectors", "Understanding Circles"),
    _kp_node("Plane Figures", "Understanding of Plane Figures", "Observing Figures"),
    # Transformation and motion of figures
    _kp_node("Transformation and Motion of Figures", "Basic Transformations of Figures", "Axial Symmetry"),
    _kp_node("Transformation and Motion of Figures", "Basic Transformations of Figures", "Translation"),
    _kp_node("Transformation and Motion of Figures", "Basic Transformations of Figures", "Rotation"),
    _kp_node("Transformation and Motion of Figures", "Cutting and Combining of Figures",
             "Combining and Dividing Solids"),
    _kp_node("Transformation and Motion of Figures", "Cutting and Combining of Figures",
             "Combining Plane Figures", "Division of Plane Figures"),
    _kp_node("Transformation and Motion of Figures", "Cutting and Combining of Figures",
             "Combining Plane Figures", "Combining Plane Figures"),
    _kp_node("Transformation and Motion of Figures", "Cutting and Combining of Figures",
             "Combining Plane Figures", "Tessellation of Figures"),
    _kp_node("Transformation and Motion of Figures", "Cutting and Combining of Figures",
             "Combining Plane Figures", "Folding Problems of Figures"),
    # Position and direction
    _kp_node("Position and Direction", "Direction",
             "Southeast, Southwest, Northeast, Northwest Directions"),
    _kp_node("Position and Direction", "Direction",
             "Cardinal Directions (East, South, West, North)"),
    _kp_node("Position and Direction", "Route Map",
             "Determining the Positions of Objects Based on Direction, Angle, and Distance"),
    _kp_node("Position and Direction", "Route Map",
             "Describing Simple Routes Based on Direction and Distance"),
    _kp_node("Position and Direction", "Correspondence of Coordinates and Positions",
             "Representing Positions Using Ordered Pairs"),
    _kp_node("Position and Direction", "Correspondence of Coordinates and Positions",
             "Finding Positions Based on Ordered Pairs"),
    _kp_node("Position and Direction", "Position", "Front-Back Position"),
    _kp_node("Position and Direction", "Position", "Up-Down Position"),
    _kp_node("Position and Direction", "Position", "Left-Right Position"),
]
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ...smp import *
2
+
3
+
4
def AMBER_rating(data_file):
    """Aggregate AMBER scores into per-category accuracies (percentages).

    Fine-grained discriminative categories are merged into coarse buckets
    (Attribute / Existence / Relation); an 'Avg ACC' entry averages the
    per-category accuracies. Returns a one-row DataFrame via ``d2df``.
    """
    data = load(data_file)

    # Fine-grained category name -> coarse bucket; unknown names pass through.
    coarse_name = {
        'discriminative-attribute-state': 'Attribute',
        'discriminative-attribute-number': 'Attribute',
        'discriminative-attribute-action': 'Attribute',
        'discriminative-hallucination': 'Existence',
        'discriminative-relation': 'Relation',
        'relation': 'Relation'
    }

    # coarse category -> image_path -> list of per-question scores
    stats = defaultdict(dict)
    for _, row in data.iterrows():
        bucket = coarse_name.get(row['category'], row['category'])
        stats[bucket].setdefault(row['image_path'], []).append(row['score'])

    def category_acc(bucket):
        # Flatten all per-image score lists, then average.
        flat = [s for per_image in stats[bucket].values() for s in per_image]
        return np.mean(flat) * 100

    scores = {bucket: category_acc(bucket) for bucket in stats}
    # Average of per-category accuracies (computed before insertion).
    scores['Avg ACC'] = np.mean(list(scores.values()))
    return d2df(scores)
43
+
44
+
45
def MME_rating(data_file):
    """Compute MME category scores: acc + acc-plus (both questions of an
    image correct), plus 'perception' and 'reasoning' super-category sums.

    Returns a one-row DataFrame via ``d2df``.
    """
    data = load(data_file)

    # category -> image_path -> list of per-question scores
    stats = defaultdict(dict)
    for _, row in data.iterrows():
        stats[row['category']].setdefault(row['image_path'], []).append(row['score'])

    def acc(category, mode='normal'):
        values = []
        for per_image in stats[category].values():
            if mode == 'normal':
                # Plain per-question accuracy.
                values.extend(per_image)
            elif mode == 'plus':
                # 1 only when both questions of the image are correct.
                values.append(per_image[0] * per_image[1])
        return np.mean(values) * 100

    scores = {category: acc(category) + acc(category, 'plus') for category in stats}

    super_cates = dict(
        perception=[
            'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence',
            'landmark', 'position', 'posters', 'scene'
        ],
        reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation']
    )

    # Super-category totals first, then the individual category scores.
    ret = {sc: sum(scores[c] for c in members) for sc, members in super_cates.items()}
    ret.update(scores)
    return d2df(ret)
89
+
90
+
91
def Hallusion_rating(data_file):
    """Compute HallusionBench metrics overall and per (l2-)category.

    aAcc: per-answer accuracy; fAcc: all answers of a figure correct;
    qAcc: all answers of a question correct. Returns a DataFrame with one
    row per split.
    """
    def grouped_all_correct(sub, id_col):
        # Group scores by (l2-category, set, figure/question) and count the
        # fraction of groups where every answer is correct.
        groups = defaultdict(list)
        for _, row in sub.iterrows():
            groups[f"{row['l2-category']}_{row['set_id']}_{row[id_col]}"].append(row['score'])
        return np.mean([np.all(v) for v in groups.values()]) * 100

    data = load(data_file)
    # The index encodes ..._set_figure_question at fields 3..5.
    parts = [x.split('_') for x in data['index']]
    data['set_id'] = [p[3] for p in parts]
    data['figure_id'] = [p[4] for p in parts]
    data['question_id'] = [p[5] for p in parts]

    res = dict(split=[], aAcc=[], fAcc=[], qAcc=[])

    def append_split(name, sub):
        res['split'].append(name)
        res['aAcc'].append(np.mean(sub['score']) * 100)
        res['fAcc'].append(grouped_all_correct(sub, 'figure_id'))
        res['qAcc'].append(grouped_all_correct(sub, 'question_id'))

    append_split('Overall', data)
    for col in ('category', 'l2-category'):
        if col in data:
            for cate in list(set(data[col])):
                append_split(cate, data[data[col] == cate])
    return pd.DataFrame(res)
141
+
142
+
143
def POPE_rating(data_file):
    """Compute POPE metrics (F1, accuracy, precision, recall, as percentages)
    overall and per category. Rows with comma-joined categories are exploded
    into one row per category. Returns a DataFrame with one row per split.
    """
    def binarize(series):
        # 'Yes' -> 1, anything else -> 0.
        return np.array([1 if v == 'Yes' else 0 for v in series])

    def f1_stats(y_true, y_pred):
        tp = sum((y_true == 1) & (y_pred == 1))
        fp = sum((y_true == 0) & (y_pred == 1))
        fn = sum((y_true == 1) & (y_pred == 0))
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        return f1, precision, recall

    data = load(data_file)
    data = data.assign(category=data['category'].str.split(',')).explode('category')
    data['index'] = range(len(data))

    res = dict(split=[], Overall=[], acc=[], precision=[], recall=[])

    def append_split(name, sub):
        f1, precision, recall = f1_stats(binarize(sub['answer']), binarize(sub['extracted']))
        res['split'].append(name)
        res['Overall'].append(f1 * 100)
        res['acc'].append(np.mean(sub['score']) * 100)
        res['precision'].append(precision * 100)
        res['recall'].append(recall * 100)

    append_split('Overall', data)
    if 'category' in data:
        for cate in [c for c in set(data['category']) if not pd.isna(c)]:
            append_split(cate, data[data['category'] == cate])

    return pd.DataFrame(res)
183
+
184
+
185
def default_rating(data_file):
    """Default rating: overall mean score plus per-'category' and
    per-'l2-category' mean scores (all as percentages), categories sorted.

    Returns a one-row DataFrame via ``d2df``.
    """
    data = load(data_file)
    res = {'Overall': np.mean(data['score']) * 100}
    for col in ('category', 'l2-category'):
        if col in data:
            cates = sorted(c for c in set(data[col]) if not pd.isna(c))
            for cate in cates:
                res[cate] = np.mean(data[data[col] == cate]['score']) * 100
    return d2df(res)
205
+
206
+
207
def YOrN_match_prompt(line):
    """Build the judge prompt that maps a free-form answer onto Yes/No/Unknown.

    Expects ``line`` to expose 'question' and 'prediction'; the fixed
    instruction/example text is kept byte-identical to the original prompt.
    """
    instructions = (
        'You are an AI assistant who will help me to match an answer with two options of a question. '
        'The options are only Yes / No. '
        'You are provided with a question and an answer, '
        'and you need to find which option (Yes / No) is most similar to the answer. '
        'If the meaning of all options are significantly different from the answer, output Unknown. '
        'Your should output a single word among the following 3 choices: Yes, No, Unknown.\n'
        'Example 1: \n'
        "Question: Is the word in this image 'Hello'?\nAnswer: The word in this image is 'Hello'.\nYour output: Yes\n"
        'Example 2: \n'
        "Question: Is the word in this image 'Hello'?\n"
        "Answer: The word in this image is not 'Hello'.\nYour output: No\n"
        'Example 3: \n'
    )
    return instructions + f"Question: {line['question']}?\nAnswer: {line['prediction']}\nYour output: "
224
+
225
+
226
def YOrN_Extraction(output):
    """Map a free-form answer to 'Yes' / 'No' / 'Unknown'.

    Lowercases, strips punctuation (via ``process_punctuation``) and decides
    by whether exactly one of the words 'yes' / 'no' appears.
    """
    tokens = set(process_punctuation(output.lower()).split())
    has_yes = 'yes' in tokens
    has_no = 'no' in tokens
    if has_yes and not has_no:
        return 'Yes'
    if has_no and not has_yes:
        return 'No'
    return 'Unknown'
234
+
235
+
236
def YOrN_auxeval(model, line):
    """Query the judge model up to 5 times until a definite Yes/No emerges.

    The sampling temperature rises with each retry (0.0, 0.5, ...); falls
    back to 'Unknown' when no attempt yields a definite answer.
    """
    prompt = YOrN_match_prompt(line)
    for attempt in range(5):
        reply = model.generate(prompt, temperature=0.5 * attempt)
        verdict = YOrN_Extraction(reply)
        if verdict != 'Unknown':
            return verdict
    return 'Unknown'