import os
from collections import defaultdict

from anls import anls_score
from loguru import logger as eval_logger

dir_name = os.path.dirname(os.path.abspath(__file__))

# ConBench question-type taxonomy.
eval_type_dict = {
    "Sensation": ["count", "color", "scene", "poster", "attribute_recognition", "ocr", "position"],
    "Cognition": ["calculation", "code", "translation", "math", "cross_instance_reason", "attribute_reason"],
    "Knowledge": ["celebrity", "chemistry", "physics", "biology", "landmark", "artwork"],
}

def conbench_doc_to_visual(doc):
    # lmms-eval visual hook: return the example's image as a one-element list of RGB images.
    return [doc["image"].convert("RGB")]

def conbench_doc_to_text(doc):
    # lmms-eval text hook: the prompt is the question itself, stripped of surrounding whitespace.
    question = doc["question"].strip()
    return question

def parse_pred_ans_NY(pred_ans):
    # Map a free-form (already lowercased) prediction to "yes", "no", or "other".
    if pred_ans in ["yes", "no"]:
        pred_label = pred_ans
    else:
        # Fall back to searching for "yes"/"no" in the first four characters.
        prefix_pred_ans = pred_ans[:4]
        if "yes" in prefix_pred_ans:
            pred_label = "yes"
        elif "no" in prefix_pred_ans:
            pred_label = "no"
        else:
            pred_label = "other"
    return pred_label
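
# Illustrative behavior (hypothetical model outputs, not dataset samples):
#   parse_pred_ans_NY("yes, it is")  -> "yes"
#   parse_pred_ans_NY("no.")         -> "no"
#   parse_pred_ans_NY("maybe")       -> "other"
# Note the prefix match is substring-based, so e.g. "not sure" also maps to "no".
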
def parse_pred_ans_choice(pred_ans):
    # Strip spaces and take the first remaining character as the chosen option letter.
    return pred_ans.replace(" ", "")[0]
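
# Illustrative behavior (hypothetical model outputs, not dataset samples):
#   parse_pred_ans_choice("a. the left one")  -> "a"
#   parse_pred_ans_choice(" b")               -> "b"
# Predictions are lowercased upstream in conbench_process_results.
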
def conbench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case ConScore_D), value: metric value
    """
    # Normalize the prediction: drop newlines and lowercase.
    pred = results[0].replace("\n", "").lower()

    # Parse the prediction according to the question type.
    if doc["question_field"] == "N/Y":
        pred_ans = parse_pred_ans_NY(pred)
    elif doc["question_field"] == "Choices":
        pred_ans = parse_pred_ans_choice(pred)
    else:
        pred_ans = pred

    gt_ans = doc["answer"].lower()

    # Q/A answers are scored with ANLS (threshold 0.95); every question type also accepts an exact match.
    score = 1 if (doc["question_field"] == "Q/A" and anls_score(prediction=pred_ans, gold_labels=[gt_ans], threshold=0.95) >= 0.4) or (gt_ans == pred_ans) else 0

    return {"ConScore_D": {"image_id": doc["image_id"], "question_field": doc["question_field"], "score": score}}
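
# A minimal sketch of the per-document flow, assuming a hypothetical doc
# (ConBench poses three questions per image, one per question type):
#   doc = {"image_id": 0, "question_field": "N/Y", "answer": "Yes", ...}
#   conbench_process_results(doc, ["Yes, it is."])
#   -> {"ConScore_D": {"image_id": 0, "question_field": "N/Y", "score": 1}}
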
def conbench_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        the ConScore_D value: the fraction of images whose three questions are all
        answered correctly (each image contributes three results, one per question type)
    """
    # Sum the per-question scores for each image.
    summary = defaultdict(int)
    for result in results:
        summary[result["image_id"]] += result["score"]

    # An image is a "consistency case" when all three of its questions are answered correctly.
    cnt_con = sum(1 for score in summary.values() if score == 3)

    eval_logger.info(f"Consistency Cases: {cnt_con}")
    con_score = cnt_con / (len(results) / 3)
    eval_logger.info(f"ConScore_D: {con_score:.2f}")
    return con_score
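
# A minimal usage sketch with toy inputs (hypothetical image_id and scores):
#   results = [
#       {"image_id": 0, "question_field": "N/Y", "score": 1},
#       {"image_id": 0, "question_field": "Choices", "score": 1},
#       {"image_id": 0, "question_field": "Q/A", "score": 1},
#   ]
#   conbench_aggregate_results(results)  # one image, fully consistent -> 1.0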