import os
from collections import defaultdict

from anls import anls_score
from loguru import logger as eval_logger

dir_name = os.path.dirname(os.path.abspath(__file__))

# ConBench question-type taxonomy.
eval_type_dict = {
    "Sensation": ["count", "color", "scene", "poster", "attribute_recognition", "ocr", "position"],
    "Cognition": ["calculation", "code", "translation", "math", "cross_instance_reason", "attribute_reason"],
    "Knowledge": ["celebrity", "chemistry", "physics", "biology", "landmark", "artwork"],
}

def conbench_doc_to_visual(doc):
    # lmms-eval visual hook: return the example's image as a one-element list of RGB images.
    return [doc["image"].convert("RGB")]

def conbench_doc_to_text(doc):
    # lmms-eval text hook: the prompt is the question itself, stripped of surrounding whitespace.
    question = doc["question"].strip()
    return question

def parse_pred_ans_NY(pred_ans):
    # Map a free-form (already lowercased) prediction to "yes", "no", or "other".
    if pred_ans in ["yes", "no"]:
        pred_label = pred_ans
    else:
        # Fall back to searching for "yes"/"no" in the first four characters.
        prefix_pred_ans = pred_ans[:4]
        if "yes" in prefix_pred_ans:
            pred_label = "yes"
        elif "no" in prefix_pred_ans:
            pred_label = "no"
        else:
            pred_label = "other"
    return pred_label
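
# Illustrative behavior (hypothetical model outputs, not dataset samples):
#   parse_pred_ans_NY("yes, it is")  -> "yes"
#   parse_pred_ans_NY("no.")         -> "no"
#   parse_pred_ans_NY("maybe")       -> "other"
# Note the prefix match is substring-based, so e.g. "not sure" also maps to "no".
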
def parse_pred_ans_choice(pred_ans):
    # Strip spaces and take the first remaining character as the chosen option letter.
    return pred_ans.replace(" ", "")[0]
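
# Illustrative behavior (hypothetical model outputs, not dataset samples):
#   parse_pred_ans_choice("a. the left one")  -> "a"
#   parse_pred_ans_choice(" b")               -> "b"
# Predictions are lowercased upstream in conbench_process_results.
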
def conbench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case ConScore_D), value: metric value
    """
    # Normalize the prediction: drop newlines and lowercase.
    pred = results[0].replace("\n", "").lower()

    # Parse the prediction according to the question type.
    if doc["question_field"] == "N/Y":
        pred_ans = parse_pred_ans_NY(pred)
    elif doc["question_field"] == "Choices":
        pred_ans = parse_pred_ans_choice(pred)
    else:
        pred_ans = pred

    gt_ans = doc["answer"].lower()

    # Q/A answers are scored with ANLS (threshold 0.95); every question type also accepts an exact match.
    score = 1 if (doc["question_field"] == "Q/A" and anls_score(prediction=pred_ans, gold_labels=[gt_ans], threshold=0.95) >= 0.4) or (gt_ans == pred_ans) else 0

    return {"ConScore_D": {"image_id": doc["image_id"], "question_field": doc["question_field"], "score": score}}
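
# A minimal sketch of the per-document flow, assuming a hypothetical doc
# (ConBench poses three questions per image, one per question type):
#   doc = {"image_id": 0, "question_field": "N/Y", "answer": "Yes", ...}
#   conbench_process_results(doc, ["Yes, it is."])
#   -> {"ConScore_D": {"image_id": 0, "question_field": "N/Y", "score": 1}}
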
def conbench_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        the ConScore_D value: the fraction of images whose three questions are all
        answered correctly (each image contributes three results, one per question type)
    """
    # Sum the per-question scores for each image.
    summary = defaultdict(int)
    for result in results:
        summary[result["image_id"]] += result["score"]

    # An image is a "consistency case" when all three of its questions are answered correctly.
    cnt_con = sum(1 for score in summary.values() if score == 3)

    eval_logger.info(f"Consistency Cases: {cnt_con}")
    con_score = cnt_con / (len(results) / 3)
    eval_logger.info(f"ConScore_D: {con_score:.2f}")
    return con_score
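
# A minimal usage sketch with toy inputs (hypothetical image_id and scores):
#   results = [
#       {"image_id": 0, "question_field": "N/Y", "score": 1},
#       {"image_id": 0, "question_field": "Choices", "score": 1},
#       {"image_id": 0, "question_field": "Q/A", "score": 1},
#   ]
#   conbench_aggregate_results(results)  # one image, fully consistent -> 1.0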