llm_cp2 / src /lmms-eval /lmms_eval /tasks /chartqa /utils.py

Upload folder using huggingface_hub

b0c0df0 verified about 1 month ago

2.37 kB

	def chartqa_doc_to_visual(doc):
	    """Return the document's image as a one-element list, converted to RGB.

	    Args:
	        doc: Mapping with an "image" entry whose value exposes
	            ``convert(mode)`` (presumably a PIL image; confirm against the
	            dataset loader).

	    Returns:
	        A list holding the single result of ``doc["image"].convert("RGB")``.
	    """
	    rgb_image = doc["image"].convert("RGB")
	    return [rgb_image]


	def chartqa_doc_to_text(doc, lmms_eval_specific_kwargs):
	    """Build the prompt text by wrapping the question in the configured prompts.

	    Args:
	        doc: Mapping with a "question" entry.
	        lmms_eval_specific_kwargs: Mapping providing the "pre_prompt" and
	            "post_prompt" entries placed before and after the question.

	    Returns:
	        The concatenation ``pre_prompt + question + post_prompt`` as a string.
	    """
	    # Look the keys up in the same order as before so a missing key raises
	    # the same KeyError.
	    body = doc["question"]
	    prefix = lmms_eval_specific_kwargs["pre_prompt"]
	    suffix = lmms_eval_specific_kwargs["post_prompt"]
	    return "{}{}{}".format(prefix, body, suffix)


	def chartqa_process_results(doc, results):
	    """Score one prediction and report it overall and under its split.

	    Args:
	        doc: Mapping with "type" (the split name) and "answer" (the target).
	        results: Sequence whose first element is the model's prediction.

	    Returns:
	        A dict with "relaxed_overall" plus exactly one of
	        "relaxed_human_split" (when ``doc["type"] == "human_test"``) or
	        "relaxed_augmented_split" (any other type), all holding the same
	        score: 1.0 if the prediction is relaxed-correct, otherwise 0.0.
	    """
	    prediction = results[0]
	    split = doc["type"]
	    is_correct = relaxed_correctness(prediction, doc["answer"])
	    score = 1.0 if is_correct else 0.0
	    # Only the human-authored test split gets its own key; every other type
	    # is counted as augmented.
	    split_key = "relaxed_human_split" if split == "human_test" else "relaxed_augmented_split"
	    return {"relaxed_overall": score, split_key: score}


	def relaxed_correctness(prediction, target, max_relative_change: float = 0.05) -> bool:
	    """Decide whether a prediction matches the target under relaxed accuracy.

	    Numeric answers are accepted when they lie within ``max_relative_change``
	    of the target; all other answers need a case-insensitive exact match.
	    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
	    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
	    numeric answers to allow a minor inaccuracy that may result from the automatic
	    data extraction process. We consider an answer to be correct if it is within
	    5% of the gold answer. For non-numeric answers, we still need an exact match
	    to consider an answer to be correct.”

	    This function is adapted from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113

	    Args:
	        prediction: Predicted answer string.
	        target: Target (gold) answer string.
	        max_relative_change: Maximum tolerated relative error for numeric
	            answers.

	    Returns:
	        Whether the prediction was correct given the specified tolerance.
	    """

	    def _to_float(text: str):
	        """Parse ``text`` as a float, or return None if it is not numeric.

	        A trailing "%" is treated as a percentage and scaled by 1/100.
	        """
	        try:
	            if text.endswith("%"):
	                # Convert percentages to floats.
	                return float(text.rstrip("%")) / 100.0
	            return float(text)
	        except ValueError:
	            return None

	    prediction_float = _to_float(prediction)
	    target_float = _to_float(target)

	    # Fall back to text comparison unless both sides are numeric.
	    if prediction_float is None or target_float is None:
	        return prediction.lower() == target.lower()

	    if target_float == 0:
	        # Relative change is undefined for a zero target. The old truthiness
	        # check sent this case to string comparison, so "0.0" vs "0" was
	        # wrongly rejected; require exact numeric equality instead.
	        return prediction_float == target_float

	    relative_change = abs(prediction_float - target_float) / abs(target_float)
	    return relative_change <= max_relative_change