import random
import re
from typing import Dict, List, Tuple

import numpy as np

assertion_prompt = """Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A or B."""

mcq_prompt = """Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, or D."""


def csbench_mcq_doc_to_text(doc: Dict, lmms_eval_specific_kwargs: Dict) -> str:
    """Build the prompt for a four-option multiple-choice question."""
    q = doc["Question"]
    a = doc["A"]
    b = doc["B"]
    c = doc["C"]
    d = doc["D"]
    question = f"{mcq_prompt}\nQuestion: {q}\nA: {a}\nB: {b}\nC: {c}\nD: {d}\n"
    return question


def csbench_assertion_doc_to_text(doc: Dict, lmms_eval_specific_kwargs: Dict) -> str:
    """Build the prompt for a True/False assertion, presented as choices A and B."""
    q = doc["Question"]
    question = f"{assertion_prompt}\nQuestion: {q}\nA: True\nB: False\n"
    return question


def csbench_doc_to_target(doc: Dict) -> str:
    """Return the gold answer letter for a document."""
    if doc["Format"].strip() == "Multiple-choice":
        return doc["Answer"].strip().upper()
    else:
        # Assertions store "True"/"False"; map them onto choices A/B.
        return "A" if doc["Answer"].strip() == "True" else "B"


def csbench_doc_to_choice(doc: Dict) -> List[str]:
    """Return the valid choice letters for a document."""
    if doc["Format"].strip() == "Multiple-choice":
        return ["A", "B", "C", "D"]
    else:
        return ["A", "B"]


def parse_multi_choice_response(response: str, all_choices: List[str]) -> str:
    """
    Parse the predicted choice letter (e.g., A, B, C, D) from the generated response.
    Falls back to a random choice if no letter can be identified.
    """
    # Strip surrounding punctuation, then pad with spaces so a bare letter at the
    # start or end of the response still matches the " X " pattern below.
    for char in [",", ".", "!", "?", ";", ":", "'"]:
        response = response.strip(char)
    response = " " + response + " "

    candidates = []

    # First look for parenthesized letters, e.g. "(A)".
    for choice in all_choices:
        if f"({choice})" in response:
            candidates.append(choice)

    # Then look for bare letters surrounded by spaces, e.g. " A ".
    if len(candidates) == 0:
        for choice in all_choices:
            if f" {choice} " in response:
                candidates.append(choice)

    # Finally look for letters followed by a period, e.g. "A.".
    if len(candidates) == 0:
        for choice in all_choices:
            if f"{choice}." in response:
                candidates.append(choice)

    if len(candidates) == 0:
        # Nothing matched: guess randomly among the available choices.
        pred_index = random.choice(all_choices)
    elif len(candidates) > 1:
        # Several letters matched: keep the one that occurs last in the response.
        start_indexes = [response.rfind(f" {can} ") for can in candidates]
        pred_index = candidates[np.argmax(start_indexes)]
    else:
        pred_index = candidates[0]

    return pred_index


def csbench_process_results(doc: Dict, result: List[str]) -> Dict[str, float]:
    """Score a single generation against the gold answer (exact-match accuracy)."""
    pred = parse_multi_choice_response(result[0], csbench_doc_to_choice(doc))
    gt = csbench_doc_to_target(doc)
    score = 1.0 if pred == gt else 0.0
    return {"accuracy": score}
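

# Illustrative usage sketch (not part of the evaluation harness). The field names
# ("Question", "A".."D", "Answer", "Format") follow the accessors above, but the
# sample document and model output below are hypothetical.
if __name__ == "__main__":
    sample_doc = {
        "Question": "Which data structure offers O(1) average-case lookup by key?",
        "A": "Linked list",
        "B": "Hash table",
        "C": "Binary search tree",
        "D": "Stack",
        "Answer": "B",
        "Format": "Multiple-choice",
    }
    prompt = csbench_mcq_doc_to_text(sample_doc, {})
    model_output = "The hash table gives constant-time lookup on average.\nAnswer: B"
    print(prompt)
    print(csbench_process_results(sample_doc, [model_output]))  # {'accuracy': 1.0}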