Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
# Headline metric key to read out of each task's eval log, keyed by task
# name; the key indexes into the log's "results" -> "scores" -> "metrics"
# mapping (see combine_eval_results).
METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",
    "mmmu_multiple_choice": "accuracy",
    "mmmu_open": "accuracy",
    # agentic
    "gaia": "accuracy",
    "gdm_intercode_ctf": "accuracy",
    "gdm_in_house_ctf": "accuracy",
    "agentharm": "avg_score",
    "agentharm_benign": "avg_score",
    "swe_bench": "mean",
}
# Reference URL per model. Stored verbatim in the dummy "model_sha" field of
# both the results and request JSON files written by this script.
MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash",  # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
    "o3-mini": "https://openai.com/index/openai-o3-mini",
    "DeepSeek-R1": "https://api-docs.deepseek.com/news/news250120"
}
# Display/version label per model, written into the "model_version" field of
# the dummy request file (see main).
MODEL_VERSION_MAP = {
    # open source models
    "c4ai-command-r-plus": "c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "Qwen2.5-72B-Instruct",
    # closed source models
    "claude-3-5-sonnet-20241022": "Claude-3.5-Sonnet-20241022",
    "gemini-1.5-flash": "Gemini-1.5-Flash",
    "gemini-1.5-pro": "Gemini-1.5-Pro-002",
    "gpt-4o": "GPT-4o-20240806",
    "gpt-4o-mini": "GPT-4o-mini-20240718",
    "o1": "o1-20241217",
    "o3-mini": "o3-mini-20250131",
    "DeepSeek-R1": "DeepSeek-R1",
}
# Maps a model's directory name (as used for the base-benchmark logs) to the
# directory name used in the agentic log tree; only models present here have
# agentic runs.
AGENTIC_LOG_MODEL_NAME_MAP = {
    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
    "gemini-1.5-pro": "gemini-1.5-pro-002",
    "gpt-4o": "gpt-4o-2024-08-06",
    "o1": "o1-2024-12-17",
    "o3-mini": "o3-mini-2025-01-31",
}
# Per-task subdirectory names inside each model's agentic log directory.
AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
def combine_eval_results(results_path: str, model_name: str, type: str) -> dict:
    """Aggregate per-task Inspect eval log JSONs into one results dict.

    Args:
        results_path: Root directory of the logs. For ``type == "base"`` the
            logs live in ``results_path/<model_name>/*.json``; for
            ``type == "agentic"`` they live in
            ``results_path/<mapped model name>/<task>/*.json``.
        model_name: Model directory name; must be a key of MODEL_SHA_MAP
            (and of AGENTIC_LOG_MODEL_NAME_MAP when ``type == "agentic"``).
        type: ``"base"`` or ``"agentic"``. (Name shadows the builtin but is
            kept for backward compatibility with keyword callers.)

    Returns:
        A dict with a ``"config"`` section (model metadata, partly dummy
        values) and a ``"results"`` section mapping
        ``task_name -> {metric_name: metric_value}``.

    Malformed logs (missing keys) are reported to stdout and skipped rather
    than aborting the whole aggregation.
    """
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }

    def _add_log(dir_path: str, file_name: str, agentic: bool) -> None:
        """Parse one eval log and merge its headline metric into `results`."""
        if not file_name.endswith(".json"):
            return
        with open(os.path.join(dir_path, file_name), "r") as f:
            try:
                result = json.load(f)
                task_name = result["eval"]["task"].split("/")[-1]
                if not agentic and task_name == "math":
                    # math logs multiple scorers; pick expression_equivalance.
                    metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
                else:
                    metrics = result["results"]["scores"][0]["metrics"]
                metric = metrics[METRIC_NAME[task_name]]
                # Agentic metric names may be namespaced ("x/accuracy");
                # keep only the last path component in that case.
                metric_name = metric["name"].split("/")[-1] if agentic else metric["name"]
                results["results"].update({task_name: {metric_name: metric["value"]}})
            except KeyError as e:
                # NOTE: model_name is read at call time, so for agentic runs
                # this prints the remapped log-directory name (as before).
                print(f"KeyError: {e}")
                print(model_name)
                print(file_name)

    if type == "base":
        model_dir = os.path.join(results_path, model_name)
        for file in os.listdir(model_dir):
            _add_log(model_dir, file, agentic=False)
    elif type == "agentic":
        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]  # change name based on log file structure
        results_path = os.path.join(results_path, model_name)
        for task in AGENTIC_TASKS:
            task_dir = os.path.join(results_path, task)
            for file in os.listdir(task_dir):
                _add_log(task_dir, file, agentic=True)
    return results
def main():
    """Convert raw benchmarking logs into leaderboard-style JSON files.

    For every model directory under ``base_bm_input_path``:
      * aggregate its base (single-turn) eval logs,
      * pad any benchmark missing from the logs with a ``None`` metric value,
      * merge in agentic results when the model has an agentic log directory,
      * write ``<model>.json`` into the eval-results dir and a dummy
        "FINISHED" request file into the eval-queue dir, both rooted at
        ``$HF_HOME`` (falling back to the current directory).
    """
    cache_path = os.getenv("HF_HOME", ".")
    eval_results_path = os.path.join(cache_path, "eval-results")
    eval_requests_path = os.path.join(cache_path, "eval-queue")
    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
    os.makedirs(eval_results_path, exist_ok=True)
    os.makedirs(eval_requests_path, exist_ok=True)
    for model_name in os.listdir(base_bm_input_path):
        if not os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            continue
        results = combine_eval_results(base_bm_input_path, model_name, "base")
        # TMP: Add missing benchmarks to the results (metric value None so the
        # leaderboard still shows a column for them).
        for task_name, metric_name in METRIC_NAME.items():
            if task_name not in results["results"]:
                results["results"].update({task_name: {metric_name: None}})
        # Merge agentic results when this model has an agentic log directory.
        agentic_dir = AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA")
        if os.path.isdir(os.path.join(agentic_bm_input_path, agentic_dir)):
            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
            results["results"].update(agentic_bm_results["results"])
        with open(os.path.join(eval_results_path, f"{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4)
        # Create dummy requests file (marks the model as already evaluated).
        requests = {
            "model": model_name,
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_version": MODEL_VERSION_MAP[model_name],
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "float16",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": "",
            "model_type": "pretrained",
            "likes": 0,
            "params": 0,
            "license": "custom",
        }
        with open(os.path.join(eval_requests_path, f"{model_name}.json"), "w") as f:
            json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()