import json

import numpy as np
import pandas as pd
import yaml
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score

from .envs import TOKEN
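# Per-column datatypes for the rendered leaderboard table, one entry per displayed
# column (presumably consumed by the Space's UI layer, e.g. a Gradio Dataframe).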
TYPES = ["str", "number", "number", "number", "number", "number"]
def read_json(file_path: str) -> list[dict]:
    """
    Read a JSON/JSONL file and return its contents as a list of dictionaries.

    Parameters:
        file_path (str): The path to the JSON or JSONL file.

    Returns:
        list[dict]: The contents of the file as a list of dictionaries.
    """
    try:
        # Try JSON Lines first: one JSON object per line.
        with open(file_path) as f:
            data = [json.loads(x) for x in f]
            return data
    except json.decoder.JSONDecodeError:
        # Fall back to a single JSON document containing a list of objects.
        with open(file_path) as f:
            data = json.load(f)
            return data
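# Usage sketch (the file names below are hypothetical, for illustration only):
#
#     records = read_json("./predictions/some_model.jsonl")  # JSONL: one object per line
#     records = read_json("./predictions/some_model.json")   # a plain JSON list also works
#
# Either way, `records` is a list of dicts, each expected downstream to carry a "winner" field.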
def pairwise_compare(
    evaluator1_responses: list[dict],
    evaluator2_responses: list[dict],
) -> tuple[float, float]:
    """
    Compare two pairwise evaluators on the same set of examples.

    Args:
        evaluator1_responses: The responses from the first evaluator.
        evaluator2_responses: The responses from the second evaluator.

    Returns:
        tuple[float, float]: Exact-match accuracy and Cohen's kappa agreement
        between the two evaluators' "winner" labels.
    """
    assert len(evaluator1_responses) == len(evaluator2_responses)
    evaluator1_winners = np.array([response["winner"] for response in evaluator1_responses])
    evaluator2_winners = np.array([response["winner"] for response in evaluator2_responses])
    # Fraction of examples on which the two evaluators pick the same winner.
    acc = (evaluator1_winners == evaluator2_winners).mean().item()
    # Chance-corrected agreement.
    agreement = cohen_kappa_score(evaluator1_winners, evaluator2_winners)
    return acc, agreement
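# Worked example (synthetic labels, for illustration only). With
#
#     a = [{"winner": "A"}, {"winner": "B"}, {"winner": "A"}, {"winner": "B"}]
#     b = [{"winner": "A"}, {"winner": "B"}, {"winner": "B"}, {"winner": "B"}]
#
# pairwise_compare(a, b) returns accuracy 0.75 (3 of 4 labels match) and
# Cohen's kappa 0.5 (observed agreement 0.75, chance agreement 0.5).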
def pairwise_meta_eval(
    human_responses: list[dict],
    model_dir: str,
    model_dir_swap: str,
) -> tuple[float, float, float, float]:
    """
    Evaluate a pairwise evaluator against human judgments.

    Args:
        human_responses: The responses from the human evaluator.
        model_dir: Path to the file with the model's responses.
        model_dir_swap: Path to the file with the model's responses on
            swapped (candidate-order-reversed) inputs.

    Returns:
        tuple[float, float, float, float]: Accuracy and agreement with humans
        (averaged over the original and swapped orderings), followed by the
        model's self-consistency accuracy and agreement across the two orderings.
    """
    model_responses = read_json(model_dir)
    model_responses_swap = read_json(model_dir_swap)
    # Agreement with humans on the original and the swapped candidate order;
    # averaging the two reduces sensitivity to candidate order.
    acc, agr = pairwise_compare(human_responses, model_responses)
    swap_acc, swap_agr = pairwise_compare(
        human_responses,
        model_responses_swap,
    )
    acc = (acc + swap_acc) / 2
    agr = (agr + swap_agr) / 2
    # Self-consistency of the judge between the two orderings.
    models_acc, models_agr = pairwise_compare(
        model_responses,
        model_responses_swap,
    )
    return acc, agr, models_acc, models_agr
def load_leaderboard() -> pd.DataFrame:
    """Load the leaderboard: model metadata from disk, human judgments from the Hub,
    and per-model predictions from ./predictions."""
    with open("./data/models.yaml") as fp:
        models = yaml.safe_load(fp)
    human_responses = load_dataset("salesforce/instrusum", "human_eval_pairwise", token=TOKEN)["data"]
    human_responses = list(human_responses)
    predictions = {k: [] for k in ["Model", "Accuracy", "Agreement", "Self-Accuracy", "Self-Agreement"]}
    for model in models:
        fdir = model["fdir"]
        acc, agr, models_acc, models_agr = pairwise_meta_eval(
            human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
        )
        predictions["Model"].append(model["name"])
        predictions["Accuracy"].append(acc)
        predictions["Agreement"].append(agr)
        predictions["Self-Accuracy"].append(models_acc)
        predictions["Self-Agreement"].append(models_agr)
    df = pd.DataFrame(predictions).sort_values(by="Agreement", ascending=False).round(decimals=3)
    df.reset_index(drop=True, inplace=True)
    # Prepend a 1-based rank column (shown with a blank header).
    df[' '] = pd.Series(range(1, len(df) + 1))
    columns = [' '] + [col for col in df.columns if col != ' ']
    df = df[columns]
    return df
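# A minimal rendering sketch, assuming the Space's UI is Gradio (not shown in this
# module). The leaderboard could be displayed with a Dataframe component, with TYPES
# supplying the per-column datatypes; hypothetical, for illustration only:
#
#     import gradio as gr
#
#     with gr.Blocks() as demo:
#         gr.Dataframe(value=load_leaderboard(), datatype=TYPES)
#     demo.launch()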