Spaces:
Runtime error
Runtime error
| """This file checks two things: | |
| 1. Is the LLM's code generation complete for every task in the benchmark? | |
| 2. Warn about generated code that is not compilable (this could indicate implementation issues). | |
| """ | |
| from termcolor import colored | |
| from evalplus.data import load_solutions | |
| from evalplus.sanitize import syntax_check | |
def load_benchmark(name):
    """Return the EvalPlus task dictionary selected by ``--dataset``.

    Args:
        name: Either ``"humaneval"`` or ``"mbpp"``.

    Returns:
        The mapping returned by the matching ``evalplus.data`` loader; the
        rest of this script indexes it by task id and reads ``["prompt"]``.

    Raises:
        ValueError: If ``name`` is not one of the supported datasets.
    """
    # The loaders are imported lazily so only the requested one is touched.
    if name == "humaneval":
        from evalplus.data import get_human_eval_plus

        return get_human_eval_plus()
    if name == "mbpp":
        from evalplus.data import get_mbpp_plus

        return get_mbpp_plus()
    raise ValueError(f"unknown dataset: {name!r}")


def group_solutions(solutions, dataset):
    """Group solution records by their ``"task_id"``.

    Records that only carry a ``"completion"`` get a ``"solution"`` field
    built in place as ``dataset[task_id]["prompt"] + completion``.

    Args:
        solutions: Iterable of dicts, each with ``"task_id"`` and either
            ``"solution"`` or ``"completion"``.
        dataset: Mapping from task id to a dict containing ``"prompt"``.

    Returns:
        Dict mapping each task id to the list of its records, in input order.

    Raises:
        ValueError: If a record has neither ``"solution"`` nor
            ``"completion"``.
        KeyError: If a completion-only record names a task id that is not in
            ``dataset``.
    """
    id2solutions = {}
    for solution in solutions:
        task_id = solution["task_id"]
        if "solution" not in solution:
            # Explicit raise instead of ``assert`` so the check survives -O.
            if "completion" not in solution:
                raise ValueError("solution or completion must exist!")
            solution["solution"] = dataset[task_id]["prompt"] + solution["completion"]
        id2solutions.setdefault(task_id, []).append(solution)
    return id2solutions


def expected_nsample(id2solutions, nsample):
    """Return the number of samples every task is expected to have.

    This is the larger of the requested ``nsample`` and the biggest group
    actually present. ``default=0`` keeps an empty sample file from raising
    ``ValueError`` inside ``max``.
    """
    largest_group = max((len(v) for v in id2solutions.values()), default=0)
    return max(nsample, largest_group)


def check_completeness(dataset, id2solutions, nsample):
    """Report tasks that are missing or have a sample count other than ``nsample``.

    Args:
        dataset: Mapping whose keys are the benchmark's task ids.
        id2solutions: Output of :func:`group_solutions`.
        nsample: Expected number of samples per task.

    Returns:
        The number of tasks that have exactly ``nsample`` samples.
    """
    print(colored("==============================", "blue"))
    print(colored(" ::: Checking completeness... ", "blue"))
    print(colored(" ::::: All tasks complete? ", "blue"))

    ndone = 0
    ntask = len(dataset)
    for task_id in dataset:
        if task_id not in id2solutions:
            print(colored(f" ⚠️ {task_id} is missing!", "red"))
            continue
        nfiles = len(id2solutions[task_id])
        if nfiles == nsample:
            ndone += 1
            continue
        print(
            colored(
                f" ⚠️ {task_id} only has {nfiles} samples! But {nsample} are expected.",
                "red",
            )
        )

    if ntask != ndone:
        ntbd = ntask - ndone
        print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red"))
    else:
        print(colored(f" ::::: All {ntask} tasks complete!", "green"))
    return ndone


def check_compilation(dataset, id2solutions, verbose):
    """Report solutions that are empty or fail ``syntax_check``.

    Only tasks present in both ``dataset`` and ``id2solutions`` are examined;
    missing tasks are skipped silently here.

    Args:
        dataset: Mapping whose keys are the benchmark's task ids.
        id2solutions: Output of :func:`group_solutions`; each record must
            also carry an ``"_identifier"`` used in the warning messages.
        verbose: Forwarded as the second argument of ``syntax_check``.

    Returns:
        Tuple ``(ncode, nwrong)``: solutions examined and solutions flagged.
    """
    print(colored("==============================", "blue"))
    print(colored(" ::: Checking compilation... ", "blue"))
    print(colored(" ::::: All code compilable? ", "blue"))

    ncode = 0
    nwrong = 0
    for task_id in dataset:
        for solution in id2solutions.get(task_id, ()):
            ncode += 1
            code = solution["solution"]
            dbg_identifier = solution["_identifier"]
            if code.strip() == "":
                print(colored(f" ⚠️ {dbg_identifier} is empty!", "red"))
                nwrong += 1
            elif not syntax_check(code, verbose):
                print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red"))
                nwrong += 1

    if nwrong != 0:
        print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red"))
    else:
        print(colored(f" ::::: All {ncode} code are compilable!", "green"))
    return ncode, nwrong


def main():
    """Parse the command line and run the completeness and compilation checks."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--samples", type=str, required=True)
    parser.add_argument(
        "--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
    )
    parser.add_argument("--nsample", type=int, default=1)
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # Records look like {"task_id": ..., "solution" or "completion": ...}.
    solutions = load_solutions(args.samples)
    dataset = load_benchmark(args.dataset)

    id2solutions = group_solutions(solutions, dataset)
    nsample = expected_nsample(id2solutions, args.nsample)

    check_completeness(dataset, id2solutions, nsample)
    check_compilation(dataset, id2solutions, args.verbose)


if __name__ == "__main__":
    main()