# hjkim00's picture
# Restore all essential files - code, configs, and MBPP/HumanEval data
# 24c2665 verified
# Convert a directory of solutions to a JSONL file
import json
import os
from tqdm import tqdm
from evalplus.data import get_human_eval_plus, get_mbpp_plus
from evalplus.sanitize import sanitize
def main(directory, sanitize_code: bool = True):
    """Convert a directory of per-task solution files into one JSONL file.

    ``directory`` must contain one sub-directory per task, named
    ``<dataset>_<number>`` (mapped to the task id ``<dataset>/<number>``).
    Every ``*.py`` file inside a task sub-directory is treated as one sample
    and written as a ``{"task_id": ..., "solution": ...}`` JSON line to
    ``<parent of directory>/<name of directory>.jsonl``.

    Args:
        directory: Path of the directory holding the task sub-directories.
            A trailing path separator is tolerated.
        sanitize_code: When true, each sample is passed through
            ``sanitize`` with the task's entry point before being written.
            Samples for which ``sanitize`` raises ``ValueError`` are
            reported on stdout and skipped.

    Raises:
        ValueError: If a sub-directory name does not have the form
            ``<dataset>_<number>``.
        KeyError: If the derived task id is in neither the HumanEval+ nor
            the MBPP+ task table.
    """
    # os.path.split() yields an empty tail for "path/to/dir/"; split once
    # more so a trailing separator does not produce a nameless ".jsonl"
    # written inside the input directory itself.
    parent_dir, basename = os.path.split(directory)
    if not basename:
        parent_dir, basename = os.path.split(parent_dir)
    target_jsonl_path = os.path.join(parent_dir, f"{basename}.jsonl")

    # Single lookup table covering both benchmarks, keyed by task id.
    datasets = {**get_human_eval_plus(), **get_mbpp_plus()}

    with open(target_jsonl_path, "w", encoding="utf-8") as f:
        # os.listdir() returns entries in arbitrary order; sort so the
        # resulting JSONL is reproducible across runs and machines.
        for subdir_name in tqdm(sorted(os.listdir(directory))):
            subdir_path = os.path.join(directory, subdir_name)
            if not os.path.isdir(subdir_path):
                continue

            # Explicit check instead of ``assert`` (asserts vanish under -O).
            name_parts = subdir_name.split("_")
            if len(name_parts) != 2:
                raise ValueError(
                    f"Expected a directory named '<dataset>_<number>', "
                    f"got {subdir_name!r}"
                )
            dataset_name, task_num = name_parts
            task_id = f"{dataset_name}/{task_num}"
            entrypoint = datasets[task_id]["entry_point"]

            for sample_name in sorted(os.listdir(subdir_path)):
                if not sample_name.endswith(".py"):
                    continue

                sample_path = os.path.join(subdir_path, sample_name)
                # Context manager closes the handle promptly; the original
                # bare open().read() leaked one descriptor per sample.
                with open(sample_path, encoding="utf-8") as sample_file:
                    code = sample_file.read()

                if sanitize_code:
                    try:
                        code = sanitize(code, entrypoint=entrypoint)
                    except ValueError as e:
                        # Best effort: report the offending sample and move on.
                        print(f"Failed to sanitize {task_id}/{sample_name}: {e}")
                        print(code)
                        continue

                f.write(json.dumps({"task_id": task_id, "solution": code}) + "\n")
if __name__ == "__main__":
    # Script entry point: hand main() to python-fire so it can be driven
    # from the command line. Imported here so that importing this module
    # as a library does not require fire.
    import fire

    fire.Fire(main)