"""Code reasoning / completion benchmark adapters (CRUXEval, SAFIM, HumanEval-X).""" from __future__ import annotations import re from typing import Any from adapters import DatasetAdapter # Injected at runtime by _set_helpers() _highlight_code = None _code_offset = None _extract_test_classes = None # --------------------------------------------------------------------------- # CRUXEval adapter (HuggingFace: cruxeval-org/cruxeval) # --------------------------------------------------------------------------- class CRUXEvalAdapter(DatasetAdapter): slug = "cruxeval" display_name = "CRUXEval" has_ground_truth = False has_tasks = True def __init__(self, hf_dataset): self._ds = hf_dataset def problem_count(self) -> int: return len(self._ds) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._ds[idx] return { "idx": idx, "task_id": row["id"], "entry_point": "f", "num_inputs": 1, "source": "CRUXEval", } def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._ds[idx] code = row["code"] return { "idx": idx, "task_id": row["id"], "entry_point": "f", "code": code, "highlighted_code": _highlight_code(code), "inputs": [row["input"]], "outputs": [row["output"]], "test": None, "tasks": [ { "name": "Output Prediction", "description": "Given the code and input, predict the output.", "given": "input", "predict": "output", "input": row["input"], "output": row["output"], }, { "name": "Input Prediction", "description": "Given the code and output, predict the input.", "given": "output", "predict": "input", "input": row["input"], "output": row["output"], }, ], "source": "CRUXEval", "has_ground_truth": False, "has_tasks": True, } # --------------------------------------------------------------------------- # SAFIM adapter (HuggingFace: gonglinyuan/safim) # --------------------------------------------------------------------------- class SAFIMAdapter(DatasetAdapter): slug = "safim" display_name = "SAFIM" has_ground_truth = False has_tasks = False def __init__(self, hf_dataset): self._ds = hf_dataset def problem_count(self) -> int: return len(self._ds) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._ds[idx] return { "idx": idx, "task_id": row.get("task_id", str(idx)), "entry_point": row.get("task_id", f"safim_{idx}"), "num_inputs": 0, "source": row.get("lang", "unknown"), } # Patterns that mark where the completion should be inserted _HOLE_MARKERS = [ "{{completion}}", "/* TODO: Your code here */", "// TODO: Your code here", "# TODO: Your code here", ] def _find_hole_marker(self, prompt: str) -> str | None: """Return the first matching hole marker found in the prompt, or None.""" for marker in self._HOLE_MARKERS: if marker in prompt: return marker return None def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._ds[idx] prompt = row.get("prompt", "") ground_truth = row.get("ground_truth", "") lang = row.get("lang", "python") marker = self._find_hole_marker(prompt) if marker: display_code = prompt.replace(marker, "/* [HOLE] */") before_hole = prompt.split(marker)[0] merged_code = prompt.replace(marker, ground_truth) else: display_code = prompt + "\n/* [HOLE] */\n" before_hole = prompt + "\n" merged_code = prompt + "\n" + ground_truth + "\n" # Compute 1-indexed line range of the inserted ground truth gt_start_line = before_hole.count("\n") + 1 gt_line_count = ground_truth.count("\n") + (1 if ground_truth else 0) gt_end_line = gt_start_line + gt_line_count - 1 lang_key = {"Python": "python", "Java": "java", "C++": "cpp", "C#": "csharp"}.get( lang, lang.lower() ) return { "idx": idx, "task_id": row.get("task_id", str(idx)), "entry_point": row.get("task_id", f"safim_{idx}"), "code": display_code, "highlighted_code": _highlight_code(display_code, language=lang_key), "inputs": [], "outputs": [], "test": None, "tasks": [], "source": lang, "has_ground_truth": False, "has_tasks": False, "fim_prefix": prompt, "fim_ground_truth": ground_truth, "fim_ground_truth_highlighted": _highlight_code(ground_truth, language=lang_key), "fim_merged_code": merged_code, "fim_merged_highlighted": _highlight_code( merged_code, highlight_lines=list(range(gt_start_line, gt_end_line + 1)), language=lang_key, ), "fim_gt_start_line": gt_start_line, "fim_gt_end_line": gt_end_line, "language": lang, } # --------------------------------------------------------------------------- # HumanEval-X adapter (HuggingFace: THUDM/humaneval-x) # --------------------------------------------------------------------------- def _extract_func_name(declaration: str) -> str: """Extract the function/method name from a code declaration string.""" m = re.search(r"def\s+(\w+)\s*\(", declaration) if m: return m.group(1) m = re.search(r"(\w+)\s*\(", declaration) if m: return m.group(1) return "" # --------------------------------------------------------------------------- # HumanEvalPack adapter (HuggingFace: bigcode/humanevalpack) # --------------------------------------------------------------------------- class HumanEvalPackAdapter(DatasetAdapter): slug = "humanevalpack" display_name = "HumanEvalPack" has_ground_truth = False has_tasks = False LANGUAGES = ["python", "js", "cpp", "go", "java", "rust"] def __init__(self, datasets_by_lang: dict[str, Any]): self._by_lang = datasets_by_lang first_lang = next(iter(self._by_lang)) self._count = len(self._by_lang[first_lang]) def problem_count(self) -> int: return self._count def get_problem_summary(self, idx: int) -> dict[str, Any]: first_lang = next(iter(self._by_lang)) row = self._by_lang[first_lang][idx] return { "idx": idx, "task_id": row["task_id"], "entry_point": row.get("entry_point", f"problem_{idx}"), "num_inputs": len(self._by_lang), "source": "HumanEvalPack", } def get_problem_detail(self, idx: int) -> dict[str, Any]: first_lang = next(iter(self._by_lang)) row = self._by_lang[first_lang][idx] lang_labels = { "python": "Python", "js": "JavaScript", "cpp": "C++", "go": "Go", "java": "Java", "rust": "Rust", } lang_pygments = { "python": "python", "js": "javascript", "cpp": "cpp", "go": "go", "java": "java", "rust": "rust", } lang_solutions = [] for lang in self.LANGUAGES: if lang not in self._by_lang: continue lrow = self._by_lang[lang][idx] canonical = lrow.get("prompt", "") + lrow.get("canonical_solution", "") buggy = lrow.get("prompt", "") + lrow.get("buggy_solution", "") lang_key = lang_pygments.get(lang, lang) lang_solutions.append( { "language": lang, "language_label": lang_labels.get(lang, lang), "code": canonical, "highlighted_code": _highlight_code(canonical, language=lang_key), "buggy_code": buggy, "buggy_highlighted_code": _highlight_code(buggy, language=lang_key), "test": lrow.get("test", ""), "example_test": lrow.get("example_test", ""), "bug_type": lrow.get("bug_type", ""), "failure_symptoms": lrow.get("failure_symptoms", ""), } ) py_row = self._by_lang.get("python", self._by_lang[first_lang])[idx] default_code = py_row.get("prompt", "") + py_row.get("canonical_solution", "") return { "idx": idx, "task_id": row["task_id"], "entry_point": row.get("entry_point", f"problem_{idx}"), "code": default_code, "highlighted_code": _highlight_code(default_code), "inputs": [], "outputs": [], "test": py_row.get("test", ""), "tasks": [], "source": "HumanEvalPack", "has_ground_truth": False, "has_tasks": False, "description": row.get("instruction", row.get("docstring", "")), "lang_solutions": lang_solutions, "bug_type": py_row.get("bug_type", ""), "failure_symptoms": py_row.get("failure_symptoms", ""), } # --------------------------------------------------------------------------- # HumanEval-X adapter (HuggingFace: THUDM/humaneval-x) # --------------------------------------------------------------------------- class HumanEvalXAdapter(DatasetAdapter): slug = "humanevalx" display_name = "HumanEval-X" has_ground_truth = False has_tasks = False LANGUAGES = ["python", "cpp", "java", "go", "js"] def __init__(self, datasets_by_lang: dict[str, Any]): """datasets_by_lang maps language name -> HF dataset split.""" self._by_lang = datasets_by_lang first_lang = next(iter(self._by_lang)) self._count = len(self._by_lang[first_lang]) def problem_count(self) -> int: return self._count def get_problem_summary(self, idx: int) -> dict[str, Any]: first_lang = next(iter(self._by_lang)) row = self._by_lang[first_lang][idx] task_id = row["task_id"].split("/")[-1] decl = row.get("declaration", row.get("prompt", "")) entry = _extract_func_name(decl) or f"problem_{task_id}" return { "idx": idx, "task_id": f"HumanEval/{task_id}", "entry_point": entry, "num_inputs": len(self._by_lang), "source": "HumanEval-X", } def get_problem_detail(self, idx: int) -> dict[str, Any]: first_lang = next(iter(self._by_lang)) row = self._by_lang[first_lang][idx] task_id = row["task_id"].split("/")[-1] decl = row.get("declaration", row.get("prompt", "")) entry = _extract_func_name(decl) or f"problem_{task_id}" lang_solutions = [] for lang in self.LANGUAGES: if lang not in self._by_lang: continue lrow = self._by_lang[lang][idx] code = lrow["prompt"] + lrow["canonical_solution"] lang_solutions.append( { "language": lang, "code": code, "highlighted_code": _highlight_code(code, language=lang), "test": lrow.get("test", ""), "example_test": lrow.get("example_test", ""), } ) py_row = self._by_lang.get("python", self._by_lang[first_lang])[idx] default_code = py_row["prompt"] + py_row["canonical_solution"] return { "idx": idx, "task_id": f"HumanEval/{task_id}", "entry_point": entry, "code": default_code, "highlighted_code": _highlight_code(default_code), "inputs": [], "outputs": [], "test": py_row.get("test", ""), "tasks": [], "source": "HumanEval-X", "has_ground_truth": False, "has_tasks": False, "lang_solutions": lang_solutions, }