"""Code reasoning / completion benchmark adapters (CRUXEval, SAFIM, HumanEvalPack, HumanEval-X)."""
| |
|
| | from __future__ import annotations |
| |
|
| | import re |
| | from typing import Any |
| |
|
| | from adapters import DatasetAdapter |
| |
|
| | |
| | _highlight_code = None |
| | _code_offset = None |
| | _extract_test_classes = None |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class CRUXEvalAdapter(DatasetAdapter):
    """Adapter exposing CRUXEval rows as input/output prediction problems."""

    slug = "cruxeval"
    display_name = "CRUXEval"
    has_ground_truth = False
    has_tasks = True

    def __init__(self, hf_dataset):
        # Keep a handle on the HF split; rows are read lazily per index.
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Number of problems in the underlying split."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Lightweight listing entry for problem *idx*."""
        record = self._ds[idx]
        return {
            "idx": idx,
            "task_id": record["id"],
            "entry_point": "f",
            "num_inputs": 1,
            "source": "CRUXEval",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Full detail payload for problem *idx*, including both prediction tasks."""
        record = self._ds[idx]
        source_code = record["code"]
        given_input = record["input"]
        expected_output = record["output"]

        def _task(name: str, description: str, given: str, predict: str) -> dict[str, Any]:
            # Both tasks share the same I/O pair; only the direction differs.
            return {
                "name": name,
                "description": description,
                "given": given,
                "predict": predict,
                "input": given_input,
                "output": expected_output,
            }

        return {
            "idx": idx,
            "task_id": record["id"],
            "entry_point": "f",
            "code": source_code,
            "highlighted_code": _highlight_code(source_code),
            "inputs": [given_input],
            "outputs": [expected_output],
            "test": None,
            "tasks": [
                _task(
                    "Output Prediction",
                    "Given the code and input, predict the output.",
                    "input",
                    "output",
                ),
                _task(
                    "Input Prediction",
                    "Given the code and output, predict the input.",
                    "output",
                    "input",
                ),
            ],
            "source": "CRUXEval",
            "has_ground_truth": False,
            "has_tasks": True,
        }
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class SAFIMAdapter(DatasetAdapter):
    """Adapter for SAFIM syntax-aware fill-in-the-middle problems."""

    slug = "safim"
    display_name = "SAFIM"
    has_ground_truth = False
    has_tasks = False

    # Checked in order: the first marker present in a prompt is the hole.
    _HOLE_MARKERS = [
        "{{completion}}",
        "/* TODO: Your code here */",
        "// TODO: Your code here",
        "# TODO: Your code here",
    ]

    def __init__(self, hf_dataset):
        # HF split; rows are read lazily per index.
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Number of problems in the underlying split."""
        return len(self._ds)

    def _find_hole_marker(self, prompt: str) -> str | None:
        """Return the first matching hole marker found in the prompt, or None."""
        return next((m for m in self._HOLE_MARKERS if m in prompt), None)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Lightweight listing entry for problem *idx*."""
        record = self._ds[idx]
        return {
            "idx": idx,
            "task_id": record.get("task_id", str(idx)),
            "entry_point": record.get("task_id", f"safim_{idx}"),
            "num_inputs": 0,
            "source": record.get("lang", "unknown"),
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Full detail payload for problem *idx*, including FIM metadata."""
        record = self._ds[idx]
        prompt = record.get("prompt", "")
        ground_truth = record.get("ground_truth", "")
        lang = record.get("lang", "python")

        marker = self._find_hole_marker(prompt)
        if marker is None:
            # No recognized marker: treat the hole as appended after the prompt.
            display_code = prompt + "\n/* [HOLE] */\n"
            before_hole = prompt + "\n"
            merged_code = prompt + "\n" + ground_truth + "\n"
        else:
            display_code = prompt.replace(marker, "/* [HOLE] */")
            before_hole = prompt.split(marker)[0]
            merged_code = prompt.replace(marker, ground_truth)

        # 1-based line span of the ground truth within the merged code,
        # used below to highlight the filled-in region.
        gt_start_line = before_hole.count("\n") + 1
        gt_line_count = ground_truth.count("\n") + (1 if ground_truth else 0)
        gt_end_line = gt_start_line + gt_line_count - 1

        # Map the dataset's language tag onto a pygments-style lexer key.
        lexer_key = {"Python": "python", "Java": "java", "C++": "cpp", "C#": "csharp"}.get(
            lang, lang.lower()
        )

        return {
            "idx": idx,
            "task_id": record.get("task_id", str(idx)),
            "entry_point": record.get("task_id", f"safim_{idx}"),
            "code": display_code,
            "highlighted_code": _highlight_code(display_code, language=lexer_key),
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": lang,
            "has_ground_truth": False,
            "has_tasks": False,
            "fim_prefix": prompt,
            "fim_ground_truth": ground_truth,
            "fim_ground_truth_highlighted": _highlight_code(ground_truth, language=lexer_key),
            "fim_merged_code": merged_code,
            "fim_merged_highlighted": _highlight_code(
                merged_code,
                highlight_lines=list(range(gt_start_line, gt_end_line + 1)),
                language=lexer_key,
            ),
            "fim_gt_start_line": gt_start_line,
            "fim_gt_end_line": gt_end_line,
            "language": lang,
        }
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
| | def _extract_func_name(declaration: str) -> str: |
| | """Extract the function/method name from a code declaration string.""" |
| | m = re.search(r"def\s+(\w+)\s*\(", declaration) |
| | if m: |
| | return m.group(1) |
| | m = re.search(r"(\w+)\s*\(", declaration) |
| | if m: |
| | return m.group(1) |
| | return "" |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class HumanEvalPackAdapter(DatasetAdapter):
    """Adapter for HumanEvalPack: the same problems solved in several languages."""

    slug = "humanevalpack"
    display_name = "HumanEvalPack"
    has_ground_truth = False
    has_tasks = False

    # Order in which per-language solutions appear in the detail payload.
    LANGUAGES = ["python", "js", "cpp", "go", "java", "rust"]

    def __init__(self, datasets_by_lang: dict[str, Any]):
        # Maps language tag -> HF dataset split; splits are presumably aligned
        # by index -- TODO confirm against the loader.
        self._by_lang = datasets_by_lang
        primary = next(iter(self._by_lang))
        self._count = len(self._by_lang[primary])

    def problem_count(self) -> int:
        """Number of problems (taken from the first language split)."""
        return self._count

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Lightweight listing entry for problem *idx*."""
        primary = next(iter(self._by_lang))
        record = self._by_lang[primary][idx]
        return {
            "idx": idx,
            "task_id": record["task_id"],
            "entry_point": record.get("entry_point", f"problem_{idx}"),
            "num_inputs": len(self._by_lang),
            "source": "HumanEvalPack",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Full detail payload for problem *idx*, one solution per language."""
        primary = next(iter(self._by_lang))
        record = self._by_lang[primary][idx]

        # Human-readable label and pygments lexer key per dataset language tag.
        lang_meta = {
            "python": ("Python", "python"),
            "js": ("JavaScript", "javascript"),
            "cpp": ("C++", "cpp"),
            "go": ("Go", "go"),
            "java": ("Java", "java"),
            "rust": ("Rust", "rust"),
        }

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            label, lexer = lang_meta.get(lang, (lang, lang))
            # Full programs are the shared prompt plus each solution variant.
            canonical = lrow.get("prompt", "") + lrow.get("canonical_solution", "")
            buggy = lrow.get("prompt", "") + lrow.get("buggy_solution", "")
            lang_solutions.append(
                {
                    "language": lang,
                    "language_label": label,
                    "code": canonical,
                    "highlighted_code": _highlight_code(canonical, language=lexer),
                    "buggy_code": buggy,
                    "buggy_highlighted_code": _highlight_code(buggy, language=lexer),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                    "bug_type": lrow.get("bug_type", ""),
                    "failure_symptoms": lrow.get("failure_symptoms", ""),
                }
            )

        # Default view: the Python solution when available, else the primary split.
        py_row = self._by_lang.get("python", self._by_lang[primary])[idx]
        default_code = py_row.get("prompt", "") + py_row.get("canonical_solution", "")

        return {
            "idx": idx,
            "task_id": record["task_id"],
            "entry_point": record.get("entry_point", f"problem_{idx}"),
            "code": default_code,
            "highlighted_code": _highlight_code(default_code),
            "inputs": [],
            "outputs": [],
            "test": py_row.get("test", ""),
            "tasks": [],
            "source": "HumanEvalPack",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": record.get("instruction", record.get("docstring", "")),
            "lang_solutions": lang_solutions,
            "bug_type": py_row.get("bug_type", ""),
            "failure_symptoms": py_row.get("failure_symptoms", ""),
        }
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class HumanEvalXAdapter(DatasetAdapter):
    """Adapter for HumanEval-X: HumanEval problems solved in several languages."""

    slug = "humanevalx"
    display_name = "HumanEval-X"
    has_ground_truth = False
    has_tasks = False

    # Order in which per-language solutions appear in the detail payload.
    LANGUAGES = ["python", "cpp", "java", "go", "js"]

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """datasets_by_lang maps language name -> HF dataset split."""
        self._by_lang = datasets_by_lang
        primary = next(iter(self._by_lang))
        self._count = len(self._by_lang[primary])

    def problem_count(self) -> int:
        """Number of problems (taken from the first language split)."""
        return self._count

    def _identify(self, idx: int) -> tuple[str, str]:
        """Derive (numeric task id, entry-point name) from the primary split's row."""
        primary = next(iter(self._by_lang))
        record = self._by_lang[primary][idx]
        # Dataset task ids look like "<prefix>/<number>"; keep the number.
        task_num = record["task_id"].split("/")[-1]
        declaration = record.get("declaration", record.get("prompt", ""))
        entry = _extract_func_name(declaration) or f"problem_{task_num}"
        return task_num, entry

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Lightweight listing entry for problem *idx*."""
        task_num, entry = self._identify(idx)
        return {
            "idx": idx,
            "task_id": f"HumanEval/{task_num}",
            "entry_point": entry,
            "num_inputs": len(self._by_lang),
            "source": "HumanEval-X",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Full detail payload for problem *idx*, one solution per language."""
        task_num, entry = self._identify(idx)

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            # Full program is the prompt plus the canonical solution.
            full_code = lrow["prompt"] + lrow["canonical_solution"]
            lang_solutions.append(
                {
                    "language": lang,
                    "code": full_code,
                    "highlighted_code": _highlight_code(full_code, language=lang),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                }
            )

        # Default view: the Python solution when available, else the primary split.
        primary = next(iter(self._by_lang))
        py_row = self._by_lang.get("python", self._by_lang[primary])[idx]
        default_code = py_row["prompt"] + py_row["canonical_solution"]

        return {
            "idx": idx,
            "task_id": f"HumanEval/{task_num}",
            "entry_point": entry,
            "code": default_code,
            "highlighted_code": _highlight_code(default_code),
            "inputs": [],
            "outputs": [],
            "test": py_row.get("test", ""),
            "tasks": [],
            "source": "HumanEval-X",
            "has_ground_truth": False,
            "has_tasks": False,
            "lang_solutions": lang_solutions,
        }
| |
|