# Add 28 benchmark datasets with rich visualization views
# (commit 9a8a9c5 by egor-bogomolov)
"""Code reasoning / completion benchmark adapters (CRUXEval, SAFIM, HumanEval-X)."""
from __future__ import annotations
import re
from typing import Any
from adapters import DatasetAdapter
# Injected at runtime by _set_helpers()
_highlight_code = None
_code_offset = None
_extract_test_classes = None
# ---------------------------------------------------------------------------
# CRUXEval adapter (HuggingFace: cruxeval-org/cruxeval)
# ---------------------------------------------------------------------------
class CRUXEvalAdapter(DatasetAdapter):
    """Adapter for the CRUXEval benchmark (HuggingFace: cruxeval-org/cruxeval).

    Each problem is a small function ``f`` plus one concrete input/output
    pair; the benchmark defines two derived tasks (predict the output from
    the input, and vice versa).
    """

    slug = "cruxeval"
    display_name = "CRUXEval"
    has_ground_truth = False
    has_tasks = True

    def __init__(self, hf_dataset):
        self._ds = hf_dataset

    def problem_count(self) -> int:
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        record = self._ds[idx]
        return {
            "idx": idx,
            "task_id": record["id"],
            "entry_point": "f",
            "num_inputs": 1,
            "source": "CRUXEval",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view, including both prediction tasks."""
        record = self._ds[idx]
        snippet = record["code"]
        sample_in = record["input"]
        sample_out = record["output"]
        # Both task directions share the same example pair; only the
        # given/predict orientation differs.
        task_specs = [
            ("Output Prediction", "Given the code and input, predict the output.", "input", "output"),
            ("Input Prediction", "Given the code and output, predict the input.", "output", "input"),
        ]
        tasks = [
            {
                "name": name,
                "description": description,
                "given": given,
                "predict": predict,
                "input": sample_in,
                "output": sample_out,
            }
            for name, description, given, predict in task_specs
        ]
        return {
            "idx": idx,
            "task_id": record["id"],
            "entry_point": "f",
            "code": snippet,
            "highlighted_code": _highlight_code(snippet),
            "inputs": [sample_in],
            "outputs": [sample_out],
            "test": None,
            "tasks": tasks,
            "source": "CRUXEval",
            "has_ground_truth": False,
            "has_tasks": True,
        }
# ---------------------------------------------------------------------------
# SAFIM adapter (HuggingFace: gonglinyuan/safim)
# ---------------------------------------------------------------------------
class SAFIMAdapter(DatasetAdapter):
    """Adapter for the SAFIM fill-in-the-middle benchmark
    (HuggingFace: gonglinyuan/safim).

    Each row carries a prompt with a "hole" marker plus the ground-truth
    completion; this adapter produces both a hole-annotated display view
    and a merged view with the ground truth highlighted.
    """

    slug = "safim"
    display_name = "SAFIM"
    has_ground_truth = False
    has_tasks = False

    # Patterns that mark where the completion should be inserted
    _HOLE_MARKERS = [
        "{{completion}}",
        "/* TODO: Your code here */",
        "// TODO: Your code here",
        "# TODO: Your code here",
    ]

    def __init__(self, hf_dataset):
        self._ds = hf_dataset

    def problem_count(self) -> int:
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        record = self._ds[idx]
        return {
            "idx": idx,
            "task_id": record.get("task_id", str(idx)),
            "entry_point": record.get("task_id", f"safim_{idx}"),
            "num_inputs": 0,
            "source": record.get("lang", "unknown"),
        }

    def _find_hole_marker(self, prompt: str) -> str | None:
        """Return the first matching hole marker found in the prompt, or None."""
        return next((marker for marker in self._HOLE_MARKERS if marker in prompt), None)

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view with hole, ground truth, and merged code."""
        record = self._ds[idx]
        prompt = record.get("prompt", "")
        ground_truth = record.get("ground_truth", "")
        lang = record.get("lang", "python")

        marker = self._find_hole_marker(prompt)
        if marker is None:
            # No recognized marker: treat the hole as appended at the end.
            display_code = prompt + "\n/* [HOLE] */\n"
            before_hole = prompt + "\n"
            merged_code = prompt + "\n" + ground_truth + "\n"
        else:
            display_code = prompt.replace(marker, "/* [HOLE] */")
            before_hole = prompt.split(marker)[0]
            merged_code = prompt.replace(marker, ground_truth)

        # 1-indexed line span the ground truth occupies inside merged_code.
        gt_start_line = before_hole.count("\n") + 1
        gt_line_count = ground_truth.count("\n") + (1 if ground_truth else 0)
        gt_end_line = gt_start_line + gt_line_count - 1

        # Map the dataset's language names onto pygments lexer keys.
        lang_key = {"Python": "python", "Java": "java", "C++": "cpp", "C#": "csharp"}.get(
            lang, lang.lower()
        )
        return {
            "idx": idx,
            "task_id": record.get("task_id", str(idx)),
            "entry_point": record.get("task_id", f"safim_{idx}"),
            "code": display_code,
            "highlighted_code": _highlight_code(display_code, language=lang_key),
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": lang,
            "has_ground_truth": False,
            "has_tasks": False,
            "fim_prefix": prompt,
            "fim_ground_truth": ground_truth,
            "fim_ground_truth_highlighted": _highlight_code(ground_truth, language=lang_key),
            "fim_merged_code": merged_code,
            "fim_merged_highlighted": _highlight_code(
                merged_code,
                highlight_lines=list(range(gt_start_line, gt_end_line + 1)),
                language=lang_key,
            ),
            "fim_gt_start_line": gt_start_line,
            "fim_gt_end_line": gt_end_line,
            "language": lang,
        }
# ---------------------------------------------------------------------------
# Shared helper: extract a function name from a declaration string
# ---------------------------------------------------------------------------
def _extract_func_name(declaration: str) -> str:
"""Extract the function/method name from a code declaration string."""
m = re.search(r"def\s+(\w+)\s*\(", declaration)
if m:
return m.group(1)
m = re.search(r"(\w+)\s*\(", declaration)
if m:
return m.group(1)
return ""
# ---------------------------------------------------------------------------
# HumanEvalPack adapter (HuggingFace: bigcode/humanevalpack)
# ---------------------------------------------------------------------------
class HumanEvalPackAdapter(DatasetAdapter):
    """Adapter for HumanEvalPack (HuggingFace: bigcode/humanevalpack).

    Wraps one HF dataset split per language; every split has the same
    number of problems and aligned indices. Each problem exposes the
    canonical and buggy solutions for every available language.
    """

    slug = "humanevalpack"
    display_name = "HumanEvalPack"
    has_ground_truth = False
    has_tasks = False

    LANGUAGES = ["python", "js", "cpp", "go", "java", "rust"]
    # Hoisted to class level: these were previously rebuilt on every
    # get_problem_detail() call.
    LANG_LABELS = {
        "python": "Python",
        "js": "JavaScript",
        "cpp": "C++",
        "go": "Go",
        "java": "Java",
        "rust": "Rust",
    }
    LANG_PYGMENTS = {
        "python": "python",
        "js": "javascript",
        "cpp": "cpp",
        "go": "go",
        "java": "java",
        "rust": "rust",
    }

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """datasets_by_lang maps language name -> HF dataset split."""
        self._by_lang = datasets_by_lang
        first_lang = next(iter(self._by_lang))
        # All splits are assumed aligned, so any split gives the count.
        self._count = len(self._by_lang[first_lang])

    def problem_count(self) -> int:
        return self._count

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]
        return {
            "idx": idx,
            "task_id": row["task_id"],
            "entry_point": row.get("entry_point", f"problem_{idx}"),
            "num_inputs": len(self._by_lang),
            "source": "HumanEvalPack",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view with per-language solutions."""
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            canonical = lrow.get("prompt", "") + lrow.get("canonical_solution", "")
            buggy = lrow.get("prompt", "") + lrow.get("buggy_solution", "")
            lang_key = self.LANG_PYGMENTS.get(lang, lang)
            lang_solutions.append(
                {
                    "language": lang,
                    "language_label": self.LANG_LABELS.get(lang, lang),
                    "code": canonical,
                    "highlighted_code": _highlight_code(canonical, language=lang_key),
                    "buggy_code": buggy,
                    "buggy_highlighted_code": _highlight_code(buggy, language=lang_key),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                    "bug_type": lrow.get("bug_type", ""),
                    "failure_symptoms": lrow.get("failure_symptoms", ""),
                }
            )

        # Fix: highlight the default view with the language it is actually
        # written in. Previously the highlighter's default lexer was used even
        # when the "python" split was absent and the fallback row was in
        # another language.
        default_lang = "python" if "python" in self._by_lang else first_lang
        py_row = self._by_lang[default_lang][idx]
        default_code = py_row.get("prompt", "") + py_row.get("canonical_solution", "")
        return {
            "idx": idx,
            "task_id": row["task_id"],
            "entry_point": row.get("entry_point", f"problem_{idx}"),
            "code": default_code,
            "highlighted_code": _highlight_code(
                default_code, language=self.LANG_PYGMENTS.get(default_lang, default_lang)
            ),
            "inputs": [],
            "outputs": [],
            "test": py_row.get("test", ""),
            "tasks": [],
            "source": "HumanEvalPack",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row.get("instruction", row.get("docstring", "")),
            "lang_solutions": lang_solutions,
            "bug_type": py_row.get("bug_type", ""),
            "failure_symptoms": py_row.get("failure_symptoms", ""),
        }
# ---------------------------------------------------------------------------
# HumanEval-X adapter (HuggingFace: THUDM/humaneval-x)
# ---------------------------------------------------------------------------
class HumanEvalXAdapter(DatasetAdapter):
    """Adapter for HumanEval-X (HuggingFace: THUDM/humaneval-x).

    Wraps one HF dataset split per language with aligned problem indices;
    each problem exposes the canonical solution for every available language.
    """

    slug = "humanevalx"
    display_name = "HumanEval-X"
    has_ground_truth = False
    has_tasks = False

    LANGUAGES = ["python", "cpp", "java", "go", "js"]

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """datasets_by_lang maps language name -> HF dataset split."""
        self._by_lang = datasets_by_lang
        first_lang = next(iter(self._by_lang))
        self._count = len(self._by_lang[first_lang])

    def problem_count(self) -> int:
        return self._count

    def _identify(self, idx: int) -> tuple[str, str]:
        """Return (bare task id, entry-point name) for problem *idx*.

        The entry point is parsed from the declaration (falling back to the
        prompt); when no function name can be found, a synthetic
        ``problem_<id>`` name is used.
        """
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]
        bare_id = row["task_id"].split("/")[-1]
        declaration = row.get("declaration", row.get("prompt", ""))
        entry = _extract_func_name(declaration) or f"problem_{bare_id}"
        return bare_id, entry

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        bare_id, entry = self._identify(idx)
        return {
            "idx": idx,
            "task_id": f"HumanEval/{bare_id}",
            "entry_point": entry,
            "num_inputs": len(self._by_lang),
            "source": "HumanEval-X",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view with per-language canonical solutions."""
        bare_id, entry = self._identify(idx)

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            solution = lrow["prompt"] + lrow["canonical_solution"]
            lang_solutions.append(
                {
                    "language": lang,
                    "code": solution,
                    "highlighted_code": _highlight_code(solution, language=lang),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                }
            )

        # Prefer the Python split for the default view; fall back to the
        # first available language.
        first_lang = next(iter(self._by_lang))
        py_row = self._by_lang.get("python", self._by_lang[first_lang])[idx]
        default_code = py_row["prompt"] + py_row["canonical_solution"]
        return {
            "idx": idx,
            "task_id": f"HumanEval/{bare_id}",
            "entry_point": entry,
            "code": default_code,
            "highlighted_code": _highlight_code(default_code),
            "inputs": [],
            "outputs": [],
            "test": py_row.get("test", ""),
            "tasks": [],
            "source": "HumanEval-X",
            "has_ground_truth": False,
            "has_tasks": False,
            "lang_solutions": lang_solutions,
        }