# Add 28 benchmark datasets with rich visualization views
# (commit 9a8a9c5 by egor-bogomolov)
"""Code reasoning / completion benchmark adapters (CRUXEval, SAFIM, HumanEval-X)."""
from __future__ import annotations
import re
from typing import Any
from adapters import DatasetAdapter
# Injected at runtime by _set_helpers()
_highlight_code = None
_code_offset = None
_extract_test_classes = None
# ---------------------------------------------------------------------------
# CRUXEval adapter (HuggingFace: cruxeval-org/cruxeval)
# ---------------------------------------------------------------------------
class CRUXEvalAdapter(DatasetAdapter):
    """Adapter for the CRUXEval benchmark (HuggingFace: cruxeval-org/cruxeval).

    Each problem is a small function ``f`` plus one concrete input/output
    pair; the benchmark defines two derived tasks (predict the output from
    the input, and vice versa).
    """

    slug = "cruxeval"
    display_name = "CRUXEval"
    has_ground_truth = False
    has_tasks = True

    def __init__(self, hf_dataset):
        self._ds = hf_dataset

    def problem_count(self) -> int:
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        record = self._ds[idx]
        return {
            "idx": idx,
            "task_id": record["id"],
            "entry_point": "f",
            "num_inputs": 1,
            "source": "CRUXEval",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view, including both prediction tasks."""
        record = self._ds[idx]
        snippet = record["code"]
        sample_in = record["input"]
        sample_out = record["output"]
        # Both task directions share the same example pair; only the
        # given/predict orientation differs.
        task_specs = [
            ("Output Prediction", "Given the code and input, predict the output.", "input", "output"),
            ("Input Prediction", "Given the code and output, predict the input.", "output", "input"),
        ]
        tasks = [
            {
                "name": name,
                "description": description,
                "given": given,
                "predict": predict,
                "input": sample_in,
                "output": sample_out,
            }
            for name, description, given, predict in task_specs
        ]
        return {
            "idx": idx,
            "task_id": record["id"],
            "entry_point": "f",
            "code": snippet,
            "highlighted_code": _highlight_code(snippet),
            "inputs": [sample_in],
            "outputs": [sample_out],
            "test": None,
            "tasks": tasks,
            "source": "CRUXEval",
            "has_ground_truth": False,
            "has_tasks": True,
        }
# ---------------------------------------------------------------------------
# SAFIM adapter (HuggingFace: gonglinyuan/safim)
# ---------------------------------------------------------------------------
class SAFIMAdapter(DatasetAdapter):
    """Adapter for the SAFIM fill-in-the-middle benchmark
    (HuggingFace: gonglinyuan/safim).

    Each row carries a prompt with a "hole" marker plus the ground-truth
    completion; this adapter produces both a hole-annotated display view
    and a merged view with the ground truth highlighted.
    """

    slug = "safim"
    display_name = "SAFIM"
    has_ground_truth = False
    has_tasks = False

    # Patterns that mark where the completion should be inserted
    _HOLE_MARKERS = [
        "{{completion}}",
        "/* TODO: Your code here */",
        "// TODO: Your code here",
        "# TODO: Your code here",
    ]

    def __init__(self, hf_dataset):
        self._ds = hf_dataset

    def problem_count(self) -> int:
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        record = self._ds[idx]
        return {
            "idx": idx,
            "task_id": record.get("task_id", str(idx)),
            "entry_point": record.get("task_id", f"safim_{idx}"),
            "num_inputs": 0,
            "source": record.get("lang", "unknown"),
        }

    def _find_hole_marker(self, prompt: str) -> str | None:
        """Return the first matching hole marker found in the prompt, or None."""
        return next((marker for marker in self._HOLE_MARKERS if marker in prompt), None)

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view with hole, ground truth, and merged code."""
        record = self._ds[idx]
        prompt = record.get("prompt", "")
        ground_truth = record.get("ground_truth", "")
        lang = record.get("lang", "python")

        marker = self._find_hole_marker(prompt)
        if marker is None:
            # No recognized marker: treat the hole as appended at the end.
            display_code = prompt + "\n/* [HOLE] */\n"
            before_hole = prompt + "\n"
            merged_code = prompt + "\n" + ground_truth + "\n"
        else:
            display_code = prompt.replace(marker, "/* [HOLE] */")
            before_hole = prompt.split(marker)[0]
            merged_code = prompt.replace(marker, ground_truth)

        # 1-indexed line span the ground truth occupies inside merged_code.
        gt_start_line = before_hole.count("\n") + 1
        gt_line_count = ground_truth.count("\n") + (1 if ground_truth else 0)
        gt_end_line = gt_start_line + gt_line_count - 1

        # Map the dataset's language names onto pygments lexer keys.
        lang_key = {"Python": "python", "Java": "java", "C++": "cpp", "C#": "csharp"}.get(
            lang, lang.lower()
        )
        return {
            "idx": idx,
            "task_id": record.get("task_id", str(idx)),
            "entry_point": record.get("task_id", f"safim_{idx}"),
            "code": display_code,
            "highlighted_code": _highlight_code(display_code, language=lang_key),
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": lang,
            "has_ground_truth": False,
            "has_tasks": False,
            "fim_prefix": prompt,
            "fim_ground_truth": ground_truth,
            "fim_ground_truth_highlighted": _highlight_code(ground_truth, language=lang_key),
            "fim_merged_code": merged_code,
            "fim_merged_highlighted": _highlight_code(
                merged_code,
                highlight_lines=list(range(gt_start_line, gt_end_line + 1)),
                language=lang_key,
            ),
            "fim_gt_start_line": gt_start_line,
            "fim_gt_end_line": gt_end_line,
            "language": lang,
        }
# ---------------------------------------------------------------------------
# Shared helper: extract a function name from a declaration string
# ---------------------------------------------------------------------------
def _extract_func_name(declaration: str) -> str:
"""Extract the function/method name from a code declaration string."""
m = re.search(r"def\s+(\w+)\s*\(", declaration)
if m:
return m.group(1)
m = re.search(r"(\w+)\s*\(", declaration)
if m:
return m.group(1)
return ""
# ---------------------------------------------------------------------------
# HumanEvalPack adapter (HuggingFace: bigcode/humanevalpack)
# ---------------------------------------------------------------------------
class HumanEvalPackAdapter(DatasetAdapter):
    """Adapter for HumanEvalPack (HuggingFace: bigcode/humanevalpack).

    Wraps one HF dataset split per language; every split has the same
    number of problems and aligned indices. Each problem exposes the
    canonical and buggy solutions for every available language.
    """

    slug = "humanevalpack"
    display_name = "HumanEvalPack"
    has_ground_truth = False
    has_tasks = False

    LANGUAGES = ["python", "js", "cpp", "go", "java", "rust"]
    # Hoisted to class level: these were previously rebuilt on every
    # get_problem_detail() call.
    LANG_LABELS = {
        "python": "Python",
        "js": "JavaScript",
        "cpp": "C++",
        "go": "Go",
        "java": "Java",
        "rust": "Rust",
    }
    LANG_PYGMENTS = {
        "python": "python",
        "js": "javascript",
        "cpp": "cpp",
        "go": "go",
        "java": "java",
        "rust": "rust",
    }

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """datasets_by_lang maps language name -> HF dataset split."""
        self._by_lang = datasets_by_lang
        first_lang = next(iter(self._by_lang))
        # All splits are assumed aligned, so any split gives the count.
        self._count = len(self._by_lang[first_lang])

    def problem_count(self) -> int:
        return self._count

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]
        return {
            "idx": idx,
            "task_id": row["task_id"],
            "entry_point": row.get("entry_point", f"problem_{idx}"),
            "num_inputs": len(self._by_lang),
            "source": "HumanEvalPack",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view with per-language solutions."""
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            canonical = lrow.get("prompt", "") + lrow.get("canonical_solution", "")
            buggy = lrow.get("prompt", "") + lrow.get("buggy_solution", "")
            lang_key = self.LANG_PYGMENTS.get(lang, lang)
            lang_solutions.append(
                {
                    "language": lang,
                    "language_label": self.LANG_LABELS.get(lang, lang),
                    "code": canonical,
                    "highlighted_code": _highlight_code(canonical, language=lang_key),
                    "buggy_code": buggy,
                    "buggy_highlighted_code": _highlight_code(buggy, language=lang_key),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                    "bug_type": lrow.get("bug_type", ""),
                    "failure_symptoms": lrow.get("failure_symptoms", ""),
                }
            )

        # Fix: highlight the default view with the language it is actually
        # written in. Previously the highlighter's default lexer was used even
        # when the "python" split was absent and the fallback row was in
        # another language.
        default_lang = "python" if "python" in self._by_lang else first_lang
        py_row = self._by_lang[default_lang][idx]
        default_code = py_row.get("prompt", "") + py_row.get("canonical_solution", "")
        return {
            "idx": idx,
            "task_id": row["task_id"],
            "entry_point": row.get("entry_point", f"problem_{idx}"),
            "code": default_code,
            "highlighted_code": _highlight_code(
                default_code, language=self.LANG_PYGMENTS.get(default_lang, default_lang)
            ),
            "inputs": [],
            "outputs": [],
            "test": py_row.get("test", ""),
            "tasks": [],
            "source": "HumanEvalPack",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row.get("instruction", row.get("docstring", "")),
            "lang_solutions": lang_solutions,
            "bug_type": py_row.get("bug_type", ""),
            "failure_symptoms": py_row.get("failure_symptoms", ""),
        }
# ---------------------------------------------------------------------------
# HumanEval-X adapter (HuggingFace: THUDM/humaneval-x)
# ---------------------------------------------------------------------------
class HumanEvalXAdapter(DatasetAdapter):
    """Adapter for HumanEval-X (HuggingFace: THUDM/humaneval-x).

    Wraps one HF dataset split per language with aligned problem indices;
    each problem exposes the canonical solution for every available language.
    """

    slug = "humanevalx"
    display_name = "HumanEval-X"
    has_ground_truth = False
    has_tasks = False

    LANGUAGES = ["python", "cpp", "java", "go", "js"]

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """datasets_by_lang maps language name -> HF dataset split."""
        self._by_lang = datasets_by_lang
        first_lang = next(iter(self._by_lang))
        self._count = len(self._by_lang[first_lang])

    def problem_count(self) -> int:
        return self._count

    def _identify(self, idx: int) -> tuple[str, str]:
        """Return (bare task id, entry-point name) for problem *idx*.

        The entry point is parsed from the declaration (falling back to the
        prompt); when no function name can be found, a synthetic
        ``problem_<id>`` name is used.
        """
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]
        bare_id = row["task_id"].split("/")[-1]
        declaration = row.get("declaration", row.get("prompt", ""))
        entry = _extract_func_name(declaration) or f"problem_{bare_id}"
        return bare_id, entry

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the lightweight listing entry for problem *idx*."""
        bare_id, entry = self._identify(idx)
        return {
            "idx": idx,
            "task_id": f"HumanEval/{bare_id}",
            "entry_point": entry,
            "num_inputs": len(self._by_lang),
            "source": "HumanEval-X",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full problem view with per-language canonical solutions."""
        bare_id, entry = self._identify(idx)

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            solution = lrow["prompt"] + lrow["canonical_solution"]
            lang_solutions.append(
                {
                    "language": lang,
                    "code": solution,
                    "highlighted_code": _highlight_code(solution, language=lang),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                }
            )

        # Prefer the Python split for the default view; fall back to the
        # first available language.
        first_lang = next(iter(self._by_lang))
        py_row = self._by_lang.get("python", self._by_lang[first_lang])[idx]
        default_code = py_row["prompt"] + py_row["canonical_solution"]
        return {
            "idx": idx,
            "task_id": f"HumanEval/{bare_id}",
            "entry_point": entry,
            "code": default_code,
            "highlighted_code": _highlight_code(default_code),
            "inputs": [],
            "outputs": [],
            "test": py_row.get("test", ""),
            "tasks": [],
            "source": "HumanEval-X",
            "has_ground_truth": False,
            "has_tasks": False,
            "lang_solutions": lang_solutions,
        }