Spaces:

JetBrains-Research
/

ml4se-evals-visualization

Running

File size: 12,729 Bytes

9a8a9c5

"""Code reasoning / completion benchmark adapters (CRUXEval, SAFIM, HumanEval-X)."""

from __future__ import annotations

import re
from typing import Any

from adapters import DatasetAdapter

# Helper callables used by the adapters below. They are only placeholders
# here: _set_helpers() injects the real implementations at runtime, and each
# name stays None until that has happened.
_highlight_code = _code_offset = _extract_test_classes = None


# ---------------------------------------------------------------------------
# CRUXEval adapter  (HuggingFace: cruxeval-org/cruxeval)
# ---------------------------------------------------------------------------


class CRUXEvalAdapter(DatasetAdapter):
    """Adapter that serves CRUXEval rows as code-reasoning problems.

    Each row is read through its ``id``, ``code``, ``input`` and ``output``
    fields. The entry point is always reported as ``f`` and every problem
    carries exactly one input/output pair.
    """

    slug = "cruxeval"
    display_name = "CRUXEval"
    has_ground_truth = False
    has_tasks = True

    def __init__(self, hf_dataset):
        """Wrap ``hf_dataset``, which must support ``len()`` and integer indexing."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the compact listing entry for the problem at ``idx``."""
        task_id = self._ds[idx]["id"]
        summary: dict[str, Any] = {"idx": idx, "task_id": task_id}
        summary.update(entry_point="f", num_inputs=1, source="CRUXEval")
        return summary

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full payload for the problem at ``idx``.

        The payload holds the raw and highlighted code, the single
        input/output pair, and two task descriptions: predicting the output
        from the input, and predicting the input from the output.
        """
        record = self._ds[idx]
        source_code = record["code"]
        sample_input = record["input"]
        sample_output = record["output"]

        # (name, description, given side, predicted side) for each task.
        task_specs = (
            (
                "Output Prediction",
                "Given the code and input, predict the output.",
                "input",
                "output",
            ),
            (
                "Input Prediction",
                "Given the code and output, predict the input.",
                "output",
                "input",
            ),
        )
        # Both tasks carry the same input/output pair; only the direction differs.
        tasks = [
            {
                "name": name,
                "description": description,
                "given": given,
                "predict": predict,
                "input": sample_input,
                "output": sample_output,
            }
            for name, description, given, predict in task_specs
        ]

        detail: dict[str, Any] = {
            "idx": idx,
            "task_id": record["id"],
            "entry_point": "f",
            "code": source_code,
            "highlighted_code": _highlight_code(source_code),
            "inputs": [sample_input],
            "outputs": [sample_output],
            "test": None,
            "tasks": tasks,
            "source": "CRUXEval",
            "has_ground_truth": False,
            "has_tasks": True,
        }
        return detail


# ---------------------------------------------------------------------------
# SAFIM adapter  (HuggingFace: gonglinyuan/safim)
# ---------------------------------------------------------------------------


class SAFIMAdapter(DatasetAdapter):
    """Adapter that serves SAFIM fill-in-the-middle rows.

    Rows are read through ``.get`` for the ``task_id``, ``lang``, ``prompt``
    and ``ground_truth`` fields. The prompt is searched for one of the hole
    markers below; the ground truth is the text that belongs in that hole.
    """

    slug = "safim"
    display_name = "SAFIM"
    has_ground_truth = False
    has_tasks = False

    # Patterns that mark where the completion should be inserted, in lookup
    # order (the first entry found anywhere in the prompt wins).
    _HOLE_MARKERS = (
        "{{completion}}",
        "/* TODO: Your code here */",
        "// TODO: Your code here",
        "# TODO: Your code here",
    )

    # Text shown instead of the hole in the display version of the prompt.
    _HOLE_PLACEHOLDER = "/* [HOLE] */"

    def __init__(self, hf_dataset):
        """Wrap ``hf_dataset``, which must support ``len()`` and integer indexing."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the compact listing entry for the problem at ``idx``."""
        row = self._ds[idx]
        return {
            "idx": idx,
            "task_id": row.get("task_id", str(idx)),
            "entry_point": row.get("task_id", f"safim_{idx}"),
            "num_inputs": 0,
            "source": row.get("lang", "unknown"),
        }

    def _find_hole_marker(self, prompt: str) -> str | None:
        """Return the first matching hole marker found in the prompt, or None."""
        for marker in self._HOLE_MARKERS:
            if marker in prompt:
                return marker
        return None

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full payload for the problem at ``idx``.

        Besides the common fields, the payload carries ``fim_*`` entries: the
        raw prompt, the ground truth, the prompt with the ground truth merged
        into the hole, and the 1-indexed line range that the ground truth
        occupies in the merged code.
        """
        row = self._ds[idx]
        prompt = row.get("prompt", "")
        ground_truth = row.get("ground_truth", "")
        lang = row.get("lang", "python")

        marker = self._find_hole_marker(prompt)

        if marker is not None:
            # Split on the FIRST occurrence only, so the display code, the
            # merged code and the highlighted line range all describe the same
            # single hole even if the marker text appears again later.
            before_hole, _, after_hole = prompt.partition(marker)
            display_code = before_hole + self._HOLE_PLACEHOLDER + after_hole
            merged_code = before_hole + ground_truth + after_hole
        else:
            # No marker: treat the hole as sitting on its own line after the prompt.
            display_code = prompt + "\n" + self._HOLE_PLACEHOLDER + "\n"
            before_hole = prompt + "\n"
            merged_code = prompt + "\n" + ground_truth + "\n"

        # Compute 1-indexed line range of the inserted ground truth.
        gt_start_line = before_hole.count("\n") + 1
        # An empty ground truth spans zero lines, which makes the range empty.
        gt_line_count = ground_truth.count("\n") + (1 if ground_truth else 0)
        gt_end_line = gt_start_line + gt_line_count - 1

        # Map the dataset's language label to the name handed to the
        # highlighter; unlisted labels are lower-cased and passed through.
        lang_key = {"Python": "python", "Java": "java", "C++": "cpp", "C#": "csharp"}.get(
            lang, lang.lower()
        )

        return {
            "idx": idx,
            "task_id": row.get("task_id", str(idx)),
            "entry_point": row.get("task_id", f"safim_{idx}"),
            "code": display_code,
            "highlighted_code": _highlight_code(display_code, language=lang_key),
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": lang,
            "has_ground_truth": False,
            "has_tasks": False,
            "fim_prefix": prompt,
            "fim_ground_truth": ground_truth,
            "fim_ground_truth_highlighted": _highlight_code(ground_truth, language=lang_key),
            "fim_merged_code": merged_code,
            "fim_merged_highlighted": _highlight_code(
                merged_code,
                highlight_lines=list(range(gt_start_line, gt_end_line + 1)),
                language=lang_key,
            ),
            "fim_gt_start_line": gt_start_line,
            "fim_gt_end_line": gt_end_line,
            "language": lang,
        }


# ---------------------------------------------------------------------------
# Shared helper: function-name extraction (used by the HumanEval-X adapter)
# ---------------------------------------------------------------------------


def _extract_func_name(declaration: str) -> str:
    """Extract the function/method name from a code declaration string.

    Patterns are tried from most to least specific:

    1. A Python ``def name(`` header.
    2. A JavaScript-style binding such as ``const name = (args) =>`` or
       ``const name = function (args)``.
    3. The first identifier directly followed by ``(`` that is not a
       keyword (covers C++/Java/Go style signatures).

    Args:
        declaration: Source text containing the function signature.

    Returns:
        The extracted name, or ``""`` when no pattern matches.
    """
    # `\b` stops identifiers that merely end in "def" from acting as the keyword.
    m = re.search(r"\bdef\s+(\w+)\s*\(", declaration)
    if m:
        return m.group(1)

    # Arrow functions and function expressions have no `name(` sequence at
    # all, so the generic pattern below cannot find them.
    m = re.search(
        r"\b(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:function\b|\()",
        declaration,
    )
    if m:
        return m.group(1)

    # Keywords that can be followed by "(" without being a function name,
    # e.g. Go's `import (` block or a `func (recv) Name(` method receiver.
    non_names = {
        "if",
        "for",
        "while",
        "switch",
        "catch",
        "return",
        "sizeof",
        "import",
        "func",
        "function",
    }
    for m in re.finditer(r"(\w+)\s*\(", declaration):
        if m.group(1) not in non_names:
            return m.group(1)
    return ""


# ---------------------------------------------------------------------------
# HumanEvalPack adapter  (HuggingFace: bigcode/humanevalpack)
# ---------------------------------------------------------------------------


class HumanEvalPackAdapter(DatasetAdapter):
    """Adapter that serves HumanEvalPack problems across several languages.

    The adapter is built from a mapping of language key to dataset split.
    Problem ``idx`` is looked up at the same index in every split, and the
    first language of the mapping supplies the problem count and task id.
    """

    slug = "humanevalpack"
    display_name = "HumanEvalPack"
    has_ground_truth = False
    has_tasks = False

    LANGUAGES = ["python", "js", "cpp", "go", "java", "rust"]

    # Human-readable label for each language key.
    _LANG_LABELS = {
        "python": "Python",
        "js": "JavaScript",
        "cpp": "C++",
        "go": "Go",
        "java": "Java",
        "rust": "Rust",
    }

    # Name handed to the highlighter for each language key.
    _LANG_LEXERS = {
        "python": "python",
        "js": "javascript",
        "cpp": "cpp",
        "go": "go",
        "java": "java",
        "rust": "rust",
    }

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """Store the per-language splits.

        Args:
            datasets_by_lang: Maps language key -> dataset split.

        Raises:
            ValueError: If ``datasets_by_lang`` is empty.
        """
        if not datasets_by_lang:
            # Fail with a clear message instead of a bare StopIteration from next().
            raise ValueError("datasets_by_lang must contain at least one language")
        self._by_lang = datasets_by_lang
        first_lang = next(iter(self._by_lang))
        self._count = len(self._by_lang[first_lang])

    def problem_count(self) -> int:
        """Return the number of problems (length of the first language's split)."""
        return self._count

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the compact listing entry for the problem at ``idx``."""
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]
        return {
            "idx": idx,
            "task_id": row["task_id"],
            "entry_point": row.get("entry_point", f"problem_{idx}"),
            "num_inputs": len(self._by_lang),
            "source": "HumanEvalPack",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full payload for the problem at ``idx``.

        ``lang_solutions`` holds one entry per available language with the
        canonical and buggy variants (prompt + solution), both raw and
        highlighted. The top-level ``code`` uses the Python split when it is
        present and the first language of the mapping otherwise.
        """
        first_lang = next(iter(self._by_lang))
        row = self._by_lang[first_lang][idx]

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            prompt = lrow.get("prompt", "")
            canonical = prompt + lrow.get("canonical_solution", "")
            buggy = prompt + lrow.get("buggy_solution", "")
            lang_key = self._LANG_LEXERS.get(lang, lang)
            lang_solutions.append(
                {
                    "language": lang,
                    "language_label": self._LANG_LABELS.get(lang, lang),
                    "code": canonical,
                    "highlighted_code": _highlight_code(canonical, language=lang_key),
                    "buggy_code": buggy,
                    "buggy_highlighted_code": _highlight_code(buggy, language=lang_key),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                    "bug_type": lrow.get("bug_type", ""),
                    "failure_symptoms": lrow.get("failure_symptoms", ""),
                }
            )

        default_lang = "python" if "python" in self._by_lang else first_lang
        default_row = self._by_lang[default_lang][idx]
        default_code = default_row.get("prompt", "") + default_row.get("canonical_solution", "")
        if default_lang == "python":
            default_highlighted = _highlight_code(default_code)
        else:
            # The default code is not Python here, so name its language
            # explicitly rather than relying on the highlighter's default.
            default_highlighted = _highlight_code(
                default_code,
                language=self._LANG_LEXERS.get(default_lang, default_lang),
            )

        return {
            "idx": idx,
            "task_id": row["task_id"],
            "entry_point": row.get("entry_point", f"problem_{idx}"),
            "code": default_code,
            "highlighted_code": default_highlighted,
            "inputs": [],
            "outputs": [],
            "test": default_row.get("test", ""),
            "tasks": [],
            "source": "HumanEvalPack",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row.get("instruction", row.get("docstring", "")),
            "lang_solutions": lang_solutions,
            "bug_type": default_row.get("bug_type", ""),
            "failure_symptoms": default_row.get("failure_symptoms", ""),
        }


# ---------------------------------------------------------------------------
# HumanEval-X adapter  (HuggingFace: THUDM/humaneval-x)
# ---------------------------------------------------------------------------


class HumanEvalXAdapter(DatasetAdapter):
    """Adapter that serves HumanEval-X problems across several languages.

    The adapter is built from a mapping of language key to dataset split.
    Problem ``idx`` is looked up at the same index in every split, and the
    first language of the mapping supplies the problem count, task id and
    entry point.
    """

    slug = "humanevalx"
    display_name = "HumanEval-X"
    has_ground_truth = False
    has_tasks = False

    LANGUAGES = ["python", "cpp", "java", "go", "js"]

    # Language key -> name handed to the highlighter. Unlisted keys are passed
    # through unchanged. "js" is spelled out as "javascript", matching the
    # name the HumanEvalPack adapter passes for the same key.
    _LANG_LEXERS = {"js": "javascript"}

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """Store the per-language splits.

        Args:
            datasets_by_lang: Maps language name -> HF dataset split.

        Raises:
            ValueError: If ``datasets_by_lang`` is empty.
        """
        if not datasets_by_lang:
            # Fail with a clear message instead of a bare StopIteration from next().
            raise ValueError("datasets_by_lang must contain at least one language")
        self._by_lang = datasets_by_lang
        first_lang = next(iter(self._by_lang))
        self._count = len(self._by_lang[first_lang])

    def problem_count(self) -> int:
        """Return the number of problems (length of the first language's split)."""
        return self._count

    def _identify(self, row: Any) -> tuple[str, str]:
        """Return ``(task_id, entry_point)`` for a row.

        The task id is rebuilt as ``HumanEval/<last segment of row task_id>``.
        The entry point is parsed from the row's ``declaration`` (falling back
        to its ``prompt``), or ``problem_<segment>`` when no name is found.
        """
        number = row["task_id"].split("/")[-1]
        decl = row.get("declaration", row.get("prompt", ""))
        entry = _extract_func_name(decl) or f"problem_{number}"
        return f"HumanEval/{number}", entry

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the compact listing entry for the problem at ``idx``."""
        first_lang = next(iter(self._by_lang))
        task_id, entry = self._identify(self._by_lang[first_lang][idx])
        return {
            "idx": idx,
            "task_id": task_id,
            "entry_point": entry,
            "num_inputs": len(self._by_lang),
            "source": "HumanEval-X",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full payload for the problem at ``idx``.

        ``lang_solutions`` holds one entry per available language with the
        prompt + canonical solution, raw and highlighted. The top-level
        ``code`` uses the Python split when it is present and the first
        language of the mapping otherwise.
        """
        first_lang = next(iter(self._by_lang))
        task_id, entry = self._identify(self._by_lang[first_lang][idx])

        lang_solutions = []
        for lang in self.LANGUAGES:
            if lang not in self._by_lang:
                continue
            lrow = self._by_lang[lang][idx]
            code = lrow["prompt"] + lrow["canonical_solution"]
            lang_solutions.append(
                {
                    "language": lang,
                    "code": code,
                    "highlighted_code": _highlight_code(
                        code, language=self._LANG_LEXERS.get(lang, lang)
                    ),
                    "test": lrow.get("test", ""),
                    "example_test": lrow.get("example_test", ""),
                }
            )

        default_lang = "python" if "python" in self._by_lang else first_lang
        default_row = self._by_lang[default_lang][idx]
        default_code = default_row["prompt"] + default_row["canonical_solution"]
        if default_lang == "python":
            default_highlighted = _highlight_code(default_code)
        else:
            # The default code is not Python here, so name its language
            # explicitly rather than relying on the highlighter's default.
            default_highlighted = _highlight_code(
                default_code,
                language=self._LANG_LEXERS.get(default_lang, default_lang),
            )

        return {
            "idx": idx,
            "task_id": task_id,
            "entry_point": entry,
            "code": default_code,
            "highlighted_code": default_highlighted,
            "inputs": [],
            "outputs": [],
            "test": default_row.get("test", ""),
            "tasks": [],
            "source": "HumanEval-X",
            "has_ground_truth": False,
            "has_tasks": False,
            "lang_solutions": lang_solutions,
        }