# Spaces: JetBrains-Research / ml4se-evals-visualization (Running)
# File size: 14,359 Bytes

"""Code editing benchmark adapters (SWE-bench, DebugBench, CanItEdit, CodeEditorBench)."""

from __future__ import annotations

import json
from typing import Any

from adapters import DatasetAdapter

# Helper callables shared by the adapters in this module.  They start out as
# None and, per the original note, are injected at runtime by _set_helpers()
# (not defined in this part of the file -- TODO confirm where it lives), so
# adapter methods that use them must only run after that injection.
#
# Called by the adapters below as _highlight_code(code) or
# _highlight_code(code, language=...).
_highlight_code = None
# Not referenced by the adapters visible in this section of the file.
_code_offset = None
# Not referenced by the adapters visible in this section of the file.
_extract_test_classes = None


# ---------------------------------------------------------------------------
# SWE-bench Lite adapter  (HuggingFace: princeton-nlp/SWE-bench_Lite)
# ---------------------------------------------------------------------------


class SWEBenchLiteAdapter(DatasetAdapter):
    """Adapter that serves SWE-bench Lite rows as problem summaries/details.

    The wrapped dataset is indexed by integer position and its length gives
    the problem count.  ``instance_id``, ``repo``, ``patch``,
    ``problem_statement``, ``FAIL_TO_PASS`` and ``PASS_TO_PASS`` are read
    with ``row[...]`` and must be present; every other field is optional and
    defaults to an empty string.
    """

    slug = "swebenchlite"
    display_name = "SWE-bench Lite"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Keep a reference to *hf_dataset*; rows are fetched per request."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    @staticmethod
    def _entry_point(instance_id: str) -> str:
        """Return the part of *instance_id* after the last ``"__"``."""
        return instance_id.split("__")[-1]

    @staticmethod
    def _parse_test_list(raw: Any) -> list[Any]:
        """Normalise a FAIL_TO_PASS / PASS_TO_PASS value.

        A list is returned unchanged, a falsy value (``None``, ``""``) yields
        an empty list, and anything else is decoded with ``json.loads`` (the
        value is expected to hold a JSON array).  Errors raised by
        ``json.loads`` for malformed input propagate to the caller.
        """
        if isinstance(raw, list):
            return raw
        return json.loads(raw) if raw else []

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for the row at position *idx*."""
        row = self._ds[idx]
        instance_id = row["instance_id"]
        return {
            "idx": idx,
            "task_id": instance_id,
            "entry_point": self._entry_point(instance_id),
            "num_inputs": 0,
            "source": row["repo"],
        }

    @staticmethod
    def _github_links(instance_id: str, repo: str, base_commit: str) -> dict[str, str]:
        """Build GitHub URLs from SWE-bench instance metadata.

        Returns a dict holding any of ``repo_url``, ``issue_url`` and
        ``commit_url``.  Nothing is emitted when *repo* is empty, because
        every URL is rooted at the repository.
        """
        links: dict[str, str] = {}
        if not repo:
            return links
        links["repo_url"] = f"https://github.com/{repo}"
        # instance_id format: "repo__issue-number" e.g. "astropy__astropy-12907"
        parts = instance_id.rsplit("-", 1)
        if len(parts) == 2 and parts[1].isdigit():
            links["issue_url"] = f"https://github.com/{repo}/issues/{parts[1]}"
        if base_commit:
            links["commit_url"] = f"https://github.com/{repo}/commit/{base_commit}"
        return links

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail record for the row at position *idx*.

        The gold patch is exposed under both ``code`` and ``patch``; no
        highlighted HTML is produced for it (``highlighted_code`` is empty).
        The GitHub link keys from ``_github_links`` are merged in last.
        """
        row = self._ds[idx]
        patch = row["patch"]
        instance_id = row["instance_id"]
        repo = row["repo"]
        base_commit = row.get("base_commit", "")
        return {
            "idx": idx,
            "task_id": instance_id,
            "entry_point": self._entry_point(instance_id),
            "code": patch,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": repo,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row["problem_statement"],
            "patch": patch,
            "test_patch": row.get("test_patch", ""),
            "fail_to_pass": self._parse_test_list(row["FAIL_TO_PASS"]),
            "pass_to_pass": self._parse_test_list(row["PASS_TO_PASS"]),
            "hints": row.get("hints_text", ""),
            "repo": repo,
            "base_commit": base_commit,
            "version": row.get("version", ""),
            "created_at": row.get("created_at", ""),
            **self._github_links(instance_id, repo, base_commit),
        }


# ---------------------------------------------------------------------------
# SWE-bench Verified adapter  (HuggingFace: princeton-nlp/SWE-bench_Verified)
# ---------------------------------------------------------------------------


class SWEBenchVerifiedAdapter(SWEBenchLiteAdapter):
    """SWE-bench Verified adapter.

    Inherits every method from SWEBenchLiteAdapter unchanged; only the slug
    and the display name are overridden.
    """

    slug = "swebenchverified"
    display_name = "SWE-bench Verified"


class SWEBenchFullAdapter(SWEBenchLiteAdapter):
    """SWE-bench adapter (displayed simply as "SWE-bench").

    Inherits every method from SWEBenchLiteAdapter unchanged; only the slug
    and the display name are overridden.
    """

    slug = "swebenchfull"
    display_name = "SWE-bench"


# ---------------------------------------------------------------------------
# DebugBench adapter  (HuggingFace: Rtian/DebugBench)
# ---------------------------------------------------------------------------


class DebugBenchAdapter(DatasetAdapter):
    """Adapter that serves DebugBench rows as buggy/fixed code pairs.

    Every row must provide ``slug``, ``language``, ``category``,
    ``examples`` and, for the detail view, ``buggy_code``, ``solution``,
    ``question``, ``subtype``, ``bug_explanation`` and ``level``; all of
    them are read with ``row[...]`` so a missing key raises ``KeyError``.
    """

    slug = "debugbench"
    display_name = "DebugBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Keep a reference to *hf_dataset*; rows are fetched per request."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for the row at position *idx*."""
        row = self._ds[idx]
        problem_slug = row["slug"]
        return {
            "idx": idx,
            "task_id": problem_slug,
            "entry_point": problem_slug,
            "num_inputs": len(row["examples"]),
            "source": f"{row['language']}/{row['category']}",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail record for the row at position *idx*.

        The reference solution is exposed as ``code`` / ``fixed_code`` and
        the buggy variant as ``buggy_code``, each with highlighted HTML.
        """
        row = self._ds[idx]
        problem_slug = row["slug"]
        lang = row["language"]
        category = row["category"]
        buggy = row["buggy_code"]
        fixed = row["solution"]
        # The fixed code is shown under two keys; highlight it only once.
        # Assumes _highlight_code is a pure function of its arguments.
        fixed_html = _highlight_code(fixed, language=lang)
        buggy_html = _highlight_code(buggy, language=lang)
        return {
            "idx": idx,
            "task_id": problem_slug,
            "entry_point": problem_slug,
            "code": fixed,
            "highlighted_code": fixed_html,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": f"{lang}/{category}",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row["question"],
            "language": lang,
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_html,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_html,
            "bug_category": category,
            "bug_subtype": row["subtype"],
            "bug_explanation": row["bug_explanation"],
            "difficulty": row["level"],
            # Copy so callers never receive the dataset's own sequence object.
            "examples": list(row["examples"]),
        }


# ---------------------------------------------------------------------------
# CanItEdit adapter  (HuggingFace: nuprl/CanItEdit)
# ---------------------------------------------------------------------------


class CanItEditAdapter(DatasetAdapter):
    """Adapter that serves CanItEdit rows as before/after code pairs.

    ``before`` and ``after`` are required for the detail view; all other
    fields are optional and fall back to the defaults used below.
    """

    slug = "canitedit"
    display_name = "CanItEdit"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Keep a reference to *hf_dataset*; rows are fetched per request."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    @staticmethod
    def _taxonomy(row) -> dict[str, Any]:
        """Return the row's ``taxonomy`` mapping, or ``{}`` if it is not a dict."""
        taxonomy = row.get("taxonomy", {})
        return taxonomy if isinstance(taxonomy, dict) else {}

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for the row at position *idx*."""
        row = self._ds[idx]
        change_kind = self._taxonomy(row).get("change_kind", "")
        return {
            "idx": idx,
            "task_id": row.get("full_name", str(row.get("id", idx))),
            "entry_point": row.get("name", f"edit_{idx}"),
            "num_inputs": 0,
            "source": change_kind or "CanItEdit",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail record for the row at position *idx*.

        The edited program is exposed as ``code`` / ``fixed_code`` and the
        original program as ``buggy_code``, each with highlighted HTML.
        """
        row = self._ds[idx]
        before = row["before"]
        after = row["after"]
        taxonomy = self._taxonomy(row)
        # The edited code is shown under two keys; highlight it only once.
        # Assumes _highlight_code is a pure function of its arguments.
        after_html = _highlight_code(after)
        before_html = _highlight_code(before)
        return {
            "idx": idx,
            "task_id": row.get("full_name", str(row.get("id", idx))),
            "entry_point": row.get("name", f"edit_{idx}"),
            "code": after,
            "highlighted_code": after_html,
            "inputs": [],
            "outputs": [],
            "test": row.get("tests", ""),
            "tasks": [],
            # Same fallback as get_problem_summary: an empty or missing
            # change_kind is reported as "CanItEdit" rather than "".
            "source": taxonomy.get("change_kind") or "CanItEdit",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row.get("instruction_descriptive", ""),
            "buggy_code": before,
            "buggy_highlighted_code": before_html,
            "fixed_code": after,
            "fixed_highlighted_code": after_html,
            "bug_category": taxonomy.get("change_kind", ""),
            "bug_subtype": taxonomy.get("topic", ""),
            "bug_explanation": row.get("instruction_lazy", ""),
        }


# ---------------------------------------------------------------------------
# CodeEditorBench adapter  (HuggingFace: m-a-p/CodeEditorBench)
# ---------------------------------------------------------------------------


class CodeEditorBenchAdapter(DatasetAdapter):
    """Adapter that serves CodeEditorBench rows as before/after code pairs.

    Unlike the other adapters it wraps a plain list of row dicts.  Each row
    is expected to carry a ``_task_type`` key that selects which fields hold
    the "before" and "after" code; every field is optional.
    """

    slug = "codeeditorbench"
    display_name = "CodeEditorBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, rows: list[dict[str, Any]]):
        """Keep a reference to *rows* (not copied)."""
        self._rows = rows

    def problem_count(self) -> int:
        """Return the number of rows."""
        return len(self._rows)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for the row at position *idx*."""
        row = self._rows[idx]
        return {
            "idx": idx,
            "task_id": str(row.get("idx", idx)),
            "entry_point": row.get("title", f"problem_{idx}"),
            "num_inputs": 0,
            "source": row.get("_task_type", "unknown"),
        }

    @staticmethod
    def _code_pair(row: dict[str, Any], task_type: str) -> tuple[Any, Any]:
        """Return ``(before, after)`` code for *row* according to *task_type*.

        * ``code_debug``: ``incorrect_solutions`` -> ``solutions``.
        * ``code_translate`` / ``code_polishment``: ``source_code`` ->
          ``solutions`` (falling back to ``source_code``).
        * anything else (code_switch): ``similar_source_code`` (falling back
          to ``source_code``) -> ``solutions`` (falling back to
          ``source_code``).
        """
        if task_type == "code_debug":
            return row.get("incorrect_solutions", ""), row.get("solutions", "")
        source_code = row.get("source_code", "")
        after = row.get("solutions", source_code)
        if task_type in ("code_translate", "code_polishment"):
            return source_code, after
        return row.get("similar_source_code", source_code), after

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail record for the row at position *idx*.

        Highlighted HTML is only produced for non-empty code; empty code
        yields an empty string instead of a call to the highlighter.
        """
        row = self._rows[idx]
        task_type = row.get("_task_type", "unknown")
        # Fall through empty/None values as well as missing keys, so a blank
        # code_language does not hide a usable source_lang.
        lang = row.get("code_language") or row.get("source_lang") or "python"
        lang_key = lang.lower()

        buggy, fixed = self._code_pair(row, task_type)
        # The "after" code is shown under two keys; highlight it only once.
        # Assumes _highlight_code is a pure function of its arguments.
        fixed_html = _highlight_code(fixed, language=lang_key) if fixed else ""
        buggy_html = _highlight_code(buggy, language=lang_key) if buggy else ""
        difficulty = row.get("difficulty", "")

        return {
            "idx": idx,
            "task_id": str(row.get("idx", idx)),
            "entry_point": row.get("title", f"problem_{idx}"),
            "code": fixed,
            "highlighted_code": fixed_html,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": task_type,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": "",
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_html,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_html,
            "bug_category": task_type,
            "bug_subtype": difficulty,
            "bug_explanation": "",
            "difficulty": difficulty,
            "language": lang,
        }


# ---------------------------------------------------------------------------
# CodeXGLUE Code Refinement adapter  (HuggingFace: google/code_x_glue_cc_code_refinement)
# ---------------------------------------------------------------------------


class CodeXGLUERefinementAdapter(DatasetAdapter):
    """Adapter that serves CodeXGLUE code-refinement rows.

    Each row is read through ``row.get`` for ``id``, ``buggy`` and
    ``fixed``; the code is always highlighted as Java.
    """

    slug = "codexgluerefinement"
    display_name = "CodeXGLUE Code Refinement"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Keep a reference to *hf_dataset*; rows are fetched per request."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for the row at position *idx*."""
        row = self._ds[idx]
        # Fall back to the positional index when the row has no "id".
        problem_id = row.get("id", idx)
        return {
            "idx": idx,
            "task_id": str(problem_id),
            "entry_point": f"refinement_{problem_id}",
            "num_inputs": 0,
            "source": "CodeXGLUE",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail record for the row at position *idx*.

        Highlighted HTML is only produced for non-empty code; empty code
        yields an empty string instead of a call to the highlighter.
        """
        row = self._ds[idx]
        problem_id = row.get("id", idx)
        buggy = row.get("buggy", "")
        fixed = row.get("fixed", "")
        # The fixed code is shown under two keys; highlight it only once.
        # Assumes _highlight_code is a pure function of its arguments.
        fixed_html = _highlight_code(fixed, language="java") if fixed else ""
        buggy_html = _highlight_code(buggy, language="java") if buggy else ""
        return {
            "idx": idx,
            "task_id": str(problem_id),
            "entry_point": f"refinement_{problem_id}",
            "code": fixed,
            "highlighted_code": fixed_html,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": "CodeXGLUE",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": "",
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_html,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_html,
            "bug_category": "Code Refinement",
            "bug_subtype": "",
            "bug_explanation": "",
            "language": "Java",
        }


# ---------------------------------------------------------------------------
# CommitBench adapter  (HuggingFace: Maxscha/commitbench)
# ---------------------------------------------------------------------------


class CommitBenchAdapter(DatasetAdapter):
    """Adapter that serves CommitBench rows (a diff plus its commit message)."""

    slug = "commitbench"
    display_name = "CommitBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Keep a reference to *hf_dataset*; rows are fetched per request."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for the row at position *idx*."""
        record = self._ds[idx]
        # The task id is the first 12 characters of the commit hash, or of
        # the stringified index when the row has no hash.
        short_hash = record.get("hash", str(idx))[:12]
        project = record.get("project", f"commit_{idx}")
        languages = record.get("diff_languages", "unknown")
        return dict(
            idx=idx,
            task_id=short_hash,
            entry_point=project,
            num_inputs=0,
            source=languages,
        )

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail record for the row at position *idx*.

        The diff is exposed under both ``code`` and ``patch`` and the commit
        message under ``description``; no highlighted HTML is produced.
        """
        record = self._ds[idx]
        commit_diff = record.get("diff", "")
        commit_message = record.get("message", "")

        # Fields shared by every adapter's detail record.
        detail: dict[str, Any] = {
            "idx": idx,
            "task_id": record.get("hash", str(idx))[:12],
            "entry_point": record.get("project", f"commit_{idx}"),
            "code": commit_diff,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": record.get("diff_languages", "unknown"),
            "has_ground_truth": False,
            "has_tasks": False,
            "description": commit_message,
        }
        # Commit-specific extras; note the defaults here are empty strings,
        # unlike the fallbacks used for task_id / entry_point / source above.
        detail["patch"] = commit_diff
        detail["repo"] = record.get("project", "")
        detail["commit_hash"] = record.get("hash", "")
        detail["diff_languages"] = record.get("diff_languages", "")
        return detail