Spaces:

JetBrains-Research
/

ml4se-evals-visualization

Running

File size: 19,988 Bytes

9f85fac

"""Additional benchmark adapters (dpaia EE-Dataset, Multi-SWE-bench, SWE-bench
Multilingual, CrossCodeEval, RepoBench, McEval, MultiPL-E, Defects4J)."""

from __future__ import annotations

import json
from typing import Any

from adapters import DatasetAdapter
from adapters.code_editing import SWEBenchLiteAdapter

# Module-level helper hooks. They start out as ``None`` placeholders and are
# injected at runtime by _set_helpers() (that function is not defined in this
# section of the file — presumably it lives elsewhere in the project; TODO
# confirm). The adapters below call _highlight_code without checking for
# ``None``, so their detail methods rely on the injection having happened.
_highlight_code = None  # called below as _highlight_code(code, language=..., highlight_lines=...)
_code_offset = None  # not referenced by the adapters in this section
_extract_test_classes = None  # not referenced by the adapters in this section


# ---------------------------------------------------------------------------
# dpaia Enterprise Evaluation Dataset
# (GitHub: dpaia/ee-dataset — SWE-bench-style format for Java/Spring)
# ---------------------------------------------------------------------------


class DPAIAEEDatasetAdapter(DatasetAdapter):
    """Adapter for dpaia EE-Dataset rows (SWE-bench-style dicts).

    Every key is read with ``dict.get`` and is therefore optional:
    ``instance_id``, ``repo``, ``tags``, ``patch``, ``test_patch``,
    ``FAIL_TO_PASS``, ``PASS_TO_PASS``, ``problem_statement`` and
    ``base_commit``.
    """

    slug = "dpaia-ee"
    display_name = "DPAIA EE-Dataset"
    has_ground_truth = False
    has_tasks = False

    # At most this many tags are joined into the ``source`` label.
    _MAX_SOURCE_TAGS = 3
    # Label used when a row carries no usable tags.
    _DEFAULT_SOURCE = "DPAIA"

    def __init__(self, rows: list[dict[str, Any]]):
        """Wrap an already-loaded list of row dicts (kept by reference)."""
        self._rows = rows

    def problem_count(self) -> int:
        """Return the number of rows."""
        return len(self._rows)

    @classmethod
    def _source_label(cls, row: dict[str, Any]) -> str:
        """Build the ``source`` label shared by the summary and detail views.

        A list of tags is truncated to ``_MAX_SOURCE_TAGS`` entries and joined
        with ``", "``; any other truthy value is stringified. Missing, ``None``
        or empty tags fall back to ``_DEFAULT_SOURCE`` so both views always
        show the same non-empty label.
        """
        tags = row.get("tags")
        if isinstance(tags, list):
            # str() keeps the join from failing on non-string tag values.
            label = ", ".join(str(tag) for tag in tags[: cls._MAX_SOURCE_TAGS])
        else:
            label = str(tags) if tags else ""
        return label or cls._DEFAULT_SOURCE

    @staticmethod
    def _entry_point(row: dict[str, Any], idx: int) -> str:
        """Return the row's ``repo``, or ``dpaia_<idx>`` when it is missing/empty."""
        return row.get("repo") or f"dpaia_{idx}"

    @staticmethod
    def _as_test_list(value: Any) -> list[Any]:
        """Normalise a FAIL_TO_PASS / PASS_TO_PASS field to a list.

        The field may already be a list, or a string holding a JSON-encoded
        list. A string that is not valid JSON is kept as a single entry.
        ``None`` and blank strings yield an empty list; any other non-list
        value (including a decoded JSON scalar or object) is wrapped in a
        one-element list so callers can always iterate the result.
        """
        if isinstance(value, str):
            text = value.strip()
            if not text:
                return []
            try:
                value = json.loads(text)
            except (json.JSONDecodeError, TypeError):
                return [value]
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the list-view entry for row *idx*.

        Raises:
            IndexError: if *idx* is outside the row list.
        """
        row = self._rows[idx]
        return {
            "idx": idx,
            "task_id": row.get("instance_id", str(idx)),
            "entry_point": self._entry_point(row, idx),
            "num_inputs": 0,
            "source": self._source_label(row),
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row *idx*.

        The gold patch is exposed both as ``code`` and as ``patch``; no
        highlighted HTML is produced for it.

        Raises:
            IndexError: if *idx* is outside the row list.
        """
        row = self._rows[idx]
        patch = row.get("patch", "")

        return {
            "idx": idx,
            "task_id": row.get("instance_id", str(idx)),
            "entry_point": self._entry_point(row, idx),
            "code": patch,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": self._source_label(row),
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row.get("problem_statement", ""),
            "patch": patch,
            "test_patch": row.get("test_patch", ""),
            "fail_to_pass": self._as_test_list(row.get("FAIL_TO_PASS")),
            "pass_to_pass": self._as_test_list(row.get("PASS_TO_PASS")),
            "repo": row.get("repo") or "",
            "base_commit": row.get("base_commit", ""),
        }


# ---------------------------------------------------------------------------
# Multi-SWE-bench  (HuggingFace: ByteDance-Seed/Multi-SWE-bench)
# Multilingual SWE-bench spanning 7 languages
# ---------------------------------------------------------------------------


class MultiSWEBenchAdapter(DatasetAdapter):
    """Adapter for Multi-SWE-bench rows.

    Rows are plain dicts. The keys read here are ``instance_id``, ``org``,
    ``repo``, ``_language``, ``number``, ``title``, ``body``, ``fix_patch``,
    ``test_patch`` and ``hints``; each is looked up with ``dict.get``.
    """

    slug = "multiswebench"
    display_name = "Multi-SWE-bench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, rows: list[dict[str, Any]]):
        self._rows = rows

    def problem_count(self) -> int:
        return len(self._rows)

    @staticmethod
    def _qualified_repo(record: dict[str, Any]) -> str:
        """Return ``org/repo`` when both parts are set, else the bare ``repo`` value."""
        owner = record.get("org", "")
        name = record.get("repo", "")
        if owner and name:
            return f"{owner}/{name}"
        return name

    @staticmethod
    def _short_name(instance_id: Any, idx: int) -> str:
        """Return the text after the last ``__`` of *instance_id*, or ``mswe_<idx>``."""
        if not instance_id:
            return f"mswe_{idx}"
        return instance_id.split("__")[-1]

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the list-view entry for row *idx*."""
        record = self._rows[idx]
        instance_id = record.get("instance_id", str(idx))
        repo_slug = self._qualified_repo(record)
        # Only used when the row has no "_language" key at all.
        source_fallback = repo_slug or "unknown"
        return {
            "idx": idx,
            "task_id": instance_id,
            "entry_point": self._short_name(instance_id, idx),
            "num_inputs": 0,
            "source": record.get("_language", source_fallback),
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row *idx*.

        ``repo_url`` is added when a repository name is known, and
        ``issue_url`` (a ``/pull/<number>`` link) when ``number`` is set too.
        """
        record = self._rows[idx]
        fix_patch = record.get("fix_patch", "")
        instance_id = record.get("instance_id", str(idx))
        repo_slug = self._qualified_repo(record)
        language = record.get("_language", "")
        pr_number = record.get("number", "")

        # The description is the title, followed by the body when there is one.
        title = record.get("title", "")
        body = record.get("body", "")
        description = title + "\n\n" + body if body else title

        detail: dict[str, Any] = {
            "idx": idx,
            "task_id": instance_id,
            "entry_point": self._short_name(instance_id, idx),
            "code": fix_patch,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": language or repo_slug,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": description,
            "patch": fix_patch,
            "test_patch": record.get("test_patch", ""),
            "fail_to_pass": [],
            "pass_to_pass": [],
            "repo": repo_slug,
            "hints": record.get("hints", ""),
        }

        # Link keys go last, and only when they can be built.
        if repo_slug:
            detail["repo_url"] = f"https://github.com/{repo_slug}"
            if pr_number:
                detail["issue_url"] = f"https://github.com/{repo_slug}/pull/{pr_number}"
        return detail


# ---------------------------------------------------------------------------
# SWE-bench Multilingual  (HuggingFace: SWE-bench/SWE-bench_Multilingual)
# 300 tasks across 42 repos in multiple languages
# ---------------------------------------------------------------------------


class SWEBenchMultilingualAdapter(SWEBenchLiteAdapter):
    """SWE-bench Multilingual adapter.

    Inherits everything from ``SWEBenchLiteAdapter`` and overrides only the
    two identifying class attributes below.
    """

    slug = "swebenchmultilingual"
    display_name = "SWE-bench Multilingual"


# ---------------------------------------------------------------------------
# CrossCodeEval  (HuggingFace: Vincentvmt/CrossCodeEval or amazon-science/cceval)
# Cross-file code completion in 4 languages
# ---------------------------------------------------------------------------


class CrossCodeEvalAdapter(DatasetAdapter):
    """Adapter that renders CrossCodeEval rows as fill-in-the-middle problems.

    Keys read from each row: ``prompt`` (text before the hole),
    ``groundtruth`` (reference completion), ``right_context`` (text after the
    hole), ``language`` and the nested ``metadata["task_id"]``.
    """

    slug = "crosscodeeval"
    display_name = "CrossCodeEval"
    has_ground_truth = False
    has_tasks = False

    # Placeholder shown between prefix and suffix in the plain "code" view.
    _HOLE_MARKER = "\n/* [HOLE] */\n"

    def __init__(self, rows: list[dict[str, Any]]):
        """Wrap an already-loaded list of row dicts (kept by reference)."""
        self._rows = rows

    def problem_count(self) -> int:
        """Return the number of rows."""
        return len(self._rows)

    @staticmethod
    def _get_metadata(row: dict, key: str, default: str = "") -> str:
        """Return ``row["metadata"][key]``, or *default* when it is unavailable.

        *default* is returned when ``metadata`` is missing or not a dict, when
        *key* is absent, and when the stored value is ``None``.
        """
        meta = row.get("metadata")
        if not isinstance(meta, dict):
            return default
        value = meta.get(key)
        return default if value is None else value

    @staticmethod
    def _entry_point(task_id: Any, idx: int) -> str:
        """Return the part of *task_id* after its last ``/``, or ``cceval_<idx>``."""
        if not task_id:
            return f"cceval_{idx}"
        return str(task_id).rsplit("/", 1)[-1]

    @staticmethod
    def _touched_lines(text: str) -> int:
        """Count the lines *text* occupies when spliced into a document.

        A trailing newline only terminates the last occupied line; it does not
        start another one, so ``"x\\n"`` occupies one line, not two. The empty
        string occupies none.
        """
        if not text:
            return 0
        return text.count("\n") + (0 if text.endswith("\n") else 1)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the list-view entry for row *idx*.

        Raises:
            IndexError: if *idx* is outside the row list.
        """
        row = self._rows[idx]
        task_id = self._get_metadata(row, "task_id", str(idx))
        return {
            "idx": idx,
            "task_id": task_id,
            "entry_point": self._entry_point(task_id, idx),
            "num_inputs": 0,
            "source": row.get("language") or "unknown",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the fill-in-the-middle detail payload for row *idx*.

        ``fim_gt_start_line`` / ``fim_gt_end_line`` are 1-based line numbers
        in ``fim_merged_code`` spanned by the reference completion. With an
        empty reference the end is one less than the start, so no line is
        highlighted.

        Raises:
            IndexError: if *idx* is outside the row list.
        """
        row = self._rows[idx]
        # ``or`` also covers keys that are present but hold None / "".
        prompt = row.get("prompt") or ""
        reference = row.get("groundtruth") or ""
        right_context = row.get("right_context") or ""
        lang = row.get("language") or "python"
        lang_key = lang.lower()

        task_id = self._get_metadata(row, "task_id", str(idx))

        # Two views: the prompt with a visible hole, and the reference filled in.
        display_code = prompt + self._HOLE_MARKER + right_context
        merged_code = prompt + reference + right_context

        # The reference continues on the prompt's last line.
        gt_start_line = prompt.count("\n") + 1
        gt_end_line = gt_start_line + self._touched_lines(reference) - 1

        return {
            "idx": idx,
            "task_id": task_id,
            "entry_point": self._entry_point(task_id, idx),
            "code": display_code,
            "highlighted_code": _highlight_code(display_code, language=lang_key),
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": lang,
            "has_ground_truth": False,
            "has_tasks": False,
            "fim_prefix": prompt,
            "fim_ground_truth": reference,
            "fim_ground_truth_highlighted": _highlight_code(reference, language=lang_key)
            if reference
            else "",
            "fim_merged_code": merged_code,
            "fim_merged_highlighted": _highlight_code(
                merged_code,
                highlight_lines=list(range(gt_start_line, gt_end_line + 1)),
                language=lang_key,
            )
            if merged_code
            else "",
            "fim_gt_start_line": gt_start_line,
            "fim_gt_end_line": gt_end_line,
            "language": lang,
        }


# ---------------------------------------------------------------------------
# RepoBench  (HuggingFace: tianyang/repobench-p)
# Repository-level code completion across Python and Java
# ---------------------------------------------------------------------------


class RepoBenchAdapter(DatasetAdapter):
    """Adapter that renders RepoBench rows as next-line completion problems.

    Keys read from each row: ``repo_name``, ``file_path``, ``language``,
    ``_setting``, the context (``all_code``, falling back to ``context``) and
    the target (``next_line``, falling back to ``gold_snippet_code``).
    """

    slug = "repobench"
    display_name = "RepoBench"
    has_ground_truth = False
    has_tasks = False

    # Placeholder appended to the context in the plain "code" view.
    _HOLE_MARKER = "\n/* [HOLE] */\n"

    def __init__(self, rows: list[dict[str, Any]]):
        """Wrap an already-loaded list of row dicts (kept by reference)."""
        self._rows = rows

    def problem_count(self) -> int:
        """Return the number of rows."""
        return len(self._rows)

    @staticmethod
    def _task_id(row: dict[str, Any], idx: int) -> str:
        """Return the row's ``repo_name`` as a string, or ``str(idx)`` if absent."""
        return str(row.get("repo_name", idx))

    @staticmethod
    def _entry_point(row: dict[str, Any], idx: int) -> str:
        """Return the last component of ``file_path``, or ``repobench_<idx>``.

        The fallback also applies when ``file_path`` is present but ``None``
        or empty.
        """
        path = row.get("file_path") or f"repobench_{idx}"
        return path.rsplit("/", 1)[-1]

    @staticmethod
    def _touched_lines(text: str) -> int:
        """Count the lines *text* occupies when it starts on a fresh line.

        A trailing newline only terminates the last occupied line; it does not
        start another one. The empty string occupies no lines.
        """
        if not text:
            return 0
        return text.count("\n") + (0 if text.endswith("\n") else 1)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the list-view entry for row *idx*.

        Raises:
            IndexError: if *idx* is outside the row list.
        """
        row = self._rows[idx]
        return {
            "idx": idx,
            "task_id": self._task_id(row, idx),
            "entry_point": self._entry_point(row, idx),
            "num_inputs": 0,
            "source": row.get("language", row.get("_setting", "unknown")),
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the completion detail payload for row *idx*.

        ``fim_gt_start_line`` / ``fim_gt_end_line`` are 1-based line numbers
        in ``fim_merged_code`` spanned by the target. With an empty target the
        end is one less than the start, so no line is highlighted.

        Raises:
            IndexError: if *idx* is outside the row list.
        """
        row = self._rows[idx]
        # NOTE(review): both values are assumed to be strings — confirm that
        # the loader never passes a list of snippets through "context".
        context = row.get("all_code", row.get("context", "")) or ""
        next_line = row.get("next_line", row.get("gold_snippet_code", "")) or ""
        lang = row.get("language") or "python"
        lang_key = lang.lower()

        display_code = context + self._HOLE_MARKER if context else ""

        if context and next_line:
            merged_code = context + "\n" + next_line
        elif context:
            merged_code = context
        else:
            # Without context the target is the whole merged view. This keeps
            # the view consistent with the line numbers below, which put the
            # target at line 1.
            merged_code = next_line

        # The target goes on its own line after the context, hence "+ 2".
        gt_start_line = context.count("\n") + 2 if context else 1
        gt_end_line = gt_start_line + self._touched_lines(next_line) - 1

        return {
            "idx": idx,
            "task_id": self._task_id(row, idx),
            "entry_point": self._entry_point(row, idx),
            "code": display_code,
            "highlighted_code": _highlight_code(display_code, language=lang_key)
            if display_code
            else "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": row.get("_setting", lang),
            "has_ground_truth": False,
            "has_tasks": False,
            "fim_prefix": context,
            "fim_ground_truth": next_line,
            "fim_ground_truth_highlighted": _highlight_code(next_line, language=lang_key)
            if next_line
            else "",
            "fim_merged_code": merged_code,
            "fim_merged_highlighted": _highlight_code(
                merged_code,
                highlight_lines=list(range(gt_start_line, gt_end_line + 1)),
                language=lang_key,
            )
            if merged_code
            else "",
            "fim_gt_start_line": gt_start_line,
            "fim_gt_end_line": gt_end_line,
            "language": lang,
        }


# ---------------------------------------------------------------------------
# McEval  (HuggingFace: Multilingual-Multimodal-NLP/McEval)
# Massively multilingual code evaluation — 40 languages, 16K samples
# ---------------------------------------------------------------------------


class McEvalAdapter(DatasetAdapter):
    """Adapter for McEval rows.

    The wrapped dataset only needs to support ``len()`` and integer indexing
    that yields a mapping with ``.get``. Keys read: ``task_id``,
    ``entry_point``, ``language``, ``prompt``, ``canonical_solution``, ``test``.
    """

    slug = "mceval"
    display_name = "McEval"
    has_ground_truth = False
    has_tasks = False

    # Map some known (lower-cased) language names to Pygments lexer names;
    # any name not listed here is passed through unchanged.
    _LEXER_ALIASES = {
        "c++": "cpp",
        "c#": "csharp",
        "objective-c": "objectivec",
        "visual basic": "vb.net",
        "typescript": "typescript",
    }

    def __init__(self, hf_dataset):
        self._ds = hf_dataset

    def problem_count(self) -> int:
        return len(self._ds)

    @staticmethod
    def _entry_point(record, idx: int):
        """Return ``entry_point``, else ``task_id``, else ``mceval_<idx>``."""
        fallback = record.get("task_id", f"mceval_{idx}")
        return record.get("entry_point", fallback)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the list-view entry for row *idx*."""
        record = self._ds[idx]
        summary: dict[str, Any] = {"idx": idx}
        summary["task_id"] = record.get("task_id", str(idx))
        summary["entry_point"] = self._entry_point(record, idx)
        summary["num_inputs"] = 0
        summary["source"] = record.get("language", "unknown")
        return summary

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the detail payload for row *idx*.

        ``code`` is the prompt immediately followed by the canonical solution;
        the prompt alone is also exposed as ``description``.
        """
        record = self._ds[idx]
        prompt = record.get("prompt", "")
        solution = record.get("canonical_solution", "")
        full_source = prompt + solution

        language = record.get("language", "python")
        lowered = language.lower()
        lexer_name = self._LEXER_ALIASES.get(lowered, lowered)

        return {
            "idx": idx,
            "task_id": record.get("task_id", str(idx)),
            "entry_point": self._entry_point(record, idx),
            "code": full_source,
            "highlighted_code": _highlight_code(full_source, language=lexer_name),
            "inputs": [],
            "outputs": [],
            "test": record.get("test", ""),
            "tasks": [],
            "source": language,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": prompt,
            "language": language,
        }


# ---------------------------------------------------------------------------
# MultiPL-E  (HuggingFace: nuprl/MultiPL-E)
# Multi-language translated HumanEval/MBPP — 22 languages
# ---------------------------------------------------------------------------


class MultiPLEAdapter(DatasetAdapter):
    """Adapter that shows one MultiPL-E problem across several languages.

    ``datasets_by_lang`` maps a language code (see ``LANGUAGES``) to an
    indexable dataset of rows; keys read from a row are ``name``, ``prompt``
    and ``tests``. The first language in the mapping's iteration order defines
    the problem list (count, task id, entry point).

    NOTE(review): rows are matched across languages by position — this assumes
    the loader passes index-aligned datasets; confirm against the caller.
    """

    slug = "multiple"
    display_name = "MultiPL-E"
    has_ground_truth = False
    has_tasks = False

    # Languages we load (subset of 22 available)
    LANGUAGES = ["py", "cpp", "java", "js", "ts", "go", "rs", "cs", "rb", "lua"]

    # Language code -> human-readable label.
    _LANG_LABELS = {
        "py": "Python",
        "cpp": "C++",
        "java": "Java",
        "js": "JavaScript",
        "ts": "TypeScript",
        "go": "Go",
        "rs": "Rust",
        "cs": "C#",
        "rb": "Ruby",
        "lua": "Lua",
    }
    # Language code -> name passed to the highlighter's ``language`` argument.
    _LANG_PYGMENTS = {
        "py": "python",
        "cpp": "cpp",
        "java": "java",
        "js": "javascript",
        "ts": "typescript",
        "go": "go",
        "rs": "rust",
        "cs": "csharp",
        "rb": "ruby",
        "lua": "lua",
    }

    def __init__(self, datasets_by_lang: dict[str, Any]):
        """Store the per-language datasets and cache the problem count.

        An empty mapping is accepted and results in a problem count of 0.
        """
        self._by_lang = datasets_by_lang
        first_lang = next(iter(self._by_lang), None)
        self._count = len(self._by_lang[first_lang]) if first_lang is not None else 0

    def problem_count(self) -> int:
        """Return the number of rows in the first language's dataset."""
        return self._count

    def _primary_row(self, idx: int) -> tuple[str, Any]:
        """Return ``(language, row)`` for *idx* in the first language.

        Raises:
            IndexError: if no languages are loaded, or *idx* is out of range.
        """
        first_lang = next(iter(self._by_lang), None)
        if first_lang is None:
            raise IndexError("MultiPL-E adapter has no languages loaded")
        return first_lang, self._by_lang[first_lang][idx]

    def _row_or_none(self, lang: str, idx: int) -> Any:
        """Return row *idx* of *lang*, or ``None`` if that language lacks it."""
        dataset = self._by_lang.get(lang)
        if dataset is None:
            return None
        try:
            return dataset[idx]
        except IndexError:
            return None

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the list-view entry for problem *idx*.

        ``num_inputs`` carries the number of loaded languages.

        Raises:
            IndexError: if no languages are loaded, or *idx* is out of range.
        """
        _, row = self._primary_row(idx)
        return {
            "idx": idx,
            "task_id": row.get("name", str(idx)),
            "entry_point": row.get("name", f"multiple_{idx}"),
            "num_inputs": len(self._by_lang),
            "source": "MultiPL-E",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the detail payload for problem *idx*.

        ``lang_solutions`` has one entry per loaded language that has a row at
        *idx*; languages with fewer rows are skipped instead of failing the
        whole request. The top-level ``code`` / ``test`` come from the Python
        row when available, otherwise from the first language, and are
        highlighted with that language's lexer name.

        Raises:
            IndexError: if no languages are loaded, or *idx* is out of range
                for the first language.
        """
        first_lang, row = self._primary_row(idx)

        lang_solutions = []
        for lang in self._by_lang:
            lrow = self._row_or_none(lang, idx)
            if lrow is None:
                continue
            # MultiPL-E stores tests but may not have canonical solutions,
            # so the prompt is the code shown for each language.
            prompt = lrow.get("prompt", "")
            lang_solutions.append(
                {
                    "language": lang,
                    "language_label": self._LANG_LABELS.get(lang, lang),
                    "code": prompt,
                    "highlighted_code": _highlight_code(
                        prompt, language=self._LANG_PYGMENTS.get(lang, lang)
                    ),
                    "test": lrow.get("tests", ""),
                }
            )

        # Prefer Python for the default view; fall back to the first language.
        default_lang, default_row = "py", self._row_or_none("py", idx)
        if default_row is None:
            default_lang, default_row = first_lang, row
        default_code = default_row.get("prompt", "")

        return {
            "idx": idx,
            "task_id": row.get("name", str(idx)),
            "entry_point": row.get("name", f"multiple_{idx}"),
            "code": default_code,
            "highlighted_code": _highlight_code(
                default_code,
                language=self._LANG_PYGMENTS.get(default_lang, default_lang),
            ),
            "inputs": [],
            "outputs": [],
            "test": default_row.get("tests", ""),
            "tasks": [],
            "source": "MultiPL-E",
            "has_ground_truth": False,
            "has_tasks": False,
            "lang_solutions": lang_solutions,
        }


# ---------------------------------------------------------------------------
# Defects4J  (HuggingFace: rufimelo/defects4j)
# Java bug-fix benchmark — 854 real bugs from open-source projects
# ---------------------------------------------------------------------------


class Defects4JAdapter(DatasetAdapter):
    """Adapter for Defects4J rows, shown as buggy/fixed Java function pairs.

    The wrapped dataset only needs to support ``len()`` and integer indexing
    that yields a mapping with ``.get``. Keys read: ``bug_id``,
    ``func_before`` (buggy code) and ``func_after`` (fixed code).
    """

    slug = "defects4j"
    display_name = "Defects4J"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        self._ds = hf_dataset

    def problem_count(self) -> int:
        return len(self._ds)

    @staticmethod
    def _project_from_bug_id(bug_id: str) -> str:
        """Return the text before the last ``-`` (e.g. 'Compress-35' -> 'Compress').

        An id without any ``-`` is returned unchanged.
        """
        if "-" not in bug_id:
            return bug_id
        return bug_id.rsplit("-", 1)[0]

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the list-view entry for row *idx*."""
        record = self._ds[idx]
        bug_id = record.get("bug_id", str(idx))
        project_name = self._project_from_bug_id(bug_id)
        # The project name doubles as entry point and source label.
        return {
            "idx": idx,
            "task_id": bug_id,
            "entry_point": project_name,
            "num_inputs": 0,
            "source": project_name,
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the detail payload for row *idx*.

        The fixed function is exposed as ``code`` / ``fixed_code`` and the
        buggy one as ``buggy_code``, each with a highlighted counterpart.
        """
        record = self._ds[idx]
        bug_id = record.get("bug_id", str(idx))
        project_name = self._project_from_bug_id(bug_id)
        before = record.get("func_before", "")
        after = record.get("func_after", "")

        def render(java_source):
            # Empty or missing code is not sent to the highlighter.
            if not java_source:
                return ""
            return _highlight_code(java_source, language="java")

        return {
            "idx": idx,
            "task_id": bug_id,
            "entry_point": project_name,
            "code": after,
            "highlighted_code": render(after),
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": project_name,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": "",
            "buggy_code": before,
            "buggy_highlighted_code": render(before),
            "fixed_code": after,
            "fixed_highlighted_code": render(after),
            "bug_category": "Bug Fix",
            "bug_subtype": project_name,
            "bug_explanation": "",
            "language": "Java",
        }