"""Code editing benchmark adapters (SWE-bench, DebugBench, CanItEdit, CodeEditorBench).""" from __future__ import annotations import json from typing import Any from adapters import DatasetAdapter # Injected at runtime by _set_helpers() _highlight_code = None _code_offset = None _extract_test_classes = None # --------------------------------------------------------------------------- # SWE-bench Lite adapter (HuggingFace: princeton-nlp/SWE-bench_Lite) # --------------------------------------------------------------------------- class SWEBenchLiteAdapter(DatasetAdapter): slug = "swebenchlite" display_name = "SWE-bench Lite" has_ground_truth = False has_tasks = False def __init__(self, hf_dataset): self._ds = hf_dataset def problem_count(self) -> int: return len(self._ds) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._ds[idx] return { "idx": idx, "task_id": row["instance_id"], "entry_point": row["instance_id"].split("__")[-1], "num_inputs": 0, "source": row["repo"], } @staticmethod def _github_links(instance_id: str, repo: str, base_commit: str) -> dict[str, str]: """Build GitHub URLs from SWE-bench instance metadata.""" links: dict[str, str] = {} if repo: links["repo_url"] = f"https://github.com/{repo}" # instance_id format: "repo__issue-number" e.g. "astropy__astropy-12907" parts = instance_id.rsplit("-", 1) if len(parts) == 2 and parts[1].isdigit() and repo: links["issue_url"] = f"https://github.com/{repo}/issues/{parts[1]}" if base_commit and repo: links["commit_url"] = f"https://github.com/{repo}/commit/{base_commit}" return links def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._ds[idx] patch = row["patch"] raw_f2p = row["FAIL_TO_PASS"] fail_to_pass = raw_f2p if isinstance(raw_f2p, list) else (json.loads(raw_f2p) if raw_f2p else []) raw_p2p = row["PASS_TO_PASS"] pass_to_pass = raw_p2p if isinstance(raw_p2p, list) else (json.loads(raw_p2p) if raw_p2p else []) instance_id = row["instance_id"] repo = row["repo"] base_commit = row.get("base_commit", "") return { "idx": idx, "task_id": instance_id, "entry_point": instance_id.split("__")[-1], "code": patch, "highlighted_code": "", "inputs": [], "outputs": [], "test": None, "tasks": [], "source": repo, "has_ground_truth": False, "has_tasks": False, "description": row["problem_statement"], "patch": patch, "test_patch": row.get("test_patch", ""), "fail_to_pass": fail_to_pass, "pass_to_pass": pass_to_pass, "hints": row.get("hints_text", ""), "repo": repo, "base_commit": base_commit, "version": row.get("version", ""), "created_at": row.get("created_at", ""), **self._github_links(instance_id, repo, base_commit), } # --------------------------------------------------------------------------- # SWE-bench Verified adapter (HuggingFace: princeton-nlp/SWE-bench_Verified) # --------------------------------------------------------------------------- class SWEBenchVerifiedAdapter(SWEBenchLiteAdapter): slug = "swebenchverified" display_name = "SWE-bench Verified" class SWEBenchFullAdapter(SWEBenchLiteAdapter): slug = "swebenchfull" display_name = "SWE-bench" # --------------------------------------------------------------------------- # DebugBench adapter (HuggingFace: Rtian/DebugBench) # --------------------------------------------------------------------------- class DebugBenchAdapter(DatasetAdapter): slug = "debugbench" display_name = "DebugBench" has_ground_truth = False has_tasks = False def __init__(self, hf_dataset): self._ds = hf_dataset def problem_count(self) -> int: return len(self._ds) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._ds[idx] return { "idx": idx, "task_id": row["slug"], "entry_point": row["slug"], "num_inputs": len(row["examples"]), "source": f"{row['language']}/{row['category']}", } def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._ds[idx] lang = row["language"] buggy = row["buggy_code"] fixed = row["solution"] return { "idx": idx, "task_id": row["slug"], "entry_point": row["slug"], "code": fixed, "highlighted_code": _highlight_code(fixed, language=lang), "inputs": [], "outputs": [], "test": None, "tasks": [], "source": f"{lang}/{row['category']}", "has_ground_truth": False, "has_tasks": False, "description": row["question"], "language": lang, "buggy_code": buggy, "buggy_highlighted_code": _highlight_code(buggy, language=lang), "fixed_code": fixed, "fixed_highlighted_code": _highlight_code(fixed, language=lang), "bug_category": row["category"], "bug_subtype": row["subtype"], "bug_explanation": row["bug_explanation"], "difficulty": row["level"], "examples": list(row["examples"]), } # --------------------------------------------------------------------------- # CanItEdit adapter (HuggingFace: nuprl/CanItEdit) # --------------------------------------------------------------------------- class CanItEditAdapter(DatasetAdapter): slug = "canitedit" display_name = "CanItEdit" has_ground_truth = False has_tasks = False def __init__(self, hf_dataset): self._ds = hf_dataset def problem_count(self) -> int: return len(self._ds) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._ds[idx] taxonomy = row.get("taxonomy", {}) change_kind = taxonomy.get("change_kind", "") if isinstance(taxonomy, dict) else "" return { "idx": idx, "task_id": row.get("full_name", str(row.get("id", idx))), "entry_point": row.get("name", f"edit_{idx}"), "num_inputs": 0, "source": change_kind or "CanItEdit", } def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._ds[idx] before = row["before"] after = row["after"] taxonomy = row.get("taxonomy", {}) if not isinstance(taxonomy, dict): taxonomy = {} return { "idx": idx, "task_id": row.get("full_name", str(row.get("id", idx))), "entry_point": row.get("name", f"edit_{idx}"), "code": after, "highlighted_code": _highlight_code(after), "inputs": [], "outputs": [], "test": row.get("tests", ""), "tasks": [], "source": taxonomy.get("change_kind", "CanItEdit"), "has_ground_truth": False, "has_tasks": False, "description": row.get("instruction_descriptive", ""), "buggy_code": before, "buggy_highlighted_code": _highlight_code(before), "fixed_code": after, "fixed_highlighted_code": _highlight_code(after), "bug_category": taxonomy.get("change_kind", ""), "bug_subtype": taxonomy.get("topic", ""), "bug_explanation": row.get("instruction_lazy", ""), } # --------------------------------------------------------------------------- # CodeEditorBench adapter (HuggingFace: m-a-p/CodeEditorBench) # --------------------------------------------------------------------------- class CodeEditorBenchAdapter(DatasetAdapter): slug = "codeeditorbench" display_name = "CodeEditorBench" has_ground_truth = False has_tasks = False def __init__(self, rows: list[dict[str, Any]]): self._rows = rows def problem_count(self) -> int: return len(self._rows) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._rows[idx] return { "idx": idx, "task_id": str(row.get("idx", idx)), "entry_point": row.get("title", f"problem_{idx}"), "num_inputs": 0, "source": row.get("_task_type", "unknown"), } def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._rows[idx] task_type = row.get("_task_type", "unknown") lang = row.get("code_language", row.get("source_lang", "python")) or "python" lang_key = lang.lower() if task_type == "code_debug": buggy = row.get("incorrect_solutions", "") fixed = row.get("solutions", "") elif task_type == "code_translate": buggy = row.get("source_code", "") fixed = row.get("solutions", row.get("source_code", "")) elif task_type == "code_polishment": buggy = row.get("source_code", "") fixed = row.get("solutions", row.get("source_code", "")) else: # code_switch buggy = row.get("similar_source_code", row.get("source_code", "")) fixed = row.get("solutions", row.get("source_code", "")) return { "idx": idx, "task_id": str(row.get("idx", idx)), "entry_point": row.get("title", f"problem_{idx}"), "code": fixed, "highlighted_code": _highlight_code(fixed, language=lang_key) if fixed else "", "inputs": [], "outputs": [], "test": None, "tasks": [], "source": task_type, "has_ground_truth": False, "has_tasks": False, "description": "", "buggy_code": buggy, "buggy_highlighted_code": _highlight_code(buggy, language=lang_key) if buggy else "", "fixed_code": fixed, "fixed_highlighted_code": _highlight_code(fixed, language=lang_key) if fixed else "", "bug_category": task_type, "bug_subtype": row.get("difficulty", ""), "bug_explanation": "", "difficulty": row.get("difficulty", ""), "language": lang, } # --------------------------------------------------------------------------- # CodeXGLUE Code Refinement adapter (HuggingFace: google/code_x_glue_cc_code_refinement) # --------------------------------------------------------------------------- class CodeXGLUERefinementAdapter(DatasetAdapter): slug = "codexgluerefinement" display_name = "CodeXGLUE Code Refinement" has_ground_truth = False has_tasks = False def __init__(self, hf_dataset): self._ds = hf_dataset def problem_count(self) -> int: return len(self._ds) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._ds[idx] return { "idx": idx, "task_id": str(row.get("id", idx)), "entry_point": f"refinement_{row.get('id', idx)}", "num_inputs": 0, "source": "CodeXGLUE", } def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._ds[idx] buggy = row.get("buggy", "") fixed = row.get("fixed", "") return { "idx": idx, "task_id": str(row.get("id", idx)), "entry_point": f"refinement_{row.get('id', idx)}", "code": fixed, "highlighted_code": _highlight_code(fixed, language="java") if fixed else "", "inputs": [], "outputs": [], "test": None, "tasks": [], "source": "CodeXGLUE", "has_ground_truth": False, "has_tasks": False, "description": "", "buggy_code": buggy, "buggy_highlighted_code": _highlight_code(buggy, language="java") if buggy else "", "fixed_code": fixed, "fixed_highlighted_code": _highlight_code(fixed, language="java") if fixed else "", "bug_category": "Code Refinement", "bug_subtype": "", "bug_explanation": "", "language": "Java", } # --------------------------------------------------------------------------- # CommitBench adapter (HuggingFace: Maxscha/commitbench) # --------------------------------------------------------------------------- class CommitBenchAdapter(DatasetAdapter): slug = "commitbench" display_name = "CommitBench" has_ground_truth = False has_tasks = False def __init__(self, hf_dataset): self._ds = hf_dataset def problem_count(self) -> int: return len(self._ds) def get_problem_summary(self, idx: int) -> dict[str, Any]: row = self._ds[idx] return { "idx": idx, "task_id": row.get("hash", str(idx))[:12], "entry_point": row.get("project", f"commit_{idx}"), "num_inputs": 0, "source": row.get("diff_languages", "unknown"), } def get_problem_detail(self, idx: int) -> dict[str, Any]: row = self._ds[idx] diff = row.get("diff", "") message = row.get("message", "") return { "idx": idx, "task_id": row.get("hash", str(idx))[:12], "entry_point": row.get("project", f"commit_{idx}"), "code": diff, "highlighted_code": "", "inputs": [], "outputs": [], "test": None, "tasks": [], "source": row.get("diff_languages", "unknown"), "has_ground_truth": False, "has_tasks": False, "description": message, "patch": diff, "repo": row.get("project", ""), "commit_hash": row.get("hash", ""), "diff_languages": row.get("diff_languages", ""), }