Spaces:
Running
Running
"""
swe_bench/loader.py
───────────────────
Load and iterate over SWE-bench Lite instances.

SWE-bench Lite: 300 real GitHub issues from popular Python repositories,
each with a verified patch that makes all tests pass.

Schema per instance:
    instance_id       : str  — unique identifier, e.g. "django__django-12345"
    repo              : str  — "owner/repo"
    base_commit       : str  — SHA of the commit where the bug exists
    problem_statement : str  — the GitHub issue text
    patch             : str  — gold unified diff (the correct fix)
    test_patch        : str  — tests that were added / modified to verify the fix
    PASS_TO_PASS      : list — tests that must still pass
    FAIL_TO_PASS      : list — tests that must now pass (previously failing)
"""
| from __future__ import annotations | |
| import json | |
| import logging | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Iterator | |
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
@dataclass
class SWEInstance:
    """A single SWE-bench problem instance.

    Declared as a dataclass so that it gets a generated ``__init__``; the
    module's ``_dict_to_instance()`` helper constructs it with keyword
    arguments, which would raise ``TypeError`` on a plain class.
    """

    instance_id: str
    repo: str  # "owner/repo"
    base_commit: str
    problem_statement: str
    patch: str  # gold patch — used only for evaluation
    test_patch: str  # tests that verify the fix
    fail_to_pass: list[str]  # tests that must now pass
    pass_to_pass: list[str]  # regression tests that must still pass
    created_at: str = ""
    version: str = ""
    environment_setup_commit: str = ""

    # NOTE(review): the three accessors below are plain methods here; if
    # callers read them as attributes they may have been meant as
    # properties — confirm against call sites before changing.

    def repo_name(self) -> str:
        """Return the repo with "/" replaced by "__".

        e.g. 'django__django' from 'django/django'.
        """
        return self.repo.replace("/", "__")

    def org(self) -> str:
        """Return the part of ``repo`` before the first "/"."""
        return self.repo.split("/")[0]

    def project(self) -> str:
        """Return the second "/"-separated component of ``repo``.

        Returns "" when ``repo`` contains no "/" (for example the empty
        default used when a row has no ``repo`` key) instead of raising
        ``IndexError``.
        """
        parts = self.repo.split("/")
        return parts[1] if len(parts) > 1 else ""
def load_swebench_lite(
    dataset_name: str = "princeton-nlp/SWE-bench_Lite",
    split: str = "test",
    max_instances: int | None = None,
    instance_ids: list[str] | None = None,
    cache_dir: Path | None = None,
) -> list[SWEInstance]:
    """
    Load SWE-bench Lite from a local JSON cache or, failing that, HuggingFace.

    Args:
        dataset_name: HuggingFace dataset identifier.
        split: Dataset split — 'test' (300 issues) or 'dev' (23 issues).
        max_instances: Limit for quick debugging (None = all). Applied as a
            slice bound after the ID filter.
        instance_ids: Filter to specific instance IDs. ``None`` or an empty
            list means "no filtering".
        cache_dir: Local cache directory; saves downloaded data as JSON.
            The directory is created if missing. The cache file name
            depends only on ``split``, so a directory should not be shared
            between different ``dataset_name`` values.

    Returns:
        List of SWEInstance objects.

    Raises:
        ImportError: If a download is needed and 'datasets' is not installed.
    """
    cache_path: Path | None = None
    if cache_dir is not None:
        cache_dir = Path(cache_dir)
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_path = cache_dir / f"swebench_lite_{split}.json"

    # ── Try local cache first ─────────────────────────────────────────────
    if cache_path is not None and cache_path.exists():
        logger.info("Loading SWE-bench Lite from local cache: %s", cache_path)
        # Explicit UTF-8: do not depend on the platform's locale encoding.
        raw = json.loads(cache_path.read_text(encoding="utf-8"))
        instances = [_dict_to_instance(r) for r in raw]
    else:
        logger.info("Downloading SWE-bench Lite from HuggingFace: %s", dataset_name)
        try:
            from datasets import load_dataset  # type: ignore
        except ImportError as exc:
            raise ImportError(
                "Install 'datasets': pip install datasets"
            ) from exc
        ds = load_dataset(dataset_name, split=split)
        instances = [_dict_to_instance(dict(row)) for row in ds]
        if cache_path is not None:
            logger.info("Saving to cache: %s", cache_path)
            payload = json.dumps([_instance_to_dict(i) for i in instances], indent=2)
            # Write to a sibling temp file and rename it into place, so an
            # interrupted run never leaves a truncated cache that would make
            # every later load fail in json.loads().
            tmp_path = cache_path.with_name(cache_path.name + ".tmp")
            tmp_path.write_text(payload, encoding="utf-8")
            tmp_path.replace(cache_path)

    # ── Apply filters ─────────────────────────────────────────────────────
    if instance_ids:
        id_set = set(instance_ids)
        instances = [i for i in instances if i.instance_id in id_set]
        logger.info("Filtered to %d instances by ID", len(instances))
    if max_instances is not None:
        instances = instances[:max_instances]
    logger.info("Loaded %d SWE-bench Lite instances (split=%s)", len(instances), split)
    return instances
def iter_instances(
    dataset_name: str = "princeton-nlp/SWE-bench_Lite",
    split: str = "test",
    cache_dir: Path | None = None,
) -> Iterator[SWEInstance]:
    """
    Yield SWE-bench Lite instances one at a time.

    Generator wrapper around load_swebench_lite(): the complete list for
    the split is loaded when iteration starts, then handed out item by
    item. No ID filter or instance limit is applied.
    """
    loaded = load_swebench_lite(dataset_name, split=split, cache_dir=cache_dir)
    for instance in loaded:
        yield instance
# ── Private helpers ───────────────────────────────────────────────────────────
def _dict_to_instance(row: dict) -> SWEInstance:
    """
    Build a SWEInstance from a raw dataset row.

    Missing text keys default to "". FAIL_TO_PASS / PASS_TO_PASS default to
    "[]" and are normalised to lists by _parse_list().
    """
    text_keys = (
        "instance_id",
        "repo",
        "base_commit",
        "problem_statement",
        "patch",
        "test_patch",
        "created_at",
        "version",
        "environment_setup_commit",
    )
    kwargs = {key: row.get(key, "") for key in text_keys}
    # The dataset's upper-case keys map onto the lower-case field names.
    kwargs["fail_to_pass"] = _parse_list(row.get("FAIL_TO_PASS", "[]"))
    kwargs["pass_to_pass"] = _parse_list(row.get("PASS_TO_PASS", "[]"))
    return SWEInstance(**kwargs)
def _instance_to_dict(instance: SWEInstance) -> dict:
    """
    Convert an instance into a plain dict using the dataset's key names.

    The two test lists are stored as JSON-encoded strings under the
    upper-case FAIL_TO_PASS / PASS_TO_PASS keys; every other value is
    copied from the attribute of the same name.
    """
    leading = (
        "instance_id",
        "repo",
        "base_commit",
        "problem_statement",
        "patch",
        "test_patch",
    )
    trailing = ("created_at", "version", "environment_setup_commit")

    record = {name: getattr(instance, name) for name in leading}
    record["FAIL_TO_PASS"] = json.dumps(instance.fail_to_pass)
    record["PASS_TO_PASS"] = json.dumps(instance.pass_to_pass)
    for name in trailing:
        record[name] = getattr(instance, name)
    return record
def _parse_list(value: str | list) -> list[str]:
    """
    Normalise a test-list field to a Python list.

    A list is returned unchanged (same object). Anything else is decoded as
    JSON; if decoding fails, or the decoded value is not a list, the result
    is an empty list.
    """
    if not isinstance(value, list):
        try:
            decoded = json.loads(value)
        except (json.JSONDecodeError, TypeError):
            return []
        value = decoded if isinstance(decoded, list) else []
    return value
# ── Convenience class (used by experiments/benchmark.py) ─────────────────────
class SWEBenchLoader:
    """
    Object-style front end to load_swebench_lite() for the benchmark harness.

    Remembers the dataset name and cache directory so that each load() call
    only has to name the split and any filters.

    Usage:
        loader = SWEBenchLoader()
        instances = loader.load(split="test", max_instances=10)
    """

    def __init__(
        self,
        dataset_name: str = "princeton-nlp/SWE-bench_Lite",
        cache_dir: Path | None = Path(".cache/swebench"),
    ):
        # Both values are passed through unchanged by load().
        self.cache_dir = cache_dir
        self.dataset_name = dataset_name

    def load(
        self,
        split: str = "test",
        max_instances: int | None = None,
        instance_ids: list[str] | None = None,
    ) -> list[dict]:
        """
        Load instances and return them as plain dicts.

        Each dict is produced by _instance_to_dict(). Its keys include
        instance_id, repo, base_commit, problem_statement, patch,
        FAIL_TO_PASS and PASS_TO_PASS; the last two hold JSON-encoded
        strings rather than lists.
        """
        request = {
            "dataset_name": self.dataset_name,
            "split": split,
            "max_instances": max_instances,
            "instance_ids": instance_ids,
            "cache_dir": self.cache_dir,
        }
        loaded = load_swebench_lite(**request)
        return list(map(_instance_to_dict, loaded))