File size: 3,260 Bytes
1b35d41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Dataset loader for TimeMachine-bench JSONL files."""

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path


@dataclass
class Task:
    """One TimeMachine-bench migration task, stored as plain fields.

    ``DatasetLoader.load`` creates instances with ``Task(**record)``, so each
    field name must match a key of the parsed JSON object, and every field is
    required because none declares a default.  The annotations are not
    enforced at runtime: values are kept exactly as the JSON parser produced
    them.
    """

    # NOTE(review): the section headings below are inferred from the field
    # names only -- confirm against the dataset schema before relying on them.

    # --- repository / commit ---
    repo_name: str
    repo_url: str
    commit_hash: str

    # --- patches ---
    patch: str
    test_patch: str
    gold_patch: str

    # --- reproduction and migration targets ---
    reproduction_target_date: str
    reproduction_target_version: str
    migration_target_date: str
    migration_target_version: str

    # --- environment ---
    dockerfile: str
    version_source: str
    script_source: str
    dependency_versions: str

    # --- tests ---
    test_type: str
    test_files: str
    test_count: int

    # --- code metrics and metadata ---
    related_modules: str
    py_file_count: int
    total_loc_python: int
    difficulty: str
    license: str


# Bundled dataset used when no explicit path is given: the file
# "data/timemachine-bench-verified.jsonl" next to this module.
_DEFAULT_DATASET_PATH = Path(__file__).resolve().parent.joinpath(
    "data", "timemachine-bench-verified.jsonl"
)


class DatasetLoader:
    """Load and query TimeMachine-bench tasks from a JSONL file.

    The file is parsed eagerly in ``__init__``; the query helpers, ``len()``
    and indexing all operate on that in-memory list of tasks.
    """

    def __init__(self, dataset_path: str | Path | None = None) -> None:
        """Resolve the dataset location and parse it immediately.

        Args:
            dataset_path: Location of the JSONL file.  Any falsy value
                (``None`` or an empty string) selects the bundled default
                dataset instead.

        Raises:
            FileNotFoundError: If the dataset file does not exist.
            ValueError: If a line contains malformed JSON.
            TypeError: If a record cannot be turned into a ``Task``.
        """
        self._path = Path(dataset_path) if dataset_path else _DEFAULT_DATASET_PATH
        self._tasks: list[Task] = self.load()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def load(self) -> list[Task]:
        """Parse the JSONL file and return a list of Task records.

        Lines that are empty after stripping whitespace are skipped.  The
        parsed list is returned to the caller; this method does not itself
        replace the tasks cached by ``__init__``.

        Raises:
            FileNotFoundError: If the dataset file does not exist.
            ValueError: If a line contains malformed JSON (includes file path
                        and 1-based line number in the message).
            TypeError: If a line holds valid JSON that cannot be used to
                       build a ``Task`` -- not a JSON object, or an object
                       with missing or unexpected keys (includes file path
                       and 1-based line number in the message).
        """
        path = self._path
        if not path.exists():
            raise FileNotFoundError(f"Dataset file not found: {path}")

        tasks: list[Task] = []
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but also
        # drops a leading byte-order mark, which would otherwise make the
        # first line fail JSON parsing.
        with open(path, "r", encoding="utf-8-sig") as fh:
            for line_no, raw_line in enumerate(fh, start=1):
                raw_line = raw_line.strip()
                if not raw_line:
                    continue
                try:
                    record = json.loads(raw_line)
                except json.JSONDecodeError as exc:
                    raise ValueError(
                        f"Malformed JSON at {path} line {line_no}: {exc}"
                    ) from exc
                try:
                    task = Task(**record)
                except TypeError as exc:
                    # Keep the exception type the bare constructor call would
                    # raise, but say which line of which file is at fault.
                    raise TypeError(
                        f"Invalid task record at {path} line {line_no}: {exc}"
                    ) from exc
                tasks.append(task)
        return tasks

    def filter_by_difficulty(self, difficulty: str) -> list[Task]:
        """Return tasks whose ``difficulty`` equals the given value.

        The comparison is an exact ``==`` match; the result is a new list in
        dataset order and is empty when nothing matches.
        """
        return [t for t in self._tasks if t.difficulty == difficulty]

    def get_by_repo_name(self, repo_name: str) -> Task | None:
        """Return the first task with the given repo_name, or None."""
        for t in self._tasks:
            if t.repo_name == repo_name:
                return t
        return None

    def __len__(self) -> int:
        """Return the number of loaded tasks."""
        return len(self._tasks)

    def __getitem__(self, index: int) -> Task:
        """Return the task at ``index``, delegating to list indexing."""
        return self._tasks[index]