File size: 4,799 Bytes
f78e5d2
fc6ff5a
0320a8d
d8ee465
f78e5d2
fc6ff5a
 
f78e5d2
 
 
fc6ff5a
 
 
f78e5d2
fc6ff5a
d8ee465
f78e5d2
fc6ff5a
 
f78e5d2
fc6ff5a
 
d8ee465
fc6ff5a
 
 
 
 
 
f78e5d2
fc6ff5a
 
 
f78e5d2
fc6ff5a
 
 
 
d8ee465
fc6ff5a
 
 
 
f78e5d2
 
 
 
fc6ff5a
 
 
 
 
 
 
 
d8ee465
fc6ff5a
 
 
 
 
 
 
 
 
f78e5d2
 
 
fc6ff5a
 
 
 
 
 
f78e5d2
0320a8d
 
 
 
 
 
 
 
 
 
 
 
 
 
fc6ff5a
 
 
 
 
 
 
 
 
 
 
d8ee465
f78e5d2
fc6ff5a
 
 
 
 
 
 
 
 
 
 
 
f78e5d2
f8670cd
 
 
 
 
 
 
 
 
f78e5d2
fc6ff5a
 
 
 
f78e5d2
 
 
fc6ff5a
f8670cd
fc6ff5a
 
f78e5d2
 
fc6ff5a
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from enum import Enum
from typing import List, Optional, Union
from pydantic import BaseModel, model_validator

class TaskId(str, Enum):
    BUG_DETECTION = "bug_detection"
    SECURITY_AUDIT = "security_audit"
    ARCHITECTURAL_REVIEW = "architectural_review"

class ActionType(str, Enum):
    FLAG_ISSUE = "flag_issue"
    COMMENT = "comment"
    APPROVE = "approve"
    REQUEST_CHANGES = "request_changes"
    ASK_QUESTION = "ask_question"

class Category(str, Enum):
    BUG = "bug"
    SECURITY = "security"
    ARCHITECTURE = "architecture"
    STYLE = "style"
    PERFORMANCE = "performance"

class Severity(str, Enum):
    CRITICAL = "critical"    # ordinal 4
    HIGH = "high"            # ordinal 3
    MEDIUM = "medium"        # ordinal 2
    LOW = "low"              # ordinal 1
    INFO = "info"            # ordinal 0

    @classmethod
    def ordinal(cls, sev: "Severity") -> int:
        return {"critical": 4, "high": 3, "medium": 2, "low": 1, "info": 0}[sev.value]

class Verdict(str, Enum):
    LGTM = "lgtm"
    REQUEST_CHANGES = "request_changes"
    NEEDS_DISCUSSION = "needs_discussion"

class FileChanged(BaseModel):
    filename: str
    language: str
    patch: str                          # unified diff of this file
    additions: int = 0
    deletions: int = 0

class GroundTruthIssue(BaseModel):
    id: str
    category: Category
    severity: Severity
    filename: str
    line_number: int
    description: str
    keywords: List[str]                 # at least 2 keywords the agent body must contain
    required_verdict: Optional[Verdict] = None   # if set, terminal verdict is graded

class Scenario(BaseModel):
    task_id: TaskId
    pr_title: str
    pr_description: str
    files_changed: List[FileChanged]
    ground_truth_issues: List[GroundTruthIssue]
    hash: str                           # deterministic identifier, e.g. "bug_001"
    difficulty: str = "medium"          # easy | medium | hard
    tags: List[str] = []

class Action(BaseModel):
    action_type: ActionType
    body: str = ""
    filename: Optional[str] = None
    line_number: Optional[int] = None
    category: Optional[Category] = None
    severity: Optional[Severity] = None
    verdict: Optional[Verdict] = None

    @model_validator(mode="after")
    def validate_action_fields(self) -> "Action":
        if self.action_type == ActionType.FLAG_ISSUE:
            if not self.body or not self.filename or self.line_number is None:
                raise ValueError("flag_issue requires body, filename, and line_number")
            if not self.category or not self.severity:
                raise ValueError("flag_issue requires category and severity")
        elif self.action_type in (ActionType.APPROVE, ActionType.REQUEST_CHANGES):
            if not self.verdict:
                raise ValueError(f"{self.action_type.value} action requires a verdict")
            if not self.body:
                raise ValueError(f"{self.action_type.value} action requires a body summary")
        return self

class ActionRecord(BaseModel):
    """Immutable record of a step taken — stored in episode history."""
    action_type: ActionType
    body: str = ""
    filename: Optional[str] = None
    line_number: Optional[int] = None
    category: Optional[Category] = None
    severity: Optional[Severity] = None
    verdict: Optional[Verdict] = None
    reward: float = 0.0
    timestamp: str = ""     # ISO format, set by env

class Observation(BaseModel):
    task_id: TaskId
    scenario_hash: str
    pr_title: str
    pr_description: str
    diff: str                           # full unified diff (all files concatenated)
    files_changed: List[FileChanged]
    step_count: int
    max_steps: int
    noise_budget: int
    max_noise_budget: int = 5
    issues_flagged: int = 0
    done: bool = False

class Reward(BaseModel):
    """
    Typed reward signal returned at each step (OpenEnv spec).
    All values are normalized in the 0.0 – 1.0 range.
    """
    value: float            # 0.0 – 1.0 normalised score
    reason: str = ""        # human-readable explanation
    is_terminal: bool = False  # True on the final step

class ResetResult(BaseModel):
    task_id: TaskId
    seed: int
    scenario_hash: str
    observation: Observation

class StepResult(BaseModel):
    observation: Observation
    reward: float
    reward_info: Reward     # typed Reward model (OpenEnv spec)
    done: bool
    info: dict = {}

class EpisodeResult(BaseModel):
    episode_id: str = ""
    task_id: TaskId
    scenario_hash: str
    seed: int
    final_score: float
    steps_taken: int
    issues_found: int
    issues_total: int
    noise_penalties: int
    history: List[ActionRecord] = []
    terminated_reason: str = ""         # "terminal_action"|"max_steps"|"noise_exhausted"