File size: 4,582 Bytes
a5c1fa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# server/models.py
"""
Pydantic models for the OpenEnv API β€” extended with evaluation & reliability layer.
"""
from typing import Optional, List, Dict, Any, Literal
from pydantic import BaseModel, Field


# ── Core Action/Observation Models ──────────────────────────────────────────

class RepoAction(BaseModel):
    """A single agent action for one environment step.

    Step costs (per the environment's accounting):

    * ``read_file``   — read one file's contents (1 step); uses ``path``.
    * ``write_file``  — write/modify a file (1 step); uses ``path`` + ``content``.
    * ``run_tests``   — run pytest on a specific test file (2 steps); uses ``path``.
    * ``search_code`` — search for a string across all files (1 step); uses ``query``.
    * ``submit``      — finalise submission and trigger the full grader (terminal).
    """
    action_type: Literal[
        "read_file",
        "write_file",
        "run_tests",
        "search_code",
        "submit",
    ]
    # Target file for read_file / write_file / run_tests.
    path: Optional[str] = None
    # Replacement file content for write_file.
    content: Optional[str] = None
    # Search string for search_code.
    query: Optional[str] = None


class RepoObservation(BaseModel):
    """What the agent sees after each step."""
    repo_tree: List[str]                     # All file paths in the repo
    task_description: str                    # Natural language description of the task
    failing_tests: List[str]                 # Test names that are currently failing
    files_read: List[str]                    # Files the agent has read so far
    # Fix: defaulted to None for consistency with last_action_error below.
    # Under pydantic v2 an Optional annotation *without* a default is a
    # required field, which made this mandatory while its sibling was not.
    # Adding the default is backward-compatible for all existing callers.
    last_action_result: Optional[str] = None  # Output of the last action
    steps_remaining: int
    current_task: str                        # "task1", "task2", or "task3"
    last_action_error: Optional[str] = None  # If the last action failed, why


class RepoReward(BaseModel):
    """Per-step reward signal.

    ``value`` is validated to lie in the closed interval [-1.0, 1.0];
    ``reason`` is a human-readable explanation of the reward.
    """
    value: float = Field(ge=-1.0, le=1.0)  # Validation rejects values outside [-1, 1]
    reason: str


# ── API Response Models ─────────────────────────────────────────────────────

class StepResult(BaseModel):
    """Complete result returned by the /step endpoint."""
    observation: RepoObservation  # Post-step view of the environment
    reward: float                 # Scalar reward for this step
    done: bool                    # True once the episode has terminated
    # Arbitrary extra diagnostics. default_factory is equivalent to `= {}`
    # here (pydantic copies mutable defaults per instance) but makes the
    # per-instance fresh-dict intent explicit.
    info: Dict[str, Any] = Field(default_factory=dict)


class ResetResult(BaseModel):
    """Result returned by the /reset endpoint."""
    observation: RepoObservation  # Initial view of the freshly reset environment
    # Arbitrary extra diagnostics; fresh dict per instance.
    info: Dict[str, Any] = Field(default_factory=dict)


class StateResult(BaseModel):
    """Result returned by the /state endpoint."""
    observation: RepoObservation  # Current view of the environment
    current_score: float          # Score accumulated so far this episode
    total_steps_taken: int        # Number of steps consumed so far


# ── Evaluation & Reliability Models ─────────────────────────────────────────

class TrajectoryResponse(BaseModel):
    """Full trajectory of the current/latest episode.

    All identifying fields are optional so an empty response can be
    returned before any episode has run.
    """
    episode_id: Optional[str] = None
    task: Optional[str] = None
    variant_id: Optional[str] = None
    # Wall-clock bounds of the episode (epoch seconds) and their difference.
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    duration_seconds: Optional[float] = None
    # One record per step taken; fresh containers per instance.
    steps: List[Dict[str, Any]] = Field(default_factory=list)
    final_score: float = 0.0
    total_steps: int = 0
    metadata: Dict[str, Any] = Field(default_factory=dict)


class EvaluationResponse(BaseModel):
    """Multi-dimensional evaluation of agent performance."""
    episode_id: Optional[str] = None
    task: Optional[str] = None
    composite_score: float = 0.0
    # Per-dimension scores/details keyed by dimension name.
    dimensions: Dict[str, Any] = Field(default_factory=dict)
    # Free-text findings; fresh list per instance.
    failure_analysis: List[str] = Field(default_factory=list)
    strengths: List[str] = Field(default_factory=list)
    recommendations: List[str] = Field(default_factory=list)


class MetricsResponse(BaseModel):
    """Comprehensive metrics for the current/latest episode."""
    episode_id: Optional[str] = None

    # Core scalar metrics, each defaulting to 0.0 before any episode runs.
    success_rate: float = 0.0
    step_efficiency: float = 0.0
    navigation_score: float = 0.0
    context_efficiency: float = 0.0
    reasoning_quality: float = 0.0
    robustness_score: float = 0.0
    security_score: float = 0.0

    # Structured sub-reports; fresh containers per instance.
    memory: Dict[str, Any] = Field(default_factory=dict)            # Memory stats
    security: Dict[str, Any] = Field(default_factory=dict)          # Security stats
    fault_injection: Dict[str, Any] = Field(default_factory=dict)   # Fault injection report
    wasteful_patterns: List[str] = Field(default_factory=list)      # Wasteful patterns detected
    timeline: List[Dict[str, Any]] = Field(default_factory=list)    # Timeline of actions


class FaultConfigRequest(BaseModel):
    """Request body for configuring fault injection.

    Accepts one of three intensity levels; defaults to no injection.
    """
    level: Literal["none", "light", "heavy"] = "none"


class ReplayRequest(BaseModel):
    """Request body for replaying an episode."""
    task: str
    # If None, uses the variant from the recorded trajectory.
    variant_id: Optional[str] = None
    # Ordered action dicts to replay; fresh list per instance.
    actions: List[Dict[str, Any]] = Field(default_factory=list)