Spaces:
Sleeping
Sleeping
File size: 4,582 Bytes
# server/models.py
"""
Pydantic models for the OpenEnv API — extended with an evaluation & reliability layer.
"""
from typing import Optional, List, Dict, Any, Literal
from pydantic import BaseModel, Field
# ── Core Action/Observation Models ──────────────────────────────────────────
class RepoAction(BaseModel):
    """One action submitted by the agent for a single step.

    ``action_type`` selects the operation; ``path``, ``content`` and ``query``
    are optional arguments that only some operations use (see the per-field
    comments below).
    """

    # Action legend (step costs as noted by the original author):
    #   read_file   - read a file's contents; costs 1 step
    #   write_file  - write/modify a file; costs 1 step
    #   run_tests   - run pytest on a specific test file; costs 2 steps
    #   search_code - search for a string across all files; costs 1 step
    #   submit      - finalise submission and trigger the full grader; terminal
    action_type: Literal[
        "read_file",
        "write_file",
        "run_tests",
        "search_code",
        "submit",
    ]

    # Used by read_file, write_file and run_tests.
    path: Optional[str] = None

    # Used by write_file: the new file content.
    content: Optional[str] = None

    # Used by search_code.
    query: Optional[str] = None
class RepoObservation(BaseModel):
    """What the agent sees after each step.

    Both ``last_action_result`` and ``last_action_error`` default to ``None``.
    A bare ``Optional[str]`` annotation without a default is optional under
    Pydantic v1 but *required* under Pydantic v2; giving both fields an
    explicit default keeps the model behaving the same on either major
    version and keeps the two "last action" fields consistent.
    """

    # All file paths in the repo.
    repo_tree: List[str]
    # Natural language description of the task.
    task_description: str
    # Test names that are currently failing.
    failing_tests: List[str]
    # Files the agent has read so far.
    files_read: List[str]
    # Output of the last action; None when there is nothing to report.
    last_action_result: Optional[str] = None
    steps_remaining: int
    # "task1", "task2", or "task3".
    current_task: str
    # If the last action failed, why.
    last_action_error: Optional[str] = None
class RepoReward(BaseModel):
    """Per-step reward signal: a bounded score plus a textual reason."""

    # Required; validated to lie within the closed interval [-1.0, 1.0].
    value: float = Field(ge=-1.0, le=1.0)

    # Explanation accompanying the reward value.
    reason: str
# ── API Response Models ─────────────────────────────────────────────────────
class StepResult(BaseModel):
    """Response payload of the /step endpoint."""

    observation: RepoObservation
    # Plain float here (not a RepoReward instance).
    reward: float
    done: bool
    # Free-form extra data; empty when not supplied.
    info: Dict[str, Any] = {}
class ResetResult(BaseModel):
    """Response payload of the /reset endpoint."""

    observation: RepoObservation
    # Free-form extra data; empty when not supplied.
    info: Dict[str, Any] = {}
class StateResult(BaseModel):
    """Response payload of the /state endpoint."""

    observation: RepoObservation
    current_score: float
    total_steps_taken: int
# ── Evaluation & Reliability Models ─────────────────────────────────────────
class TrajectoryResponse(BaseModel):
    """Full trajectory of the current/latest episode.

    Every field has a default, so the model can be built with no arguments.
    """

    # Episode identification.
    episode_id: Optional[str] = None
    task: Optional[str] = None
    variant_id: Optional[str] = None

    # Timing. NOTE(review): units/epoch are not visible here — confirm with
    # the code that populates these.
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    duration_seconds: Optional[float] = None

    # Recorded steps, each a free-form mapping.
    steps: List[Dict[str, Any]] = []

    # Outcome summary.
    final_score: float = 0.0
    total_steps: int = 0

    metadata: Dict[str, Any] = {}
class EvaluationResponse(BaseModel):
    """Multi-dimensional evaluation of agent performance.

    Every field has a default, so the model can be built with no arguments.
    """

    episode_id: Optional[str] = None
    task: Optional[str] = None

    # Overall score, plus a free-form mapping of per-dimension results.
    composite_score: float = 0.0
    dimensions: Dict[str, Any] = {}

    # Textual findings.
    failure_analysis: List[str] = []
    strengths: List[str] = []
    recommendations: List[str] = []
class MetricsResponse(BaseModel):
    """Comprehensive metrics for the current/latest episode.

    Every field has a default, so the model can be built with no arguments.
    """

    episode_id: Optional[str] = None

    # --- Core metrics (all default to 0.0) ---
    success_rate: float = 0.0
    step_efficiency: float = 0.0
    navigation_score: float = 0.0
    context_efficiency: float = 0.0
    reasoning_quality: float = 0.0
    robustness_score: float = 0.0
    security_score: float = 0.0

    # --- Memory stats ---
    memory: Dict[str, Any] = {}

    # --- Security stats ---
    security: Dict[str, Any] = {}

    # --- Fault injection report ---
    fault_injection: Dict[str, Any] = {}

    # --- Wasteful patterns detected ---
    wasteful_patterns: List[str] = []

    # --- Timeline of actions ---
    timeline: List[Dict[str, Any]] = []
class FaultConfigRequest(BaseModel):
    """Request body for configuring fault injection."""

    # One of "none", "light" or "heavy"; "none" when omitted.
    level: Literal["none", "light", "heavy"] = "none"
class ReplayRequest(BaseModel):
    """Request body for replaying an episode."""

    # Required task identifier.
    task: str

    # If None, uses the variant from trajectory.
    variant_id: Optional[str] = None

    # Actions to replay, each a free-form mapping; empty when omitted.
    actions: List[Dict[str, Any]] = []
|