Files changed (1) hide show
  1. models.py +39 -16
models.py CHANGED
@@ -5,10 +5,12 @@
5
  # LICENSE file in the root directory of this source tree.
6
 
7
  """
8
- Data models for the Code Output Assessment Environment.
9
 
10
- This environment tests an agent's ability to produce correct outputs for coding problems
11
- across three difficulty levels: easy, medium, and hard.
 
 
12
  """
13
 
14
  from typing import Literal, Optional
@@ -17,20 +19,41 @@ from pydantic import Field
17
 
18
 
19
class CodeAssessmentAction(Action):
    """Agent submission for the coding problem currently being assessed.

    Attributes:
        answer: Required text holding the agent's answer/output.
    """

    # `...` marks the field as required: there is no default value.
    answer: str = Field(
        ...,
        description="The agent's answer/output for the current problem",
    )
 
 
 
 
 
 
 
23
 
24
 
25
class CodeAssessmentObservation(Observation):
    """Observation carrying the problem, the grading outcome, and running totals.

    Every field declares a default, so the model can be constructed with no
    arguments.
    """

    # --- The problem presented to the agent ---
    problem_description: str = Field(
        default="",
        description="Description of the coding problem",
    )
    difficulty: Literal["easy", "medium", "hard"] = Field(
        default="easy",
        description="Difficulty level",
    )
    test_case_input: str = Field(
        default="",
        description="Input for the current test case",
    )

    # --- Grading of the most recent submission ---
    expected_output: Optional[str] = Field(
        default=None,
        description="Expected output (shown only after submission)",
    )
    feedback: str = Field(
        default="",
        description="Feedback on the submitted answer",
    )
    is_correct: bool = Field(
        default=False,
        description="Whether the answer was correct",
    )
    partial_credit: float = Field(
        default=0.0,
        description="Partial credit score (0.0 to 1.0)",
    )

    # --- Running counters ---
    problems_solved: int = Field(
        default=0,
        description="Total number of problems solved so far",
    )
    current_streak: int = Field(
        default=0,
        description="Current streak of correct answers",
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  # LICENSE file in the root directory of this source tree.
6
 
7
  """
8
+ Data models for the AI Response Evaluation Environment.
9
 
10
+ Three tasks:
11
+ 1. Correctness & Instruction Adherence (easy)
12
+ 2. Tone & Audience Appropriateness with structured user profiles (medium)
13
+ 3. Multi-dimensional Quality Scoring — correctness, tone, empathy, safety (hard)
14
  """
15
 
16
  from typing import Literal, Optional
 
19
 
20
 
21
class CodeAssessmentAction(Action):
    """Action carrying the agent's evaluation judgment as a single string.

    The expected text layout differs per task and is spelled out in the
    ``answer`` field's description (one line per task).
    """

    # Required field (`...` = no default). The description is one line per
    # task, joined with newlines and with no trailing newline.
    answer: str = Field(
        ...,
        description="\n".join(
            [
                "Task 1: 'correct|incorrect|partially-correct, reason'",
                "Task 2: 'appropriate|needs-adjustment|inappropriate, issue1,issue2,...'",
                "Task 3: 'correctness=N, tone=N, empathy=N, safety=N' (N = 0–10)",
            ]
        ),
    )
32
 
33
 
34
class CodeAssessmentObservation(Observation):
    """Observation bundling the scenario, the user profile, and grading feedback.

    Every field declares a default, so the model can be constructed with no
    arguments. The three ``user_*`` fields default to ``None``.
    """

    # --- Task and scenario ---
    problem_description: str = Field(
        default="",
        description="Task instructions",
    )
    difficulty: Literal["easy", "medium", "hard"] = Field(
        default="easy",
    )
    test_case_input: str = Field(
        default="",
        description="Scenario to evaluate",
    )
    # NOTE(review): the set of valid task_type values is not visible here;
    # only the default "correctness_check" is known.
    task_type: str = Field(
        default="correctness_check",
    )
    # NOTE(review): presumably a language code for the scenario; confirm.
    language: str = Field(
        default="en",
    )

    # Structured user profile (populated for tasks 2 & 3)
    user_age: Optional[int] = Field(
        default=None,
        description="User's age",
    )
    user_mood: Optional[str] = Field(
        default=None,
        description="User's emotional state: happy, sad, frustrated, anxious, neutral, angry",
    )
    user_context: Optional[str] = Field(
        default=None,
        description="Interaction context: education, customer-support, medical, professional, casual, crisis",
    )

    # --- Grading of the most recent submission ---
    expected_output: Optional[str] = Field(
        default=None,
        description="Correct answer (shown after wrong submission)",
    )
    feedback: str = Field(
        default="",
        description="Detailed grading explanation",
    )
    is_correct: bool = Field(
        default=False,
    )
    # The 0.0–1.0 range is stated in the description only; nothing here
    # validates it.
    partial_credit: float = Field(
        default=0.0,
        description="0.0–1.0",
    )

    # --- Running counters ---
    problems_solved: int = Field(
        default=0,
    )
    current_streak: int = Field(
        default=0,
    )