Spaces:

TulasiSankar
/

code_assessment_env

Running

App Files Files Community

Update models.py

by rsaibhargav - opened Apr 5

base: refs/heads/main

←

from: refs/pr/9

Discussion Files changed

+39

-16

Files changed (1) hide show

models.py +39 -16

models.py CHANGED Viewed

@@ -5,10 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 """
-Data models for the Code Output Assessment Environment.
-This environment tests an agent's ability to produce correct outputs for coding problems
-across three difficulty levels: easy, medium, and hard.
 """
 from typing import Literal, Optional
@@ -17,20 +19,41 @@ from pydantic import Field
 class CodeAssessmentAction(Action):
-    """Action for submitting an answer to a coding problem."""
-    answer: str = Field(..., description="The agent's answer/output for the current problem")
 class CodeAssessmentObservation(Observation):
-    """Observation containing the problem, feedback, and assessment results."""
-    problem_description: str = Field(default="", description="Description of the coding problem")
-    difficulty: Literal["easy", "medium", "hard"] = Field(default="easy", description="Difficulty level")
-    test_case_input: str = Field(default="", description="Input for the current test case")
-    expected_output: Optional[str] = Field(default=None, description="Expected output (shown only after submission)")
-    feedback: str = Field(default="", description="Feedback on the submitted answer")
-    is_correct: bool = Field(default=False, description="Whether the answer was correct")
-    partial_credit: float = Field(default=0.0, description="Partial credit score (0.0 to 1.0)")
-    problems_solved: int = Field(default=0, description="Total number of problems solved so far")
-    current_streak: int = Field(default=0, description="Current streak of correct answers")

 # LICENSE file in the root directory of this source tree.
 """
+Data models for the AI Response Evaluation Environment.
+Three tasks:
+  1. Correctness & Instruction Adherence (easy)
+  2. Tone & Audience Appropriateness with structured user profiles (medium)
+  3. Multi-dimensional Quality Scoring — correctness, tone, empathy, safety (hard)
 """
 from typing import Literal, Optional
 class CodeAssessmentAction(Action):
+    """Action for submitting an evaluation judgment."""
+    answer: str = Field(
+        ...,
+        description=(
+            "Task 1: 'correct|incorrect|partially-correct, reason'\n"
+            "Task 2: 'appropriate|needs-adjustment|inappropriate, issue1,issue2,...'\n"
+            "Task 3: 'correctness=N, tone=N, empathy=N, safety=N' (N = 0–10)"
+        ),
+    )
 class CodeAssessmentObservation(Observation):
+    """Observation with scenario, user profile, and grading feedback."""
+    problem_description: str = Field(default="", description="Task instructions")
+    difficulty: Literal["easy", "medium", "hard"] = Field(default="easy")
+    test_case_input: str = Field(default="", description="Scenario to evaluate")
+    task_type: str = Field(default="correctness_check")
+    language: str = Field(default="en")
+    # Structured user profile (populated for tasks 2 & 3)
+    user_age: Optional[int] = Field(default=None, description="User's age")
+    user_mood: Optional[str] = Field(
+        default=None,
+        description="User's emotional state: happy, sad, frustrated, anxious, neutral, angry",
+    )
+    user_context: Optional[str] = Field(
+        default=None,
+        description="Interaction context: education, customer-support, medical, professional, casual, crisis",
+    )
+    expected_output: Optional[str] = Field(default=None, description="Correct answer (shown after wrong submission)")
+    feedback: str = Field(default="", description="Detailed grading explanation")
+    is_correct: bool = Field(default=False)
+    partial_credit: float = Field(default=0.0, description="0.0–1.0")
+    problems_solved: int = Field(default=0)
+    current_streak: int = Field(default=0)