File size: 4,838 Bytes
c06cf60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e461841
c06cf60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e461841
c06cf60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e461841
c06cf60
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
models/models.py
OpenEnv-compliant Pydantic models.

Three public models:
  Observation  → what the agent sees
  Action       → what the agent does
  Reward       → structured reward signal

One internal model (used by env.py):
  InternalState → hidden ground truth for grading
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional
from pydantic import BaseModel, field_validator, model_validator


# ---------------------------------------------------------------------------
# 1. Message (building block for conversation_history)
# ---------------------------------------------------------------------------

class Message(BaseModel):
    """
    One conversational turn.

    Fields:
      role     Speaker label. Intended values are "user" and "assistant";
               this is a convention only, no validator enforces it.
      content  Text of the turn.
    """
    role: str
    content: str


# ---------------------------------------------------------------------------
# 2. Observation — what the agent receives at each step
# ---------------------------------------------------------------------------

class Observation(BaseModel):
    """
    Agent-visible view of the task.

    Fields:
      instruction           The original task string (never changes).
      known_info            Facts gathered so far, e.g. {"time": "10 AM"}.
      constraints           Free-form mapping with string keys. Its contents
                            are not defined in this module.
                            NOTE(review): presumably task constraints shown
                            to the agent; confirm against the environment.
      conversation_history  Full question/answer history as Message objects.
      last_response         The environment's most recent reply, or None.
    """
    # Required: every observation carries the task text.
    instruction: str

    # Optional context; each defaults to an empty container.
    known_info: Dict[str, str] = {}
    constraints: Dict[str, Any] = {}
    conversation_history: List[Message] = []

    # Absent (None) unless a reply has been recorded.
    last_response: Optional[str] = None


# ---------------------------------------------------------------------------
# 3. Action — what the agent can do
# ---------------------------------------------------------------------------

class Action(BaseModel):
    """
    A single agent action.

    Supported values of ``type``:
      "ask"      ask a clarification question
      "execute"  attempt to complete the task

    Validation rules:
      - ``type`` must be exactly "ask" or "execute".
      - "ask" needs a ``question`` that is non-empty after stripping
        whitespace.
      - "execute" needs ``proposed_time`` or ``proposed_participants`` to be
        set. Only ``None`` counts as missing, so an empty string or an empty
        list is accepted.

    Raises:
      ValueError from the validators below when a rule is violated.
    """
    type: str                                          # "ask" | "execute"
    question: Optional[str] = None                     # used by "ask"
    proposed_time: Optional[str] = None                # used by "execute"
    proposed_participants: Optional[List[str]] = None  # used by "execute"
    proposed_location: Optional[str] = None            # used by "execute"; never required

    @field_validator("type")
    @classmethod
    def type_must_be_valid(cls, v: str) -> str:
        """Accept only the two known action types; reject anything else."""
        if v in ("ask", "execute"):
            return v
        raise ValueError(f"Action type must be 'ask' or 'execute', got '{v}'")

    @model_validator(mode="after")
    def check_fields_for_type(self) -> "Action":
        """Check that the fields required by the chosen ``type`` are present."""
        if self.type == "ask":
            # None, "" and whitespace-only questions are all treated as missing.
            has_question = bool(self.question and self.question.strip())
            if not has_question:
                raise ValueError("Action type 'ask' requires a non-empty 'question'.")
        elif self.type == "execute":
            nothing_proposed = (
                self.proposed_time is None
                and self.proposed_participants is None
            )
            if nothing_proposed:
                raise ValueError(
                    "Action type 'execute' requires at least 'proposed_time' "
                    "or 'proposed_participants'."
                )
        return self


# ---------------------------------------------------------------------------
# 4. Reward — structured reward signal
# ---------------------------------------------------------------------------

class Reward(BaseModel):
    """
    Structured reward returned by the environment.

    Fields:
      score   Always stored within [0.0, 1.0]. Out-of-range inputs are
              clamped to the nearest bound; NaN is stored as 0.0.
      reason  Human-readable explanation (optional).
    """
    score: float
    reason: Optional[str] = None

    @field_validator("score")
    @classmethod
    def clamp_score(cls, v: float) -> float:
        """
        Hard clamp: the returned score is always in [0.0, 1.0].

        NaN is mapped to 0.0. Without this guard, ``min(1.0, nan)`` evaluates
        to 1.0 (every comparison with NaN is false), so an undefined score
        would be recorded as the maximum reward.
        """
        import math  # function-scope import keeps this block self-contained

        if math.isnan(v):
            return 0.0
        return max(0.0, min(1.0, v))


# ---------------------------------------------------------------------------
# 5. InternalState — hidden ground truth (used by env.py, not exposed)
# ---------------------------------------------------------------------------

class InternalState(BaseModel):
    """
    Hidden ground truth for the environment / grader.

    Not meant to be sent to the agent directly.

    Fields:
      true_time          Correct answer for the time field.
      true_participants  Correct answer for participants.
      true_location      Correct answer for location (optional).
      constraints        Free-form mapping with string keys. Its contents are
                         not defined in this module.
                         NOTE(review): presumably the task's constraints;
                         confirm against env.py.
      collected_info     What has been revealed so far via Q&A.
      question_count     Number of questions the agent has asked.
      done               Whether the episode is finished.
    """
    # Ground-truth answers and task constraints.
    true_time: str = ""
    true_participants: List[str] = []
    true_location: Optional[str] = None
    constraints: Dict[str, Any] = {}

    # Episode progress tracking.
    collected_info: Dict[str, str] = {}
    question_count: int = 0
    done: bool = False