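# NOTE: hosting-page header captured with the file; kept as a comment so the
# module parses.
# Spaces: 5ivatej / meta-hackathon (Sleeping)
# File size: 3,618 Bytes
# 807d5cc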

"""Typed Pydantic models for the ESC OpenEnv environment.

Defines the Action, Observation, Reward, and result envelopes used across the
HTTP boundary (server.py) and the in-process env (env.py).
"""
from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class Action(BaseModel):
    """A single agent turn: one free-text reply addressed to the seeker."""

    message: str = Field(
        ...,
        description="Agent's reply to the seeker.",
    )


class Observation(BaseModel):
    """Per-turn view of the conversation handed to the agent.

    Partial observability is deliberate: the seeker's internal state
    (distress, trust, openness, true_issue) never appears here. The agent
    gets only the seeker's *utterance* plus a few coarse hints, which is
    what makes this env RL-native.
    """

    seeker_utterance: str = Field(
        ...,
        description="The seeker's latest message.",
    )
    turn: int = Field(
        ...,
        description="1-indexed conversation turn.",
    )
    remaining_turns: int = Field(
        ...,
        description="Turns left before forced close.",
    )
    # Typed as a plain str: the five phase names appear in the description
    # only and are not enforced by validation on this model.
    stage_hint: str = Field(
        ...,
        description=(
            "Coarse public hint about conversational phase: "
            "one of 'opening', 'exploring', 'reflecting', 'planning', 'closing'."
        ),
    )
    task_id: str = Field(
        ...,
        description="Currently active task id.",
    )
    scenario_brief: str = Field(
        ...,
        description=(
            "One-line scenario framing shown once at reset "
            "(kept in obs for convenience)."
        ),
    )


class Reward(BaseModel):
    """Itemised reward for one environment step.

    The agent only sees the scalar `value`; the remaining fields expose how
    it was put together, for transparency and debugging.
    """

    # The only range-constrained field: values outside [0, 1] fail validation.
    value: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Clipped step reward in [0,1].",
    )
    immediate: float = Field(
        ...,
        description="Immediate turn-level component (empathy, stage-fit).",
    )
    future_oriented: float = Field(
        ...,
        description=(
            "Future-oriented component: k-step lookahead over the "
            "deterministic seeker dynamics, comparing this action's "
            "projected resolution progress against the oracle ceiling "
            "(RLFF-ESC style)."
        ),
    )
    penalties: float = Field(
        ...,
        description="Summed penalties (dismissive, premature advice, loops).",
    )
    # default_factory gives every instance its own empty dict.
    components: Dict[str, float] = Field(
        default_factory=dict,
        description="Sub-component breakdown.",
    )


class StepResult(BaseModel):
    """Envelope returned by env.step().

    Bundles an observation, the scalar reward, its detailed breakdown, the
    `done` flag, and a free-form `info` dict.
    """

    observation: Observation
    # Scalar reward. NOTE(review): presumably mirrors `reward_detail.value`,
    # but nothing in this model enforces that — confirm against env.step().
    reward: float
    reward_detail: Reward
    done: bool
    # Free-form extras; default_factory gives each instance its own dict.
    info: Dict[str, Any] = Field(default_factory=dict)


class ResetResult(BaseModel):
    """Envelope returned by env.reset().

    Carries an observation plus a free-form `info` dict; unlike `StepResult`
    it has no reward or `done` fields.
    """

    observation: Observation
    # Free-form extras; default_factory gives each instance its own dict.
    info: Dict[str, Any] = Field(default_factory=dict)


class EnvState(BaseModel):
    """Publicly visible snapshot of the environment, as returned by env.state().

    Carries public bookkeeping only; the seeker's hidden variables are
    deliberately left out.
    """

    task_id: str
    turn: int
    max_turns: int
    done: bool
    cumulative_reward: float
    # default_factory gives every instance its own empty list.
    transcript: List[Dict[str, str]] = Field(
        description="List of {'role': 'seeker'|'agent', 'text': str} entries.",
        default_factory=list,
    )


# ------- Request schemas for the HTTP server -------


class ResetRequest(BaseModel):
    """Request body for a reset call; both fields are optional."""

    task_id: Optional[str] = Field(
        default=None,
        description=(
            "Optional task id. "
            "If omitted, defaults to 'work_stress_venting'."
        ),
    )
    seed: Optional[int] = Field(
        default=None,
        description="Optional seed (reserved; env is deterministic).",
    )


class StepRequest(BaseModel):
    """Request body for a step call: wraps the agent's `Action`."""

    action: Action