File size: 5,124 Bytes
cdf485e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4c268d
447a565
 
a4c268d
cdf485e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ee1380
cdf485e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
Data models for PreferenceLab Environment.

PreferenceLab simulates the RLHF preference data collection pipeline.
An AI agent learns to judge which LLM responses are better using
gold-standard labels from real preference datasets as ground truth.

Tasks:
    Task 1 (Easy):   Pairwise ranking — pick A or B
    Task 2 (Medium): Multi-axis Likert scoring
    Task 3 (Hard):   Transitive consistency chain ranking
"""

from typing import Literal, Optional

from pydantic import Field, field_validator

from openenv.core.env_server.types import Action, Observation


# ── Actions ───────────────────────────────────────────────────

class PairwiseAction(Action):
    """Task 1 action: a pairwise preference verdict between responses A and B.

    Fields:
        choice: Required; one of "A", "B", "tie" or "skip".
        justification: Optional free-text reasoning; defaults to None.
        confidence: Float constrained to 0.0 <= value <= 1.0; defaults to 0.8.
    """

    choice: Literal["A", "B", "tie", "skip"] = Field(
        ...,
        description=(
            "Choose 'A' if response A is better, "
            "'B' if response B is better, "
            "'tie' if they are equal, "
            "or 'skip' if undecidable."
        ),
    )
    justification: Optional[str] = Field(
        None,
        description="Optional reasoning for the choice (not used for grading).",
    )
    confidence: float = Field(
        0.8,
        description="Annotator confidence (0.0-1.0)",
        ge=0.0,
        le=1.0,
    )


class LikertAction(Action):
    """Task 2 action: integer scores for a single response on four axes.

    All four axes (helpfulness, honesty, harmlessness, instruction_following)
    are required and each is constrained to the inclusive range 1-5.
    """

    helpfulness: int = Field(
        ...,
        description="How helpful is the response? 1=not helpful, 5=extremely helpful.",
        ge=1,
        le=5,
    )
    honesty: int = Field(
        ...,
        description="How honest and accurate is the response? 1=misleading, 5=fully honest.",
        ge=1,
        le=5,
    )
    harmlessness: int = Field(
        ...,
        description="How harmless is the response? 1=harmful, 5=completely safe.",
        ge=1,
        le=5,
    )
    instruction_following: int = Field(
        ...,
        description="How well does the response follow the instruction? 1=ignores it, 5=follows perfectly.",
        ge=1,
        le=5,
    )


class ConsistencyAction(Action):
    """Action for Task 3: rank 4 responses (A, B, C, D) from best to worst.

    A valid ranking is a permutation of the four response IDs: exactly four
    entries, each one of "A", "B", "C" or "D", with no ID repeated.
    """

    ranking: list[Literal["A", "B", "C", "D"]] = Field(
        ...,
        min_length=4,
        max_length=4,
        description="List of 4 response IDs ordered best to worst, e.g. ['B', 'A', 'D', 'C'].",
    )

    @field_validator("ranking")
    @classmethod
    def ranking_must_be_permutation(cls, value: list[str]) -> list[str]:
        """Reject rankings that repeat a response ID.

        The field constraints already guarantee exactly four entries drawn
        from A-D, so uniqueness is the only remaining condition for the list
        to be a full ordering of the four responses.

        Args:
            value: The ranking after the length and literal checks passed.

        Returns:
            The ranking unchanged.

        Raises:
            ValueError: If any response ID appears more than once; pydantic
                reports it as a validation error on ``ranking``.
        """
        if len(set(value)) != len(value):
            repeated = sorted({item for item in value if value.count(item) > 1})
            raise ValueError(
                "ranking must contain each of 'A', 'B', 'C', 'D' exactly once; "
                f"repeated: {repeated}"
            )
        return value


# ── Observations ──────────────────────────────────────────────

class PairwiseObservation(Observation):
    """Task 1 observation: one prompt together with candidate responses A and B.

    ``task_id``, ``prompt``, ``response_a`` and ``response_b`` are required;
    the remaining fields fall back to the defaults declared below.
    """

    task_id: str = Field(
        ...,
        description="Unique task identifier.",
    )
    task_type: Literal["pairwise"] = Field("pairwise")
    prompt: str = Field(
        ...,
        description="The user prompt / instruction.",
    )
    response_a: str = Field(
        ...,
        description="Candidate response A.",
    )
    response_b: str = Field(
        ...,
        description="Candidate response B.",
    )
    reward: float = Field(
        0.0,
        description="Reward signal from last step.",
    )
    done: bool = Field(
        False,
        description="Whether the episode is complete.",
    )
    step_count: int = Field(
        0,
        description="Current step within the episode.",
    )
    info: dict = Field(
        description="Extra debug info.",
        default_factory=dict,
    )


class LikertObservation(Observation):
    """Task 2 observation: one prompt, one response to score, and a rubric.

    ``task_id``, ``prompt``, ``response`` and ``rubric`` are required; the
    remaining fields fall back to the defaults declared below.
    """

    task_id: str = Field(
        ...,
        description="Unique task identifier.",
    )
    task_type: Literal["likert"] = Field("likert")
    prompt: str = Field(
        ...,
        description="The user prompt / instruction.",
    )
    response: str = Field(
        ...,
        description="The response to evaluate.",
    )
    rubric: str = Field(
        ...,
        description="Scoring rubric to guide evaluation.",
    )
    reward: float = Field(
        0.0,
        description="Reward signal from last step.",
    )
    done: bool = Field(
        False,
        description="Whether the episode is complete.",
    )
    step_count: int = Field(
        0,
        description="Current step within the episode.",
    )
    info: dict = Field(
        description="Extra debug info.",
        default_factory=dict,
    )


class ConsistencyObservation(Observation):
    """Task 3 observation: one prompt with four candidate responses (A-D) to rank.

    ``task_id``, ``prompt`` and the four ``response_*`` fields are required;
    the remaining fields fall back to the defaults declared below.
    """

    task_id: str = Field(
        ...,
        description="Unique task identifier.",
    )
    task_type: Literal["consistency"] = Field("consistency")
    prompt: str = Field(
        ...,
        description="The user prompt / instruction.",
    )
    response_a: str = Field(
        ...,
        description="Candidate response A.",
    )
    response_b: str = Field(
        ...,
        description="Candidate response B.",
    )
    response_c: str = Field(
        ...,
        description="Candidate response C.",
    )
    response_d: str = Field(
        ...,
        description="Candidate response D.",
    )
    reward: float = Field(
        0.0,
        description="Reward signal from last step.",
    )
    done: bool = Field(
        False,
        description="Whether the episode is complete.",
    )
    step_count: int = Field(
        0,
        description="Current step within the episode.",
    )
    info: dict = Field(
        description="Extra debug info.",
        default_factory=dict,
    )