"""
Data models for PreferenceLab Environment.
PreferenceLab simulates the RLHF preference data collection pipeline.
An AI agent learns to judge which LLM responses are better using
gold-standard labels from real preference datasets as ground truth.
Tasks:
Task 1 (Easy): Pairwise ranking — pick A or B
Task 2 (Medium): Multi-axis Likert scoring
Task 3 (Hard): Transitive consistency chain ranking
"""
from typing import Literal, Optional

from openenv.core.env_server.types import Action, Observation
from pydantic import Field, field_validator
# ── Actions ──────────────────────────────────────────────────────
class PairwiseAction(Action):
    """Task 1 action: the verdict on a pair of candidate responses.

    Fields:
        choice: Required. One of "A", "B", "tie" or "skip".
        justification: Optional free-text reasoning; None when omitted.
        confidence: Float constrained to the closed range [0.0, 1.0];
            0.8 when omitted.
    """

    # Required verdict; the Literal type limits it to these four values.
    choice: Literal["A", "B", "tie", "skip"] = Field(
        default=...,
        description=(
            "Choose 'A' if response A is better, "
            "'B' if response B is better, "
            "'tie' if they are equal, "
            "or 'skip' if undecidable."
        ),
    )

    # Free-text rationale; may be left unset.
    justification: Optional[str] = Field(
        None,
        description="Optional reasoning for the choice (not used for grading).",
    )

    # Self-reported certainty, bounded on both sides by ge/le.
    confidence: float = Field(
        0.8,
        description="Annotator confidence (0.0-1.0)",
        ge=0.0,
        le=1.0,
    )
class LikertAction(Action):
    """Task 2 action: integer scores for one response on four quality axes.

    Every field is required and constrained to the inclusive range 1-5.

    Fields:
        helpfulness: 1 = not helpful, 5 = extremely helpful.
        honesty: 1 = misleading, 5 = fully honest.
        harmlessness: 1 = harmful, 5 = completely safe.
        instruction_following: 1 = ignores the instruction,
            5 = follows it perfectly.
    """

    helpfulness: int = Field(
        default=...,
        description="How helpful is the response? 1=not helpful, 5=extremely helpful.",
        ge=1,
        le=5,
    )

    honesty: int = Field(
        default=...,
        description="How honest and accurate is the response? 1=misleading, 5=fully honest.",
        ge=1,
        le=5,
    )

    harmlessness: int = Field(
        default=...,
        description="How harmless is the response? 1=harmful, 5=completely safe.",
        ge=1,
        le=5,
    )

    instruction_following: int = Field(
        default=...,
        description="How well does the response follow the instruction? 1=ignores it, 5=follows perfectly.",
        ge=1,
        le=5,
    )
class ConsistencyAction(Action):
    """Task 3 action: rank the four responses A, B, C and D from best to worst.

    Fields:
        ranking: Required list of exactly four response IDs, best first.
            Each ID must be one of "A", "B", "C", "D" and may appear only
            once, so a valid value is always a permutation of all four.

    Raises:
        pydantic.ValidationError: If the list is not exactly four items long,
            contains an ID outside A-D, or repeats an ID.
    """

    ranking: list[Literal["A", "B", "C", "D"]] = Field(
        ...,
        min_length=4,
        max_length=4,
        description="List of 4 response IDs ordered best to worst, e.g. ['B', 'A', 'D', 'C'].",
    )

    @field_validator("ranking")
    @classmethod
    def ranking_must_be_permutation(cls, value: list[str]) -> list[str]:
        """Reject rankings that list the same response ID more than once.

        The length and Literal constraints alone would accept inputs such as
        ['A', 'A', 'A', 'A'], which do not describe an ordering of the four
        responses. With exactly four items drawn from four allowed IDs,
        forbidding repeats is enough to guarantee a full permutation.
        """
        if len(set(value)) != len(value):
            raise ValueError(
                "ranking must contain each of 'A', 'B', 'C', 'D' exactly once"
            )
        return value
# ── Observations ─────────────────────────────────────────────────
class PairwiseObservation(Observation):
    """Task 1 observation: one prompt together with two candidate responses.

    Required fields: task_id, prompt, response_a, response_b.
    Defaulted fields: task_type ("pairwise"), reward (0.0), done (False),
    step_count (0), info (a new empty dict).
    """

    # --- Task identity -------------------------------------------------
    task_id: str = Field(default=..., description="Unique task identifier.")
    task_type: Literal["pairwise"] = Field("pairwise")

    # --- Content to be judged -------------------------------------------
    prompt: str = Field(default=..., description="The user prompt / instruction.")
    response_a: str = Field(default=..., description="Candidate response A.")
    response_b: str = Field(default=..., description="Candidate response B.")

    # --- Episode bookkeeping --------------------------------------------
    reward: float = Field(0.0, description="Reward signal from last step.")
    done: bool = Field(False, description="Whether the episode is complete.")
    step_count: int = Field(0, description="Current step within the episode.")
    # A factory is used so the default dict is built fresh for each instance.
    info: dict = Field(description="Extra debug info.", default_factory=dict)
class LikertObservation(Observation):
    """Task 2 observation: one prompt, one response, and a scoring rubric.

    Required fields: task_id, prompt, response, rubric.
    Defaulted fields: task_type ("likert"), reward (0.0), done (False),
    step_count (0), info (a new empty dict).
    """

    # --- Task identity -------------------------------------------------
    task_id: str = Field(default=..., description="Unique task identifier.")
    task_type: Literal["likert"] = Field("likert")

    # --- Content to be scored -------------------------------------------
    prompt: str = Field(default=..., description="The user prompt / instruction.")
    response: str = Field(default=..., description="The response to evaluate.")
    rubric: str = Field(default=..., description="Scoring rubric to guide evaluation.")

    # --- Episode bookkeeping --------------------------------------------
    reward: float = Field(0.0, description="Reward signal from last step.")
    done: bool = Field(False, description="Whether the episode is complete.")
    step_count: int = Field(0, description="Current step within the episode.")
    # A factory is used so the default dict is built fresh for each instance.
    info: dict = Field(description="Extra debug info.", default_factory=dict)
class ConsistencyObservation(Observation):
    """Task 3 observation: one prompt together with four candidate responses.

    Required fields: task_id, prompt, response_a, response_b, response_c,
    response_d.
    Defaulted fields: task_type ("consistency"), reward (0.0), done (False),
    step_count (0), info (a new empty dict).
    """

    # --- Task identity -------------------------------------------------
    task_id: str = Field(default=..., description="Unique task identifier.")
    task_type: Literal["consistency"] = Field("consistency")

    # --- Content to be ranked -------------------------------------------
    prompt: str = Field(default=..., description="The user prompt / instruction.")
    response_a: str = Field(default=..., description="Candidate response A.")
    response_b: str = Field(default=..., description="Candidate response B.")
    response_c: str = Field(default=..., description="Candidate response C.")
    response_d: str = Field(default=..., description="Candidate response D.")

    # --- Episode bookkeeping --------------------------------------------
    reward: float = Field(0.0, description="Reward signal from last step.")
    done: bool = Field(False, description="Whether the episode is complete.")
    step_count: int = Field(0, description="Current step within the episode.")
    # A factory is used so the default dict is built fresh for each instance.
    info: dict = Field(description="Extra debug info.", default_factory=dict)