# Spaces: Dev-CrafterX / preference-lab (Sleeping)
# File size: 5,124 Bytes

"""
Data models for PreferenceLab Environment.

PreferenceLab simulates the RLHF preference data collection pipeline.
An AI agent learns to judge which LLM responses are better using
gold-standard labels from real preference datasets as ground truth.

Tasks:
    Task 1 (Easy):   Pairwise ranking — pick A or B (or declare a tie / skip)
    Task 2 (Medium): Multi-axis Likert scoring — four axes, 1-5 each
    Task 3 (Hard):   Transitive consistency chain ranking — order four responses best to worst
"""

from typing import Literal, Optional

from pydantic import Field, field_validator

from openenv.core.env_server.types import Action, Observation


# ── Actions ───────────────────────────────────────────────────

class PairwiseAction(Action):
    """Action for Task 1: choose which response is better."""

    # Required verdict; only the four labels in the Literal are accepted.
    choice: Literal["A", "B", "tie", "skip"] = Field(
        ...,
        description=(
            "Choose 'A' if response A is better, "
            "'B' if response B is better, "
            "'tie' if they are equal, "
            "or 'skip' if undecidable."
        ),
    )

    # Free-text rationale; may be omitted, in which case it is None.
    justification: Optional[str] = Field(
        None,
        description="Optional reasoning for the choice (not used for grading).",
    )

    # Self-reported certainty, bounded to [0.0, 1.0]; 0.8 when not supplied.
    confidence: float = Field(
        0.8,
        description="Annotator confidence (0.0-1.0)",
        ge=0.0,
        le=1.0,
    )


class LikertAction(Action):
    """Action for Task 2: score a response on 4 quality axes (1-5 each)."""

    # All four axes are required integers constrained to the range 1..5.
    helpfulness: int = Field(
        ...,
        description=(
            "How helpful is the response? "
            "1=not helpful, 5=extremely helpful."
        ),
        ge=1,
        le=5,
    )
    honesty: int = Field(
        ...,
        description=(
            "How honest and accurate is the response? "
            "1=misleading, 5=fully honest."
        ),
        ge=1,
        le=5,
    )
    harmlessness: int = Field(
        ...,
        description=(
            "How harmless is the response? "
            "1=harmful, 5=completely safe."
        ),
        ge=1,
        le=5,
    )
    instruction_following: int = Field(
        ...,
        description=(
            "How well does the response follow the instruction? "
            "1=ignores it, 5=follows perfectly."
        ),
        ge=1,
        le=5,
    )


class ConsistencyAction(Action):
    """Action for Task 3: rank 4 responses (A, B, C, D) from best to worst.

    The ranking must be a permutation of the four response IDs: each of
    'A', 'B', 'C' and 'D' appears exactly once.
    """

    # The element type and the length bounds restrict the value to exactly
    # four IDs drawn from A-D; uniqueness is checked by the validator below.
    ranking: list[Literal["A", "B", "C", "D"]] = Field(
        ...,
        min_length=4,
        max_length=4,
        description="List of 4 response IDs ordered best to worst, e.g. ['B', 'A', 'D', 'C'].",
    )

    @field_validator("ranking")
    @classmethod
    def ranking_has_no_duplicates(cls, ranking: list[str]) -> list[str]:
        """Reject rankings that repeat a response ID.

        Without this check a value such as ['A', 'A', 'A', 'A'] satisfies the
        length and element-type constraints even though it is not an ordering
        of the four responses.

        Args:
            ranking: The candidate ranking. The validator uses pydantic's
                default mode, so this is expected to be the value after the
                field's own type and length validation.

        Returns:
            The ranking, unchanged.

        Raises:
            ValueError: If any response ID appears more than once.
        """
        if len(set(ranking)) != len(ranking):
            raise ValueError(
                "ranking must list each response ID exactly once; "
                f"got {ranking!r}"
            )
        return ranking


# ── Observations ──────────────────────────────────────────────

class PairwiseObservation(Observation):
    """Observation for Task 1: a prompt with two candidate responses."""

    # --- Task payload (task_id, prompt and both responses are required) ---
    task_id: str = Field(
        ...,
        description="Unique task identifier.",
    )
    task_type: Literal["pairwise"] = Field("pairwise")
    prompt: str = Field(
        ...,
        description="The user prompt / instruction.",
    )
    response_a: str = Field(
        ...,
        description="Candidate response A.",
    )
    response_b: str = Field(
        ...,
        description="Candidate response B.",
    )

    # --- Step bookkeeping (every field below has a default) ---
    reward: float = Field(
        0.0,
        description="Reward signal from last step.",
    )
    done: bool = Field(
        False,
        description="Whether the episode is complete.",
    )
    step_count: int = Field(
        0,
        description="Current step within the episode.",
    )
    # default_factory gives each instance its own fresh dict.
    info: dict = Field(
        description="Extra debug info.",
        default_factory=dict,
    )


class LikertObservation(Observation):
    """Observation for Task 2: a prompt + single response to score on multiple axes."""

    # --- Task payload (task_id, prompt, response and rubric are required) ---
    task_id: str = Field(
        ...,
        description="Unique task identifier.",
    )
    task_type: Literal["likert"] = Field("likert")
    prompt: str = Field(
        ...,
        description="The user prompt / instruction.",
    )
    response: str = Field(
        ...,
        description="The response to evaluate.",
    )
    rubric: str = Field(
        ...,
        description="Scoring rubric to guide evaluation.",
    )

    # --- Step bookkeeping (every field below has a default) ---
    reward: float = Field(
        0.0,
        description="Reward signal from last step.",
    )
    done: bool = Field(
        False,
        description="Whether the episode is complete.",
    )
    step_count: int = Field(
        0,
        description="Current step within the episode.",
    )
    # default_factory gives each instance its own fresh dict.
    info: dict = Field(
        description="Extra debug info.",
        default_factory=dict,
    )


class ConsistencyObservation(Observation):
    """Observation for Task 3: a prompt + 4 responses to rank transitively."""

    # --- Task payload (task_id, prompt and all four responses are required) ---
    task_id: str = Field(
        ...,
        description="Unique task identifier.",
    )
    task_type: Literal["consistency"] = Field("consistency")
    prompt: str = Field(
        ...,
        description="The user prompt / instruction.",
    )
    response_a: str = Field(
        ...,
        description="Candidate response A.",
    )
    response_b: str = Field(
        ...,
        description="Candidate response B.",
    )
    response_c: str = Field(
        ...,
        description="Candidate response C.",
    )
    response_d: str = Field(
        ...,
        description="Candidate response D.",
    )

    # --- Step bookkeeping (every field below has a default) ---
    reward: float = Field(
        0.0,
        description="Reward signal from last step.",
    )
    done: bool = Field(
        False,
        description="Whether the episode is complete.",
    )
    step_count: int = Field(
        0,
        description="Current step within the episode.",
    )
    # default_factory gives each instance its own fresh dict.
    info: dict = Field(
        description="Extra debug info.",
        default_factory=dict,
    )