File size: 5,821 Bytes
0b6a889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""Task definitions for the FinePrint OpenEnv environment.

Defines three progressively harder policy compliance scenarios:
  1. quote_accuracy     (EASY)   - Quote policies correctly with no drift.
  2. drift_detection    (MEDIUM) - Detect and adapt to policy changes.
  3. compliance_storm   (HARD)   - Full compliance under heavy drift.

Each task configures the environment with specific workflows, drift
parameters, and grading criteria.
"""

from __future__ import annotations

# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

TASK_IDS: list[str] = [
    "quote_accuracy",
    "drift_detection",
    "compliance_storm",
]


def get_task(task_id: str) -> dict:
    """Return the full task configuration for *task_id*.

    Returns
    -------
    dict with keys:
        task_id            – str
        description        – human-readable problem statement
        workflows          – list[str]  workflow names to run
        max_versions       – int  how many policy versions are available
        drift_probability  – float
        silent_drift_ratio – float
        max_steps          – int  agent step budget
    """
    builders = {
        "quote_accuracy": _build_quote_accuracy,
        "drift_detection": _build_drift_detection,
        "compliance_storm": _build_compliance_storm,
    }

    builder = builders.get(task_id)
    if builder is None:
        raise ValueError(
            f"Unknown task_id {task_id!r}. "
            f"Valid IDs: {', '.join(TASK_IDS)}"
        )
    return builder()


# ---------------------------------------------------------------------------
# Task 1 β€” quote_accuracy (EASY)
# ---------------------------------------------------------------------------

def _build_quote_accuracy() -> dict:
    """Two workflows, zero drift. Tests basic policy quoting accuracy."""

    return {
        "task_id": "quote_accuracy",
        "description": (
            "You are a customer service agent. Handle the following customer "
            "workflows by quoting company policies accurately.\n\n"
            "There are NO policy changes during this task β€” just quote "
            "the current policies correctly.\n\n"
            "Workflows: shopping checkout, product return\n\n"
            "Tips:\n"
            "- Use 'view_policies' to see current policy values\n"
            "- Use 'quote_policy' with the correct field path and value\n"
            "- Use 'respond_to_user' for non-policy messages\n"
            "- Use 'take_action' to process workflow steps\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return"],
        "max_versions": 1,
        "drift_probability": 0.0,
        "silent_drift_ratio": 0.0,
        "max_steps": 20,
    }


# ---------------------------------------------------------------------------
# Task 2 β€” drift_detection (MEDIUM)
# ---------------------------------------------------------------------------

def _build_drift_detection() -> dict:
    """Three workflows with moderate drift. Tests drift detection."""

    return {
        "task_id": "drift_detection",
        "description": (
            "You are a customer service agent handling multiple workflows. "
            "Company policies may change mid-conversation without warning.\n\n"
            "You must detect when policies have changed and adapt your "
            "responses accordingly. Quoting stale policies is penalized.\n\n"
            "Workflows: shopping checkout, product return, subscription signup\n\n"
            "Tips:\n"
            "- Use 'request_verification' periodically to check for updates\n"
            "- Watch for system notifications about policy changes\n"
            "- After verification, re-read policies with 'view_policies'\n"
            "- Stale policy quotes (from old versions) incur heavy penalties\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return", "subscribe"],
        "max_versions": 4,
        "drift_probability": 0.30,
        "silent_drift_ratio": 0.50,
        "max_steps": 30,
    }


# ---------------------------------------------------------------------------
# Task 3 β€” compliance_storm (HARD)
# ---------------------------------------------------------------------------

def _build_compliance_storm() -> dict:
    """All five workflows with aggressive silent drift. Full compliance test."""

    return {
        "task_id": "compliance_storm",
        "description": (
            "You are a customer service agent under extreme conditions. "
            "Handle ALL customer workflows while policies change frequently "
            "and silently. Critical policy fields may change without notice.\n\n"
            "This task tests your ability to maintain compliance under "
            "pressure β€” balancing workflow progress with policy freshness.\n\n"
            "Workflows: shopping, returns, subscriptions, bookings, complaints\n\n"
            "Tips:\n"
            "- Policies change frequently and often silently (no notification)\n"
            "- Verify policies before every quote when possible\n"
            "- Severity of errors: HIGH for stale quotes, CRITICAL for "
            "scope changes\n"
            "- The policy version may jump multiple versions at once\n"
            "- Balance speed (workflow completion) with accuracy (compliance)\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return", "subscribe", "book", "complain"],
        "max_versions": 8,
        "drift_probability": 0.50,
        "silent_drift_ratio": 0.80,
        "max_steps": 45,
    }