Spaces:
Sleeping
Sleeping
File size: 5,821 Bytes
"""Task definitions for the FinePrint OpenEnv environment.

Defines three progressively harder policy compliance scenarios:

1. quote_accuracy (EASY) - Quote policies correctly with no drift.
2. drift_detection (MEDIUM) - Detect and adapt to policy changes.
3. compliance_storm (HARD) - Full compliance under heavy drift.

Each task configures the environment with specific workflows, drift
parameters, and grading criteria.
"""
from __future__ import annotations
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
# Identifiers accepted by get_task(), listed from easiest to hardest.
TASK_IDS: list[str] = [
    "quote_accuracy",    # EASY
    "drift_detection",   # MEDIUM
    "compliance_storm",  # HARD
]
def get_task(task_id: str) -> dict:
    """Look up *task_id* and build its task configuration.

    Parameters
    ----------
    task_id
        Identifier of the task to build; see ``TASK_IDS``.

    Returns
    -------
    dict
        The value returned by the matching builder, with keys:

        task_id            — str
        description        — human-readable problem statement
        workflows          — list[str] workflow names to run
        max_versions       — int how many policy versions are available
        drift_probability  — float
        silent_drift_ratio — float
        max_steps          — int agent step budget

    Raises
    ------
    ValueError
        If no builder is registered for *task_id*. The message lists the
        entries of ``TASK_IDS``.
    """
    # Builders are invoked on demand, so every call yields a new dict.
    registry = {
        "quote_accuracy": _build_quote_accuracy,
        "drift_detection": _build_drift_detection,
        "compliance_storm": _build_compliance_storm,
    }

    # Guard clause: reject unknown identifiers before dispatching.
    if task_id not in registry:
        raise ValueError(
            f"Unknown task_id {task_id!r}. "
            f"Valid IDs: {', '.join(TASK_IDS)}"
        )

    return registry[task_id]()
# ---------------------------------------------------------------------------
# Task 1 — quote_accuracy (EASY)
# ---------------------------------------------------------------------------
def _build_quote_accuracy() -> dict:
    """Build the EASY task: two workflows with policy drift switched off.

    Returns
    -------
    dict
        A freshly built task configuration. ``drift_probability`` and
        ``silent_drift_ratio`` are both 0.0, ``max_versions`` is 1 and the
        step budget (``max_steps``) is 20.
    """
    return {
        "task_id": "quote_accuracy",
        "description": (
            "You are a customer service agent. Handle the following customer "
            "workflows by quoting company policies accurately.\n\n"
            # Em dash restored here: the literal previously carried a
            # mis-decoded "β", which leaked into the agent-facing prompt.
            "There are NO policy changes during this task — just quote "
            "the current policies correctly.\n\n"
            "Workflows: shopping checkout, product return\n\n"
            "Tips:\n"
            "- Use 'view_policies' to see current policy values\n"
            "- Use 'quote_policy' with the correct field path and value\n"
            "- Use 'respond_to_user' for non-policy messages\n"
            "- Use 'take_action' to process workflow steps\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return"],
        "max_versions": 1,
        # Drift is disabled entirely for this task.
        "drift_probability": 0.0,
        "silent_drift_ratio": 0.0,
        "max_steps": 20,
    }
# ---------------------------------------------------------------------------
# Task 2 — drift_detection (MEDIUM)
# ---------------------------------------------------------------------------
def _build_drift_detection() -> dict:
    """Build the MEDIUM task: three workflows with moderate policy drift.

    Returns
    -------
    dict
        A freshly built task configuration with ``max_versions`` 4,
        ``drift_probability`` 0.30, ``silent_drift_ratio`` 0.50 and a
        30-step budget (``max_steps``).
    """
    # The prompt is assembled from blank-line-separated paragraphs; the last
    # paragraph is a "Tips:" header followed by one bullet per line.
    scenario = (
        "You are a customer service agent handling multiple workflows. "
        "Company policies may change mid-conversation without warning."
    )
    objective = (
        "You must detect when policies have changed and adapt your "
        "responses accordingly. Quoting stale policies is penalized."
    )
    workflow_summary = (
        "Workflows: shopping checkout, product return, subscription signup"
    )
    tips = [
        "- Use 'request_verification' periodically to check for updates",
        "- Watch for system notifications about policy changes",
        "- After verification, re-read policies with 'view_policies'",
        "- Stale policy quotes (from old versions) incur heavy penalties",
        "- Use 'submit' when all workflows are complete",
    ]
    tips_section = "Tips:\n" + "\n".join(tips)

    config = {"task_id": "drift_detection"}
    config["description"] = "\n\n".join(
        [scenario, objective, workflow_summary, tips_section]
    )
    config["workflows"] = ["shop", "return", "subscribe"]
    config["max_versions"] = 4
    config["drift_probability"] = 0.30
    config["silent_drift_ratio"] = 0.50
    config["max_steps"] = 30
    return config
# ---------------------------------------------------------------------------
# Task 3 — compliance_storm (HARD)
# ---------------------------------------------------------------------------
def _build_compliance_storm() -> dict:
    """Build the HARD task: five workflows under aggressive silent drift.

    Returns
    -------
    dict
        A freshly built task configuration with ``max_versions`` 8,
        ``drift_probability`` 0.50, ``silent_drift_ratio`` 0.80 and a
        45-step budget (``max_steps``).
    """
    return {
        "task_id": "compliance_storm",
        "description": (
            "You are a customer service agent under extreme conditions. "
            "Handle ALL customer workflows while policies change frequently "
            "and silently. Critical policy fields may change without notice.\n\n"
            "This task tests your ability to maintain compliance under "
            # Em dash restored here: the literal previously carried a
            # mis-decoded "β", which leaked into the agent-facing prompt.
            "pressure — balancing workflow progress with policy freshness.\n\n"
            "Workflows: shopping, returns, subscriptions, bookings, complaints\n\n"
            "Tips:\n"
            "- Policies change frequently and often silently (no notification)\n"
            "- Verify policies before every quote when possible\n"
            "- Severity of errors: HIGH for stale quotes, CRITICAL for "
            "scope changes\n"
            "- The policy version may jump multiple versions at once\n"
            "- Balance speed (workflow completion) with accuracy (compliance)\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return", "subscribe", "book", "complain"],
        "max_versions": 8,
        "drift_probability": 0.50,
        "silent_drift_ratio": 0.80,
        "max_steps": 45,
    }
|