Spaces:
Sleeping
Sleeping
"""Task definitions for the FinePrint OpenEnv environment.

Defines three progressively harder policy compliance scenarios:

1. quote_accuracy (EASY) - Quote policies correctly with no drift.
2. drift_detection (MEDIUM) - Detect and adapt to policy changes.
3. compliance_storm (HARD) - Full compliance under heavy drift.

Each task configures the environment with specific workflows, drift
parameters, and grading criteria.
"""
| from __future__ import annotations | |
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
# Identifiers of every task this module can build, listed in order of
# increasing difficulty (easy, medium, hard).
TASK_IDS: list[str] = ["quote_accuracy", "drift_detection", "compliance_storm"]
def get_task(task_id: str) -> dict:
    """Look up and build the configuration for the task named *task_id*.

    Parameters
    ----------
    task_id : str
        One of the identifiers listed in ``TASK_IDS``.

    Returns
    -------
    dict
        A newly built configuration with these keys:

        task_id            -- str
        description        -- human-readable problem statement
        workflows          -- list[str], workflow names to run
        max_versions       -- int, how many policy versions are available
        drift_probability  -- float
        silent_drift_ratio -- float
        max_steps          -- int, agent step budget

    Raises
    ------
    ValueError
        If *task_id* has no registered builder.
    """
    # Dispatch table: task identifier -> zero-argument builder function.
    registry = {
        "quote_accuracy": _build_quote_accuracy,
        "drift_detection": _build_drift_detection,
        "compliance_storm": _build_compliance_storm,
    }

    # Guard clause: reject unknown identifiers with a message listing the
    # valid choices.
    if task_id not in registry:
        valid_ids = ", ".join(TASK_IDS)
        raise ValueError(f"Unknown task_id {task_id!r}. Valid IDs: {valid_ids}")

    return registry[task_id]()
# ---------------------------------------------------------------------------
# Task 1 — quote_accuracy (EASY)
# ---------------------------------------------------------------------------
def _build_quote_accuracy() -> dict:
    """Build the EASY task: two workflows, zero drift.

    Tests basic policy quoting accuracy.

    Returns
    -------
    dict
        Task configuration with the keys ``task_id``, ``description``,
        ``workflows``, ``max_versions``, ``drift_probability``,
        ``silent_drift_ratio`` and ``max_steps``.
    """
    # The description is shown to the agent, so it must contain clean text:
    # the dash below is a real em dash, not a mis-decoded byte sequence.
    description = (
        "You are a customer service agent. Handle the following customer "
        "workflows by quoting company policies accurately.\n\n"
        "There are NO policy changes during this task — just quote "
        "the current policies correctly.\n\n"
        "Workflows: shopping checkout, product return\n\n"
        "Tips:\n"
        "- Use 'view_policies' to see current policy values\n"
        "- Use 'quote_policy' with the correct field path and value\n"
        "- Use 'respond_to_user' for non-policy messages\n"
        "- Use 'take_action' to process workflow steps\n"
        "- Use 'submit' when all workflows are complete"
    )
    return {
        "task_id": "quote_accuracy",
        "description": description,
        "workflows": ["shop", "return"],
        # A single policy version and zero drift probability: nothing
        # changes while the agent works.
        "max_versions": 1,
        "drift_probability": 0.0,
        "silent_drift_ratio": 0.0,
        "max_steps": 20,
    }
# ---------------------------------------------------------------------------
# Task 2 — drift_detection (MEDIUM)
# ---------------------------------------------------------------------------
def _build_drift_detection() -> dict:
    """Build the MEDIUM task: three workflows with moderate drift.

    Tests whether the agent detects policy drift.

    Returns
    -------
    dict
        Task configuration with the keys ``task_id``, ``description``,
        ``workflows``, ``max_versions``, ``drift_probability``,
        ``silent_drift_ratio`` and ``max_steps``.
    """
    # The agent-facing description is assembled from paragraphs separated by
    # blank lines; the tips section is one line per bullet.
    scenario = (
        "You are a customer service agent handling multiple workflows. "
        "Company policies may change mid-conversation without warning."
    )
    expectation = (
        "You must detect when policies have changed and adapt your "
        "responses accordingly. Quoting stale policies is penalized."
    )
    workflow_summary = (
        "Workflows: shopping checkout, product return, subscription signup"
    )
    tips = "\n".join(
        [
            "Tips:",
            "- Use 'request_verification' periodically to check for updates",
            "- Watch for system notifications about policy changes",
            "- After verification, re-read policies with 'view_policies'",
            "- Stale policy quotes (from old versions) incur heavy penalties",
            "- Use 'submit' when all workflows are complete",
        ]
    )

    config: dict = {}
    config["task_id"] = "drift_detection"
    config["description"] = "\n\n".join(
        [scenario, expectation, workflow_summary, tips]
    )
    config["workflows"] = ["shop", "return", "subscribe"]
    config["max_versions"] = 4
    config["drift_probability"] = 0.30
    config["silent_drift_ratio"] = 0.50
    config["max_steps"] = 30
    return config
# ---------------------------------------------------------------------------
# Task 3 — compliance_storm (HARD)
# ---------------------------------------------------------------------------
def _build_compliance_storm() -> dict:
    """Build the HARD task: all five workflows with aggressive silent drift.

    Full compliance test.

    Returns
    -------
    dict
        Task configuration with the keys ``task_id``, ``description``,
        ``workflows``, ``max_versions``, ``drift_probability``,
        ``silent_drift_ratio`` and ``max_steps``.
    """
    # The description is shown to the agent, so it must contain clean text:
    # the dash below is a real em dash, not a mis-decoded byte sequence.
    description = (
        "You are a customer service agent under extreme conditions. "
        "Handle ALL customer workflows while policies change frequently "
        "and silently. Critical policy fields may change without notice.\n\n"
        "This task tests your ability to maintain compliance under "
        "pressure — balancing workflow progress with policy freshness.\n\n"
        "Workflows: shopping, returns, subscriptions, bookings, complaints\n\n"
        "Tips:\n"
        "- Policies change frequently and often silently (no notification)\n"
        "- Verify policies before every quote when possible\n"
        "- Severity of errors: HIGH for stale quotes, CRITICAL for "
        "scope changes\n"
        "- The policy version may jump multiple versions at once\n"
        "- Balance speed (workflow completion) with accuracy (compliance)\n"
        "- Use 'submit' when all workflows are complete"
    )
    return {
        "task_id": "compliance_storm",
        "description": description,
        "workflows": ["shop", "return", "subscribe", "book", "complain"],
        "max_versions": 8,
        "drift_probability": 0.50,
        "silent_drift_ratio": 0.80,
        "max_steps": 45,
    }