"""Task definitions for the FinePrint OpenEnv environment. Defines three progressively harder policy compliance scenarios: 1. quote_accuracy (EASY) - Quote policies correctly with no drift. 2. drift_detection (MEDIUM) - Detect and adapt to policy changes. 3. compliance_storm (HARD) - Full compliance under heavy drift. Each task configures the environment with specific workflows, drift parameters, and grading criteria. """ from __future__ import annotations # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- TASK_IDS: list[str] = [ "quote_accuracy", "drift_detection", "compliance_storm", ] def get_task(task_id: str) -> dict: """Return the full task configuration for *task_id*. Returns ------- dict with keys: task_id – str description – human-readable problem statement workflows – list[str] workflow names to run max_versions – int how many policy versions are available drift_probability – float silent_drift_ratio – float max_steps – int agent step budget """ builders = { "quote_accuracy": _build_quote_accuracy, "drift_detection": _build_drift_detection, "compliance_storm": _build_compliance_storm, } builder = builders.get(task_id) if builder is None: raise ValueError( f"Unknown task_id {task_id!r}. " f"Valid IDs: {', '.join(TASK_IDS)}" ) return builder() # --------------------------------------------------------------------------- # Task 1 — quote_accuracy (EASY) # --------------------------------------------------------------------------- def _build_quote_accuracy() -> dict: """Two workflows, zero drift. Tests basic policy quoting accuracy.""" return { "task_id": "quote_accuracy", "description": ( "You are a customer service agent. Handle the following customer " "workflows by quoting company policies accurately.\n\n" "There are NO policy changes during this task — just quote " "the current policies correctly.\n\n" "Workflows: shopping checkout, product return\n\n" "Tips:\n" "- Use 'view_policies' to see current policy values\n" "- Use 'quote_policy' with the correct field path and value\n" "- Use 'respond_to_user' for non-policy messages\n" "- Use 'take_action' to process workflow steps\n" "- Use 'submit' when all workflows are complete" ), "workflows": ["shop", "return"], "max_versions": 1, "drift_probability": 0.0, "silent_drift_ratio": 0.0, "max_steps": 20, } # --------------------------------------------------------------------------- # Task 2 — drift_detection (MEDIUM) # --------------------------------------------------------------------------- def _build_drift_detection() -> dict: """Three workflows with moderate drift. Tests drift detection.""" return { "task_id": "drift_detection", "description": ( "You are a customer service agent handling multiple workflows. " "Company policies may change mid-conversation without warning.\n\n" "You must detect when policies have changed and adapt your " "responses accordingly. Quoting stale policies is penalized.\n\n" "Workflows: shopping checkout, product return, subscription signup\n\n" "Tips:\n" "- Use 'request_verification' periodically to check for updates\n" "- Watch for system notifications about policy changes\n" "- After verification, re-read policies with 'view_policies'\n" "- Stale policy quotes (from old versions) incur heavy penalties\n" "- Use 'submit' when all workflows are complete" ), "workflows": ["shop", "return", "subscribe"], "max_versions": 4, "drift_probability": 0.30, "silent_drift_ratio": 0.50, "max_steps": 30, } # --------------------------------------------------------------------------- # Task 3 — compliance_storm (HARD) # --------------------------------------------------------------------------- def _build_compliance_storm() -> dict: """All five workflows with aggressive silent drift. Full compliance test.""" return { "task_id": "compliance_storm", "description": ( "You are a customer service agent under extreme conditions. " "Handle ALL customer workflows while policies change frequently " "and silently. Critical policy fields may change without notice.\n\n" "This task tests your ability to maintain compliance under " "pressure — balancing workflow progress with policy freshness.\n\n" "Workflows: shopping, returns, subscriptions, bookings, complaints\n\n" "Tips:\n" "- Policies change frequently and often silently (no notification)\n" "- Verify policies before every quote when possible\n" "- Severity of errors: HIGH for stale quotes, CRITICAL for " "scope changes\n" "- The policy version may jump multiple versions at once\n" "- Balance speed (workflow completion) with accuracy (compliance)\n" "- Use 'submit' when all workflows are complete" ), "workflows": ["shop", "return", "subscribe", "book", "complain"], "max_versions": 8, "drift_probability": 0.50, "silent_drift_ratio": 0.80, "max_steps": 45, }