Spaces:

PraneshkumarR
/

fineprint-env

Sleeping

File size: 5,821 Bytes

0b6a889

"""Task definitions for the FinePrint OpenEnv environment.

Defines three progressively harder policy compliance scenarios:
  1. quote_accuracy     (EASY)   - Quote policies correctly with no drift.
  2. drift_detection    (MEDIUM) - Detect and adapt to policy changes.
  3. compliance_storm   (HARD)   - Full compliance under heavy drift.

Each task configures the environment with specific workflows, drift
parameters, a policy version count, and an agent step budget.
"""

from __future__ import annotations

# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

# Identifiers accepted by get_task(), ordered from the easiest task to the
# hardest (see the module docstring for the difficulty labels).
TASK_IDS: list[str] = ["quote_accuracy", "drift_detection", "compliance_storm"]


def get_task(task_id: str) -> dict:
    """Build and return the configuration dictionary for *task_id*.

    Parameters
    ----------
    task_id : str
        Identifier of the task to build; the accepted values are the
        entries of ``TASK_IDS``.

    Returns
    -------
    dict with keys:
        task_id            – str
        description        – human-readable problem statement
        workflows          – list[str]  workflow names to run
        max_versions       – int  how many policy versions are available
        drift_probability  – float
        silent_drift_ratio – float
        max_steps          – int  agent step budget

    Raises
    ------
    ValueError
        If no builder is registered for *task_id*. The message lists the
        valid identifiers.
    """
    # Dispatch table: one private builder function per task identifier.
    registry = {
        "quote_accuracy": _build_quote_accuracy,
        "drift_detection": _build_drift_detection,
        "compliance_storm": _build_compliance_storm,
    }

    # Guard clause: reject unknown identifiers before building anything.
    if task_id not in registry:
        raise ValueError(
            f"Unknown task_id {task_id!r}. "
            f"Valid IDs: {', '.join(TASK_IDS)}"
        )

    build = registry[task_id]
    return build()


# ---------------------------------------------------------------------------
# Task 1 — quote_accuracy (EASY)
# ---------------------------------------------------------------------------

def _build_quote_accuracy() -> dict:
    """Two workflows, zero drift. Tests basic policy quoting accuracy."""

    return {
        "task_id": "quote_accuracy",
        "description": (
            "You are a customer service agent. Handle the following customer "
            "workflows by quoting company policies accurately.\n\n"
            "There are NO policy changes during this task — just quote "
            "the current policies correctly.\n\n"
            "Workflows: shopping checkout, product return\n\n"
            "Tips:\n"
            "- Use 'view_policies' to see current policy values\n"
            "- Use 'quote_policy' with the correct field path and value\n"
            "- Use 'respond_to_user' for non-policy messages\n"
            "- Use 'take_action' to process workflow steps\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return"],
        "max_versions": 1,
        "drift_probability": 0.0,
        "silent_drift_ratio": 0.0,
        "max_steps": 20,
    }


# ---------------------------------------------------------------------------
# Task 2 — drift_detection (MEDIUM)
# ---------------------------------------------------------------------------

def _build_drift_detection() -> dict:
    """Three workflows with moderate drift. Tests drift detection."""

    return {
        "task_id": "drift_detection",
        "description": (
            "You are a customer service agent handling multiple workflows. "
            "Company policies may change mid-conversation without warning.\n\n"
            "You must detect when policies have changed and adapt your "
            "responses accordingly. Quoting stale policies is penalized.\n\n"
            "Workflows: shopping checkout, product return, subscription signup\n\n"
            "Tips:\n"
            "- Use 'request_verification' periodically to check for updates\n"
            "- Watch for system notifications about policy changes\n"
            "- After verification, re-read policies with 'view_policies'\n"
            "- Stale policy quotes (from old versions) incur heavy penalties\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return", "subscribe"],
        "max_versions": 4,
        "drift_probability": 0.30,
        "silent_drift_ratio": 0.50,
        "max_steps": 30,
    }


# ---------------------------------------------------------------------------
# Task 3 — compliance_storm (HARD)
# ---------------------------------------------------------------------------

def _build_compliance_storm() -> dict:
    """All five workflows with aggressive silent drift. Full compliance test."""

    return {
        "task_id": "compliance_storm",
        "description": (
            "You are a customer service agent under extreme conditions. "
            "Handle ALL customer workflows while policies change frequently "
            "and silently. Critical policy fields may change without notice.\n\n"
            "This task tests your ability to maintain compliance under "
            "pressure — balancing workflow progress with policy freshness.\n\n"
            "Workflows: shopping, returns, subscriptions, bookings, complaints\n\n"
            "Tips:\n"
            "- Policies change frequently and often silently (no notification)\n"
            "- Verify policies before every quote when possible\n"
            "- Severity of errors: HIGH for stale quotes, CRITICAL for "
            "scope changes\n"
            "- The policy version may jump multiple versions at once\n"
            "- Balance speed (workflow completion) with accuracy (compliance)\n"
            "- Use 'submit' when all workflows are complete"
        ),
        "workflows": ["shop", "return", "subscribe", "book", "complain"],
        "max_versions": 8,
        "drift_probability": 0.50,
        "silent_drift_ratio": 0.80,
        "max_steps": 45,
    }