Spaces:

PraneshkumarR
/

fineprint-env

Sleeping

App Files Files Community

fineprint-env / server /tasks.py

vigneshmoovendhan

Fine Print RL final

0b6a889 16 days ago

raw

history blame contribute delete

5.82 kB

	"""Task definitions for the FinePrint OpenEnv environment.

	Defines three progressively harder policy compliance scenarios:
	1. quote_accuracy (EASY) - Quote policies correctly with no drift.
	2. drift_detection (MEDIUM) - Detect and adapt to policy changes.
	3. compliance_storm (HARD) - Full compliance under heavy drift.

	Each task configures the environment with specific workflows, drift
	parameters, and grading criteria.
	"""

	from __future__ import annotations

	# ---------------------------------------------------------------------------
	# Public API
	# ---------------------------------------------------------------------------

	# Canonical task identifiers, listed in order of increasing difficulty.
	TASK_IDS: list[str] = ["quote_accuracy", "drift_detection", "compliance_storm"]


	def get_task(task_id: str) -> dict:
	    """Look up ``task_id`` and build its task configuration.

	    Returns
	    -------
	    dict
	        Configuration produced by the matching builder, with the keys:

	        task_id            – str
	        description        – human-readable problem statement
	        workflows          – list[str], workflow names to run
	        max_versions       – int, how many policy versions are available
	        drift_probability  – float
	        silent_drift_ratio – float
	        max_steps          – int, agent step budget

	    Raises
	    ------
	    ValueError
	        If ``task_id`` does not name a registered task.
	    """
	    # Dispatch table: task name -> zero-argument builder function.
	    registry = {
	        "quote_accuracy": _build_quote_accuracy,
	        "drift_detection": _build_drift_detection,
	        "compliance_storm": _build_compliance_storm,
	    }

	    # Guard clause: reject unknown names and list the valid ones.
	    if task_id not in registry:
	        valid_ids = ", ".join(TASK_IDS)
	        raise ValueError(f"Unknown task_id {task_id!r}. Valid IDs: {valid_ids}")

	    make_config = registry[task_id]
	    return make_config()


	# ---------------------------------------------------------------------------
	# Task 1 — quote_accuracy (EASY)
	# ---------------------------------------------------------------------------

	def _build_quote_accuracy() -> dict:
	"""Two workflows, zero drift. Tests basic policy quoting accuracy."""

	return {
	"task_id": "quote_accuracy",
	"description": (
	"You are a customer service agent. Handle the following customer "
	"workflows by quoting company policies accurately.\n\n"
	"There are NO policy changes during this task — just quote "
	"the current policies correctly.\n\n"
	"Workflows: shopping checkout, product return\n\n"
	"Tips:\n"
	"- Use 'view_policies' to see current policy values\n"
	"- Use 'quote_policy' with the correct field path and value\n"
	"- Use 'respond_to_user' for non-policy messages\n"
	"- Use 'take_action' to process workflow steps\n"
	"- Use 'submit' when all workflows are complete"
	),
	"workflows": ["shop", "return"],
	"max_versions": 1,
	"drift_probability": 0.0,
	"silent_drift_ratio": 0.0,
	"max_steps": 20,
	}


	# ---------------------------------------------------------------------------
	# Task 2 — drift_detection (MEDIUM)
	# ---------------------------------------------------------------------------

	def _build_drift_detection() -> dict:
	"""Three workflows with moderate drift. Tests drift detection."""

	return {
	"task_id": "drift_detection",
	"description": (
	"You are a customer service agent handling multiple workflows. "
	"Company policies may change mid-conversation without warning.\n\n"
	"You must detect when policies have changed and adapt your "
	"responses accordingly. Quoting stale policies is penalized.\n\n"
	"Workflows: shopping checkout, product return, subscription signup\n\n"
	"Tips:\n"
	"- Use 'request_verification' periodically to check for updates\n"
	"- Watch for system notifications about policy changes\n"
	"- After verification, re-read policies with 'view_policies'\n"
	"- Stale policy quotes (from old versions) incur heavy penalties\n"
	"- Use 'submit' when all workflows are complete"
	),
	"workflows": ["shop", "return", "subscribe"],
	"max_versions": 4,
	"drift_probability": 0.30,
	"silent_drift_ratio": 0.50,
	"max_steps": 30,
	}


	# ---------------------------------------------------------------------------
	# Task 3 — compliance_storm (HARD)
	# ---------------------------------------------------------------------------

	def _build_compliance_storm() -> dict:
	"""All five workflows with aggressive silent drift. Full compliance test."""

	return {
	"task_id": "compliance_storm",
	"description": (
	"You are a customer service agent under extreme conditions. "
	"Handle ALL customer workflows while policies change frequently "
	"and silently. Critical policy fields may change without notice.\n\n"
	"This task tests your ability to maintain compliance under "
	"pressure — balancing workflow progress with policy freshness.\n\n"
	"Workflows: shopping, returns, subscriptions, bookings, complaints\n\n"
	"Tips:\n"
	"- Policies change frequently and often silently (no notification)\n"
	"- Verify policies before every quote when possible\n"
	"- Severity of errors: HIGH for stale quotes, CRITICAL for "
	"scope changes\n"
	"- The policy version may jump multiple versions at once\n"
	"- Balance speed (workflow completion) with accuracy (compliance)\n"
	"- Use 'submit' when all workflows are complete"
	),
	"workflows": ["shop", "return", "subscribe", "book", "complain"],
	"max_versions": 8,
	"drift_probability": 0.50,
	"silent_drift_ratio": 0.80,
	"max_steps": 45,
	}