Spaces:

yashash045
/

schemashift

Sleeping

App Files Files Community

schemashift / tests /test_environment.py

yashash04

Phase 6: SchemaShiftEnvironment scheduler — full episode lifecycle

c618a61 about 1 month ago

raw

history blame contribute delete

10.6 kB

	"""SchemaShiftEnvironment acceptance tests — Phase 6 (end-to-end RL loop)."""
	from __future__ import annotations

	import pytest

	from models import (
	Action,
	CompleteParams,
	DriftReportParams,
	InspectParams,
	RetryParams,
	ToolCallParams,
	)
	from server.environment import SchemaShiftEnvironment


	# ──────────────────────────────────────────────────────────────────
	# Test 1 — step() before reset() must raise (Round 1 bug prevention)
	# ──────────────────────────────────────────────────────────────────

	def test_step_before_reset_raises() -> None:
	    """Stepping a never-reset environment raises RuntimeError mentioning "reset".

	    The message check is case-insensitive; only the word "reset" is required.
	    """
	    fresh_env = SchemaShiftEnvironment()
	    noop_completion = Action(
	        type="complete_task",
	        complete=CompleteParams(summary="noop"),
	    )

	    with pytest.raises(RuntimeError) as raised:
	        fresh_env.step(noop_completion)

	    # Checked after the `with` body so the assertion actually executes.
	    message = str(raised.value)
	    assert "reset" in message.lower()


	# ──────────────────────────────────────────────────────────────────
	# Test 2 — reset returns a valid observation
	# ──────────────────────────────────────────────────────────────────

	def test_reset_returns_valid_observation() -> None:
	    """reset() on the E1 task returns the expected initial observation fields."""
	    task_name = "E1_onboard_new_hire"
	    env = SchemaShiftEnvironment()
	    initial = env.reset(task_name)

	    # Episode bookkeeping at the very start.
	    assert initial.task_id == task_name
	    assert initial.step == 0
	    assert initial.max_steps == 8

	    # Tool schemas exposed for this task: mail and calendar, but not crm.
	    exposed_tools = initial.tool_schemas
	    for required_tool in ("mail", "calendar"):
	        assert required_tool in exposed_tools
	    assert "crm" not in exposed_tools

	    # Remaining task metadata.
	    assert initial.done is False
	    assert initial.difficulty == "easy"
	    assert len(initial.success_criteria) >= 1


	# ──────────────────────────────────────────────────────────────────
	# Test 3 — reset on unknown task raises
	# ──────────────────────────────────────────────────────────────────

	def test_reset_unknown_task_raises() -> None:
	    """reset() with a task id that does not exist raises ValueError."""
	    bogus_task_id = "nonexistent_task"
	    # Construct outside the `with` so only reset() is allowed to raise.
	    env = SchemaShiftEnvironment()

	    with pytest.raises(ValueError):
	        env.reset(bogus_task_id)


	# ──────────────────────────────────────────────────────────────────
	# Test 4 — call_tool success updates agent_state
	# ──────────────────────────────────────────────────────────────────

	def test_call_tool_success_updates_state() -> None:
	    """A successful mail.send_message call is reflected in obs.known_state."""
	    env = SchemaShiftEnvironment()
	    env.reset("E1_onboard_new_hire")

	    recipient = "priya@company.com"
	    welcome_mail = ToolCallParams(
	        tool="mail",
	        endpoint="send_message",
	        params={
	            "to": recipient,
	            "subject": "Welcome!",
	            "body": "Welcome to the team.",
	        },
	    )
	    # The reward is not under test here, so it is discarded.
	    obs, _ = env.step(Action(type="call_tool", tool_call=welcome_mail))

	    response = obs.last_response
	    assert response is not None
	    assert response.ok is True

	    known = obs.known_state
	    assert known["mail.sent_count"] == 1
	    assert known["mail.last_sent_to"] == recipient
	    assert known["mail.last_subject_contains_welcome"] is True


	# ──────────────────────────────────────────────────────────────────
	# Test 5 — FULL E1 episode with drift → inspect → retry → report → complete
	# ──────────────────────────────────────────────────────────────────

	def test_e1_full_episode_with_adaptation() -> None:
	    """Drive a complete E1 episode through drift, adaptation, report and completion.

	    Sequence: welcome email -> inspect calendar -> create_event with the stale
	    `attendees` field is rejected -> re-inspect shows `participants` -> retry
	    with the new format succeeds -> drift is reported -> task is completed.
	    The final reward and the environment state are then checked for full credit.
	    """
	    env = SchemaShiftEnvironment()
	    env.reset("E1_onboard_new_hire")

	    def response_ok(observation):
	        """Return the `ok` flag of the last response, which must exist."""
	        assert observation.last_response is not None
	        return observation.last_response.ok

	    def inspect_calendar() -> Action:
	        """Build a fresh inspect_schema action for the calendar tool."""
	        return Action(
	            type="inspect_schema",
	            inspect=InspectParams(tool="calendar"),
	        )

	    # Fields shared by the stale create_event call and the adapted retry.
	    event_basics = {
	        "title": "New Hire Orientation",
	        "start": "2026-04-27T10:00:00Z",
	        "end": "2026-04-27T11:00:00Z",
	    }
	    invitees = ("priya@company.com", "alex@company.com")

	    # Step 1: send welcome email (pre-drift).
	    welcome_mail = ToolCallParams(
	        tool="mail",
	        endpoint="send_message",
	        params={
	            "to": "priya@company.com",
	            "subject": "Welcome aboard!",
	            "body": "Welcome to the team.",
	        },
	    )
	    obs, _ = env.step(Action(type="call_tool", tool_call=welcome_mail))
	    assert response_ok(obs) is True

	    # Step 2: inspect calendar (pre-drift).
	    obs, _ = env.step(inspect_calendar())
	    assert response_ok(obs) is True

	    # Step 3: drift fires at state.step=3; the old `attendees` shape must fail.
	    stale_call = ToolCallParams(
	        tool="calendar",
	        endpoint="create_event",
	        params={**event_basics, "attendees": list(invitees)},
	    )
	    obs, _ = env.step(Action(type="call_tool", tool_call=stale_call))
	    assert response_ok(obs) is False

	    # Step 4: inspect calendar again; the schema now advertises `participants`.
	    obs, _ = env.step(inspect_calendar())
	    assert response_ok(obs) is True
	    create_event_schema = obs.tool_schemas["calendar"]["create_event"]
	    assert "participants" in create_event_schema["params"]

	    # Step 5: retry using the `participants` format.
	    adapted_call = RetryParams(
	        tool="calendar",
	        endpoint="create_event",
	        params={
	            **event_basics,
	            "participants": [
	                {"email": address, "role": "required"} for address in invitees
	            ],
	        },
	    )
	    obs, _ = env.step(Action(type="retry_with_variant", retry=adapted_call))
	    assert response_ok(obs) is True

	    # Step 6: report the drift that was observed.
	    drift_report = DriftReportParams(
	        tool="calendar",
	        drift_kind="field_rename",
	        description="create_event attendees renamed to participants",
	    )
	    obs, _ = env.step(Action(type="report_drift", report=drift_report))

	    # Step 7: complete the task; only this final reward is inspected.
	    wrap_up = CompleteParams(
	        summary="Onboarded Priya with welcome email and orientation event.",
	    )
	    obs, final_reward = env.step(Action(type="complete_task", complete=wrap_up))

	    # Internal state is read directly to verify what the episode recorded.
	    state = env._state
	    assert state is not None
	    assert obs.done is True

	    recorded = state.agent_state
	    assert recorded["mail.sent_count"] == 1
	    assert recorded["calendar.events_count"] == 1
	    assert recorded["calendar.last_event_has_both_attendees"] is True
	    assert state.drift_plan[0].detected_by_agent is True

	    assert final_reward.task_completion == 1.0
	    assert final_reward.drift_detection == 1.0
	    assert final_reward.adaptation_quality == 1.0
	    assert final_reward.shaped_total > 0.5
	    assert final_reward.binary == 1.0


	# ──────────────────────────────────────────────────────────────────
	# Test 6 — max_steps terminates episode
	# ──────────────────────────────────────────────────────────────────

	def test_max_steps_terminates_episode() -> None:
	    """After six inspect_schema steps on E2 the episode is done at step 6."""
	    env = SchemaShiftEnvironment()
	    env.reset("E2_meeting_invite_blast")

	    # The same action object is stepped repeatedly; it never completes the task.
	    probe = Action(type="inspect_schema", inspect=InspectParams(tool="mail"))
	    observations = [env.step(probe)[0] for _ in range(6)]

	    final_obs = observations[-1]
	    assert final_obs is not None
	    assert final_obs.done is True
	    assert final_obs.step == 6


	# ──────────────────────────────────────────────────────────────────
	# Test 7 — step_shaping +0.10 for inspect after failure
	# ──────────────────────────────────────────────────────────────────

	def test_step_shaping_applied_correctly() -> None:
	    """inspect_schema immediately after a failed call earns +0.10 step shaping.

	    The setup call's failure is asserted explicitly, so a wrong shaping value
	    cannot be confused with the setup call unexpectedly succeeding.
	    """
	    env = SchemaShiftEnvironment()
	    env.reset("E1_onboard_new_hire")

	    # Step 1: send_message without 'body' — this call is expected to be rejected.
	    failed_obs, _ = env.step(Action(
	        type="call_tool",
	        tool_call=ToolCallParams(
	            tool="mail",
	            endpoint="send_message",
	            params={"to": "x@y.com", "subject": "hi"},
	        ),
	    ))
	    # Precondition: the shaping bonus only makes sense after a real failure.
	    assert failed_obs.last_response is not None
	    assert failed_obs.last_response.ok is False

	    # Step 2: inspect_schema after the failure → +0.10 shaping.
	    _, reward = env.step(Action(
	        type="inspect_schema",
	        inspect=InspectParams(tool="mail"),
	    ))
	    assert reward.step_shaping == pytest.approx(0.10)


	# ──────────────────────────────────────────────────────────────────
	# Test 8 — dumb retry penalty
	# ──────────────────────────────────────────────────────────────────

	def test_dumb_retry_penalty() -> None:
	    """Repeating an identical failing call incurs a -0.05 step-shaping penalty.

	    Both calls are built by one local factory so they are identical by
	    construction, and the first call's failure is asserted so the penalty
	    check cannot pass or fail for the wrong reason.
	    """
	    env = SchemaShiftEnvironment()
	    env.reset("E1_onboard_new_hire")

	    def incomplete_send() -> Action:
	        """Build a fresh send_message call that carries only the 'to' field."""
	        return Action(
	            type="call_tool",
	            tool_call=ToolCallParams(
	                tool="mail",
	                endpoint="send_message",
	                params={"to": "x@y.com"},
	            ),
	        )

	    # Step 1: the incomplete call is expected to be rejected.
	    first_obs, _ = env.step(incomplete_send())
	    assert first_obs.last_response is not None
	    assert first_obs.last_response.ok is False

	    # Step 2: the very same call again → dumb retry → -0.05 penalty.
	    _, reward = env.step(incomplete_send())
	    assert reward.step_shaping == pytest.approx(-0.05)