# commitment-os/tests/test_environment.py
# jayantaggarwal-sketch
# Sync from GitHub: MCP tool names, HF_README, tests
# 01ab723
"""Comprehensive test suite for CommitmentOS.
Tests cover:
- Grader (perfect/partial/zero for each component)
- Environment lifecycle (reset/step/state/multi-turn)
- Commitment ledger (creation, violation, renegotiation)
- Task dataset integrity
- API endpoints
- Difficulty verification
"""
from __future__ import annotations
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import json
from typing import Any, Dict
import pytest
from models import CommitmentAction, CommitmentObservation, CommitmentState
from server.domain import CalendarEvent, ConstraintDef, ScenarioDef
from server.environment import CommitmentEnvironment
from server.graders import (
_calendar_has_no_overlaps,
_keyword_score,
_score_commitment_coherence,
_score_conflict_resolution,
_score_step_efficiency,
grade_scenario,
)
from server.tasks import get_all_scenarios, get_scenario, get_scenarios_by_difficulty
from server.world import WorldState, _time_to_min
# ===================================================================
# Fixtures
# ===================================================================
@pytest.fixture
def env() -> CommitmentEnvironment:
    """Build a new ``CommitmentEnvironment`` for the requesting test.

    The environment is only constructed here; ``reset`` is left to the test.
    """
    environment = CommitmentEnvironment()
    return environment
@pytest.fixture
def easy_env(env: CommitmentEnvironment) -> CommitmentEnvironment:
    """Return the ``env`` fixture after resetting it onto scenario ``easy_001``."""
    scenario_id = "easy_001"
    env.reset(task_id=scenario_id)
    return env
# ===================================================================
# 1. Task dataset integrity
# ===================================================================
class TestTaskDataset:
    """Integrity checks on the scenario catalogue exposed by ``server.tasks``."""

    def test_15_scenarios_loaded(self) -> None:
        """The full catalogue holds exactly 15 scenarios."""
        scenarios = get_all_scenarios()
        assert len(scenarios) == 15

    def test_5_easy_5_medium_5_hard(self) -> None:
        """Each difficulty tier contributes exactly five scenarios."""
        for difficulty, count in [("easy", 5), ("medium", 5), ("hard", 5)]:
            tasks = get_scenarios_by_difficulty(difficulty)
            assert len(tasks) == count, f"Expected {count} {difficulty} tasks, got {len(tasks)}"

    def test_each_scenario_has_required_fields(self) -> None:
        """Every scenario is keyed by its own id and carries sane basic fields."""
        for sid, scenario in get_all_scenarios().items():
            assert scenario.scenario_id == sid
            assert scenario.difficulty in ("easy", "medium", "hard")
            assert len(scenario.briefing) > 20, f"{sid}: briefing too short"
            assert scenario.optimal_steps >= 2, f"{sid}: optimal_steps too low"
            assert scenario.max_steps >= scenario.optimal_steps
            assert len(scenario.constraints) >= 1, f"{sid}: no constraints defined"

    def test_scenario_ids_unique(self) -> None:
        """No two scenario objects declare the same ``scenario_id``.

        The ids are read from the scenario objects, not from the mapping
        keys: keys of a mapping are unique by construction, so comparing
        them against a set of themselves could never fail.
        """
        ids = [scenario.scenario_id for scenario in get_all_scenarios().values()]
        assert len(ids) == len(set(ids))

    def test_get_scenario_returns_none_for_missing(self) -> None:
        """Looking up an unknown id yields ``None``."""
        assert get_scenario("nonexistent_999") is None

    def test_get_scenario_returns_correct(self) -> None:
        """Looking up ``easy_001`` yields a scenario tagged ``easy``."""
        s = get_scenario("easy_001")
        assert s is not None
        assert s.difficulty == "easy"
# ===================================================================
# 2. Grader unit tests
# ===================================================================
class TestKeywordScore:
    """Checks on ``_keyword_score`` using the keyword pair reschedule/standup."""

    def test_full_match(self) -> None:
        """Both keywords present: score 1.0 and two matches reported."""
        text = "I need to reschedule the standup meeting"
        ratio, hits = _keyword_score(text, ["reschedule", "standup"], min_matches=2)
        assert len(hits) == 2
        assert ratio == 1.0

    def test_partial_match(self) -> None:
        """One of two keywords present: score 0.5 and one match reported."""
        text = "I need to reschedule"
        ratio, hits = _keyword_score(text, ["reschedule", "standup"], min_matches=2)
        assert len(hits) == 1
        assert ratio == 0.5

    def test_no_match(self) -> None:
        """Neither keyword present: score 0.0 and no matches reported."""
        text = "Hello world"
        ratio, hits = _keyword_score(text, ["reschedule", "standup"], min_matches=2)
        assert len(hits) == 0
        assert ratio == 0.0

    def test_case_insensitive(self) -> None:
        """Upper-case text still matches lower-case keywords."""
        result = _keyword_score("RESCHEDULE THE STANDUP", ["reschedule", "standup"], min_matches=2)
        ratio = result[0]
        assert ratio == 1.0
class TestCalendarConflicts:
    """Checks on ``_calendar_has_no_overlaps`` against freshly built worlds."""

    def test_no_conflicts(self) -> None:
        """The initial ``easy_002`` calendar is reported as overlap-free."""
        definition = get_scenario("easy_002")
        assert definition is not None
        overlap_free = _calendar_has_no_overlaps(WorldState(definition))
        assert overlap_free is True

    def test_conflict_detected(self) -> None:
        """The initial ``easy_001`` calendar is reported as having an overlap."""
        definition = get_scenario("easy_001")
        assert definition is not None
        overlap_free = _calendar_has_no_overlaps(WorldState(definition))
        assert overlap_free is False
class TestCommitmentCoherence:
    """Checks on commitment-coherence scoring and silent-violation detection."""

    def test_no_commitments_full_score(self) -> None:
        """A freshly built ``easy_005`` world scores 1.0 for coherence."""
        scenario = get_scenario("easy_005")
        assert scenario is not None
        world = WorldState(scenario)
        score, _ = _score_commitment_coherence(world)
        assert score == 1.0

    def test_honored_commitment(self, env: CommitmentEnvironment) -> None:
        """After rescheduling ``evt_2`` through the env, coherence stays at 1.0."""
        env.reset(task_id="easy_001")
        env.step(CommitmentAction(action_type="reschedule_event", event_id="evt_2", new_time="15:00"))
        assert env._world is not None
        score, _ = _score_commitment_coherence(env._world)
        assert score == 1.0

    def test_silent_violation_detected(self, env: CommitmentEnvironment) -> None:
        """Removing a scheduled meeting behind the ledger's back is reported.

        The meeting is scheduled through the environment and then deleted
        directly from the calendar dict, bypassing the world's own
        cancel/reschedule tools; ``get_silent_violations`` must then return
        at least one entry.
        """
        env.reset(task_id="easy_001")
        env.step(
            CommitmentAction(
                action_type="schedule_meeting",
                title="New Meeting",
                date="2026-04-25",
                time="16:00",
                participants=["Alice"],
            )
        )
        world = env._world
        assert world is not None
        # NOTE(review): "evt_100" is presumably the id given to the new
        # meeting -- confirm; the scan below removes it by content either way.
        world.calendar.pop("evt_100", None)
        # Iterate over a snapshot so the dict can be mutated inside the loop.
        for eid, ev in list(world.calendar.items()):
            if ev.time == "16:00" and ev.date == "2026-04-25" and ev.title == "New Meeting":
                del world.calendar[eid]
                break
        violations = world.get_silent_violations()
        assert len(violations) >= 1
class TestStepEfficiency:
    """Checks on ``_score_step_efficiency`` for scenario ``easy_001``."""

    def test_optimal_steps(self) -> None:
        """A world with ``step_count`` of 3 earns the full efficiency score."""
        definition = get_scenario("easy_001")
        assert definition is not None
        sim = WorldState(definition)
        sim.step_count = 3
        efficiency = _score_step_efficiency(definition, sim)[0]
        assert efficiency == 1.0

    def test_over_optimal(self) -> None:
        """A world with ``step_count`` of 8 earns an efficiency score of 0.5."""
        definition = get_scenario("easy_001")
        assert definition is not None
        sim = WorldState(definition)
        sim.step_count = 8
        efficiency = _score_step_efficiency(definition, sim)[0]
        assert efficiency == 0.5
# ===================================================================
# 3. Environment lifecycle
# ===================================================================
class TestEnvironmentLifecycle:
    """Reset/step/state behaviour of ``CommitmentEnvironment`` across an episode."""

    def test_reset_returns_observation(self, env: CommitmentEnvironment) -> None:
        """``reset`` returns an initial, not-done observation with zero reward."""
        first = env.reset(task_id="easy_001")
        assert isinstance(first, CommitmentObservation)
        assert first.scenario_id == "easy_001"
        assert first.done is False
        assert first.reward == 0.0
        assert len(first.briefing) > 0

    def test_step_before_reset_raises(self, env: CommitmentEnvironment) -> None:
        """Stepping an environment that was never reset raises ``ValueError``."""
        with pytest.raises(ValueError, match="No active episode"):
            env.step(CommitmentAction(action_type="view_calendar", date="2026-04-25"))

    def test_step_after_done_raises(self, env: CommitmentEnvironment) -> None:
        """Stepping again after ``submit_plan`` raises ``ValueError``."""
        env.reset(task_id="easy_001")
        env.step(CommitmentAction(action_type="submit_plan"))
        with pytest.raises(ValueError, match="already completed"):
            env.step(CommitmentAction(action_type="view_calendar", date="2026-04-25"))

    def test_state_property(self, env: CommitmentEnvironment) -> None:
        """``state`` exposes a ``CommitmentState`` describing the active episode."""
        env.reset(task_id="easy_001")
        snapshot = env.state
        assert isinstance(snapshot, CommitmentState)
        assert snapshot.scenario_id == "easy_001"
        assert snapshot.completed is False
        assert len(snapshot.available_tasks) == 15

    def test_multi_turn_episode(self, env: CommitmentEnvironment) -> None:
        """Step numbers advance per action and only ``submit_plan`` ends the episode."""
        env.reset(task_id="easy_001")

        after_view = env.step(CommitmentAction(action_type="view_calendar", date="2026-04-25"))
        assert after_view.done is False
        assert after_view.step_number == 1

        after_move = env.step(
            CommitmentAction(action_type="reschedule_event", event_id="evt_2", new_time="15:00")
        )
        assert after_move.done is False
        assert after_move.step_number == 2

        final = env.step(CommitmentAction(action_type="submit_plan"))
        assert final.done is True
        assert final.reward > 0

    def test_max_steps_auto_submits(self, env: CommitmentEnvironment) -> None:
        """Repeating a read-only action (at most 20 times) eventually ends the episode."""
        env.reset(task_id="easy_002")
        for _attempt in range(20):
            latest = env.step(CommitmentAction(action_type="view_calendar", date="2026-04-25"))
            if latest.done:
                break
        assert latest.done is True

    def test_reset_clears_state(self, env: CommitmentEnvironment) -> None:
        """A second ``reset`` switches scenario and zeroes the step counter."""
        env.reset(task_id="easy_001")
        env.step(CommitmentAction(action_type="view_calendar", date="2026-04-25"))
        env.reset(task_id="easy_002")
        assert env.state.scenario_id == "easy_002"
        assert env.state.step_count == 0

    def test_unknown_action_type(self, env: CommitmentEnvironment) -> None:
        """An unrecognised ``action_type`` is reported in ``tool_result``."""
        env.reset(task_id="easy_001")
        outcome = env.step(CommitmentAction(action_type="fly_to_moon"))
        assert "Unknown action_type" in outcome.tool_result

    def test_random_reset(self, env: CommitmentEnvironment) -> None:
        """Resetting with only a seed lands on a scenario from the catalogue."""
        first = env.reset(seed=42)
        known_ids = get_all_scenarios()
        assert first.scenario_id in known_ids

    def test_difficulty_filter_reset(self, env: CommitmentEnvironment) -> None:
        """Resetting with ``difficulty="hard"`` yields a hard scenario."""
        first = env.reset(difficulty="hard", seed=1)
        assert first.difficulty == "hard"
# ===================================================================
# 4. World simulation (tool functions)
# ===================================================================
class TestWorldTools:
    """Direct calls to the ``WorldState`` tool methods, bypassing the environment."""

    @staticmethod
    def _make_world(scenario_id: str) -> WorldState:
        """Look up *scenario_id*, assert it exists, and wrap it in a new ``WorldState``."""
        definition = get_scenario(scenario_id)
        assert definition is not None
        return WorldState(definition)

    def test_view_calendar(self) -> None:
        """The 2026-04-25 view of ``easy_001`` mentions ``evt_1`` and 14:00."""
        listing = self._make_world("easy_001").view_calendar("2026-04-25")
        assert "evt_1" in listing
        assert "14:00" in listing

    def test_view_calendar_empty(self) -> None:
        """A date with nothing scheduled yields a "No events" message."""
        listing = self._make_world("easy_001").view_calendar("2099-01-01")
        assert "No events" in listing

    def test_check_availability(self) -> None:
        """``Client_Jones`` in ``easy_003`` has availability mentioning 09:00."""
        slots = self._make_world("easy_003").check_availability("Client_Jones")
        assert "09:00" in slots

    def test_check_availability_unknown(self) -> None:
        """An unknown person produces a "not found" message."""
        slots = self._make_world("easy_001").check_availability("NonExistentPerson")
        assert "not found" in slots

    def test_search_restaurants_filters(self) -> None:
        """Vegan/price/distance filters keep Green Garden and drop Steak House Prime."""
        sim = self._make_world("med_007")
        hits = sim.search_restaurants(dietary="vegan", max_price=45, max_distance_miles=3.0)
        assert "Green Garden" in hits
        assert "Steak House Prime" not in hits

    def test_schedule_meeting_creates_commitment(self) -> None:
        """Scheduling a meeting adds one ``meeting_scheduled`` ledger entry."""
        sim = self._make_world("easy_002")
        message = sim.schedule_meeting("Test Meeting", "2026-04-25", "14:00", turn=1)
        assert "scheduled" in message.lower()
        assert len(sim.commitment_ledger) == 1
        assert sim.commitment_ledger[0].commitment_type == "meeting_scheduled"

    def test_schedule_meeting_conflict(self) -> None:
        """Scheduling at 14:00 in ``easy_001`` is reported as a CONFLICT."""
        sim = self._make_world("easy_001")
        message = sim.schedule_meeting("Conflicting", "2026-04-25", "14:00", turn=1)
        assert "CONFLICT" in message

    def test_reschedule_event(self) -> None:
        """Rescheduling ``evt_2`` updates its time on the calendar."""
        sim = self._make_world("easy_001")
        message = sim.reschedule_event("evt_2", "15:00", turn=1)
        assert "Rescheduled" in message
        assert sim.calendar["evt_2"].time == "15:00"

    def test_cancel_event(self) -> None:
        """Cancelling ``evt_2`` removes it from the calendar."""
        sim = self._make_world("easy_001")
        message = sim.cancel_event("evt_2", turn=1)
        assert "Cancelled" in message
        assert "evt_2" not in sim.calendar

    def test_send_email(self) -> None:
        """Sending an email records it in ``emails_sent``."""
        sim = self._make_world("easy_001")
        message = sim.send_email("Team", "Hello", "Testing email body", turn=1)
        assert "sent" in message.lower()
        assert len(sim.emails_sent) == 1

    def test_book_restaurant(self) -> None:
        """Booking a restaurant stores its name in ``booked_restaurant``."""
        sim = self._make_world("easy_002")
        message = sim.book_restaurant("Bella Italia", turn=1)
        assert "confirmed" in message.lower()
        assert sim.booked_restaurant == "Bella Italia"
# ===================================================================
# 5. Commitment ledger behaviour
# ===================================================================
class TestCommitmentLedger:
    """How ``WorldState`` tool calls create and update commitment-ledger entries."""

    def test_schedule_creates_commitment(self) -> None:
        """A scheduled meeting yields one active, un-renegotiated ledger entry."""
        scenario = get_scenario("easy_002")
        assert scenario is not None
        world = WorldState(scenario)
        world.schedule_meeting("Test", "2026-04-25", "10:00", turn=1)
        assert len(world.commitment_ledger) == 1
        c = world.commitment_ledger[0]
        assert c.turn_created == 1
        assert c.active is True
        assert c.renegotiated_at is None

    def test_reschedule_marks_old_renegotiated(self) -> None:
        """Rescheduling an initial event adds an active entry and renegotiates none."""
        scenario = get_scenario("easy_001")
        assert scenario is not None
        world = WorldState(scenario)
        world.reschedule_event("evt_2", "15:00", turn=1)
        renegotiated = [c for c in world.commitment_ledger if c.renegotiated_at is not None]
        assert len(renegotiated) == 0  # initial events don't create ledger entries
        new_commits = [c for c in world.commitment_ledger if c.active]
        assert len(new_commits) >= 1

    def test_email_renegotiation_detection(self) -> None:
        """Emailing a participant about rescheduling marks a commitment renegotiated."""
        scenario = get_scenario("easy_001")
        assert scenario is not None
        world = WorldState(scenario)
        world.schedule_meeting("Important", "2026-04-25", "16:00", participants=["Alice"], turn=1)
        world.send_email("Alice", "Change of plans", "I need to reschedule our meeting", turn=2)
        renegotiated = [c for c in world.commitment_ledger if c.renegotiated_at is not None]
        assert len(renegotiated) >= 1

    def test_cancel_personal_marks_renegotiated(self) -> None:
        """Cancelling the personal event ``evt_3`` removes it from the calendar.

        Only the calendar effect is asserted, so the test can no longer pass
        vacuously.
        NOTE(review): the test name implies a ledger expectation as well --
        confirm the intended ledger state and assert it here.
        """
        scenario = get_scenario("easy_001")
        assert scenario is not None
        world = WorldState(scenario)
        # evt_3 is Lunch (personal)
        world.cancel_event("evt_3", turn=1)
        # Personal cancellations are auto-OK
        assert "evt_3" not in world.calendar
# ===================================================================
# 6. Full scenario scoring
# ===================================================================
class TestFullScoring:
    """End-to-end episodes checked against reward thresholds."""

    def test_perfect_easy_001(self, env: CommitmentEnvironment) -> None:
        """Reschedule plus a notification email on ``easy_001`` earns at least 0.85."""
        env.reset(task_id="easy_001")
        plan = [
            CommitmentAction(action_type="reschedule_event", event_id="evt_2", new_time="15:00"),
            CommitmentAction(
                action_type="send_email",
                to="Team",
                subject="Standup moved",
                body="Hi team, I've rescheduled the standup to 3:00 PM. Sorry for the move.",
            ),
        ]
        for action in plan:
            env.step(action)
        final = env.step(CommitmentAction(action_type="submit_plan"))
        assert final.done is True
        assert final.reward >= 0.85

    def test_zero_effort_gets_low_score(self, env: CommitmentEnvironment) -> None:
        """Submitting immediately on ``easy_001`` earns at most 0.50."""
        env.reset(task_id="easy_001")
        final = env.step(CommitmentAction(action_type="submit_plan"))
        assert final.done is True
        assert final.reward <= 0.50

    def test_hard_011_perfect_run(self, env: CommitmentEnvironment) -> None:
        """A scripted six-action run on ``hard_011`` earns at least 0.85."""
        env.reset(task_id="hard_011")
        plan = [
            CommitmentAction(action_type="view_calendar", date="2026-04-25"),
            CommitmentAction(action_type="cancel_event", event_id="evt_90"),
            CommitmentAction(
                action_type="search_restaurants",
                dietary="vegetarian",
                near_airport=True,
                max_price=60,
            ),
            CommitmentAction(action_type="book_restaurant", restaurant_name="Sky Lounge"),
            CommitmentAction(
                action_type="send_email",
                to="Team",
                subject="Happy Hour Rescheduled",
                body="Sorry team, I need to reschedule the happy hour to Thursday. An investor dinner came up tonight. Apologies!",
            ),
            CommitmentAction(
                action_type="send_email",
                to="VP_Chen",
                subject="Investor dinner plan",
                body="I've booked Sky Lounge for dinner tonight with Investor_Park. Vegetarian options available, near the airport.",
            ),
        ]
        for action in plan:
            env.step(action)
        final = env.step(CommitmentAction(action_type="submit_plan"))
        assert final.done is True
        assert final.reward >= 0.85

    def test_hard_015_sre_crisis(self, env: CommitmentEnvironment) -> None:
        """A scripted five-action run on ``hard_015`` earns at least 0.60."""
        env.reset(task_id="hard_015")
        plan = [
            CommitmentAction(action_type="view_calendar", date="2026-04-25"),
            CommitmentAction(action_type="cancel_event", event_id="evt_130"),
            CommitmentAction(
                action_type="send_email",
                to="Team",
                subject="Lunch cancelled - incident",
                body="Team, I'm cancelling our lunch due to a production incident. Payment service returning 503s. Will handle this first.",
            ),
            CommitmentAction(
                action_type="send_email",
                to="Client_Jones",
                subject="Demo reschedule needed",
                body="Hi Client_Jones, I sincerely apologize but I need to reschedule our demo. We have a production incident with the payment system. Can we find another time this week?",
            ),
            CommitmentAction(
                action_type="send_email",
                to="VP_Chen",
                subject="Incident + 1-on-1",
                body="VP_Chen, we have a production incident — payment service is returning 503s. I'm on-call and handling it. May need to reschedule our 1-on-1 depending on resolution time.",
            ),
        ]
        for action in plan:
            env.step(action)
        final = env.step(CommitmentAction(action_type="submit_plan"))
        assert final.done is True
        assert final.reward >= 0.60
# ===================================================================
# 7. Reward clamping
# ===================================================================
class TestRewardClamping:
    """The final reward is expected to stay within the bounds 0.01 and 0.99."""

    def test_reward_never_zero(self, env: CommitmentEnvironment) -> None:
        """An immediate submit on ``easy_001`` still earns at least 0.01."""
        env.reset(task_id="easy_001")
        final = env.step(CommitmentAction(action_type="submit_plan"))
        assert final.reward >= 0.01

    def test_reward_never_one(self, env: CommitmentEnvironment) -> None:
        """A strong run on ``easy_001`` earns more than 0.01 and at most 0.99."""
        env.reset(task_id="easy_001")
        env.step(
            CommitmentAction(action_type="reschedule_event", event_id="evt_2", new_time="15:00")
        )
        env.step(
            CommitmentAction(
                action_type="send_email",
                to="Team",
                subject="Standup moved",
                body="Hi team, the standup is rescheduled to 3pm. Sorry for the move.",
            )
        )
        final = env.step(CommitmentAction(action_type="submit_plan"))
        assert final.reward <= 0.99
        assert final.reward > 0.01
# ===================================================================
# 8. Time utility
# ===================================================================
class TestTimeUtil:
    """Checks on the ``_time_to_min`` clock-string helper."""

    def test_time_to_min(self) -> None:
        """``HH:MM`` strings map to the expected minute counts."""
        expectations = [
            ("00:00", 0),
            ("09:30", 570),
            ("14:00", 840),
            ("23:59", 1439),
        ]
        for clock, minutes in expectations:
            assert _time_to_min(clock) == minutes
# ===================================================================
# 9. API endpoint tests (via TestClient)
# ===================================================================
class TestAPI:
    """HTTP-level checks against the FastAPI app, driven through ``TestClient``."""

    @pytest.fixture
    def client(self):
        """Wrap ``server.app.app`` in a ``TestClient``; both imports happen at fixture use."""
        from fastapi.testclient import TestClient
        from server.app import app

        http = TestClient(app)
        return http

    def test_health(self, client) -> None:
        """``GET /health`` answers 200."""
        assert client.get("/health").status_code == 200

    def test_tasks(self, client) -> None:
        """``GET /tasks`` lists five tasks under each difficulty key."""
        reply = client.get("/tasks")
        assert reply.status_code == 200
        listing = reply.json()
        for tier in ("easy", "medium", "hard"):
            assert len(listing[tier]) == 5

    def test_reset_step_state(self, client) -> None:
        """Reset, one step and a state read all answer 200 in sequence."""
        reset_reply = client.post("/reset", params={"task_id": "easy_001"})
        assert reset_reply.status_code == 200

        step_payload = {"action": {"action_type": "view_calendar", "date": "2026-04-25"}}
        step_reply = client.post("/step", json=step_payload)
        assert step_reply.status_code == 200
        assert step_reply.json().get("done") is False

        state_reply = client.get("/state")
        assert state_reply.status_code == 200
        assert "step_count" in state_reply.json()

    def test_mcp_initialize(self, client) -> None:
        """The MCP ``initialize`` call reports the server name ``commitment-os``."""
        request = {"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}}
        reply = client.post("/mcp", json=request)
        assert reply.status_code == 200
        body = reply.json()
        assert body["result"]["serverInfo"]["name"] == "commitment-os"

    def test_mcp_tools_list(self, client) -> None:
        """The MCP ``tools/list`` call exposes exactly the three ``cos_*`` tools."""
        request = {"jsonrpc": "2.0", "id": 2, "method": "tools/list", "params": {}}
        reply = client.post("/mcp", json=request)
        assert reply.status_code == 200
        advertised = reply.json()["result"]["tools"]
        assert len(advertised) == 3
        expected_names = {
            "cos_episode_reset",
            "cos_environment_step",
            "cos_session_snapshot",
        }
        assert {tool["name"] for tool in advertised} == expected_names
# ===================================================================
# 10. Metadata
# ===================================================================
class TestMetadata:
    """Checks on the metadata object returned by the environment."""

    def test_get_metadata(self, env: CommitmentEnvironment) -> None:
        """Metadata carries the project name and an author containing "Jayant"."""
        info = env.get_metadata()
        assert "Jayant" in info.author
        assert info.name == "commitment-os"