Spaces:
Sleeping
Sleeping
| """Validation tests for models.py — Phase 1 acceptance.""" | |
| from __future__ import annotations | |
| import pytest | |
| from pydantic import ValidationError | |
| from models import ( | |
| Action, | |
| CompleteParams, | |
| DriftEvent, | |
| DriftReportParams, | |
| EpisodeState, | |
| HistoryStep, | |
| InspectParams, | |
| Observation, | |
| RetryParams, | |
| RewardBreakdown, | |
| ToolCallParams, | |
| ToolResponse, | |
| ) | |
def test_imports() -> None:
    """Check that all twelve model classes import and expose the model API.

    Each class must have both a ``model_validate`` and a
    ``model_dump_json`` attribute.
    """
    exported = (
        ToolCallParams,
        InspectParams,
        RetryParams,
        DriftReportParams,
        CompleteParams,
        Action,
        ToolResponse,
        HistoryStep,
        Observation,
        DriftEvent,
        RewardBreakdown,
        EpisodeState,
    )
    # Guards against a class being dropped from the collection by accident.
    assert len(exported) == 12
    for model_cls in exported:
        for attr_name in ("model_validate", "model_dump_json"):
            assert hasattr(model_cls, attr_name)
def test_action_serialization() -> None:
    """Round-trip one ``Action`` per action type through JSON.

    Every action is serialized with ``model_dump_json`` and parsed back
    with ``Action.model_validate_json``; the parsed object must compare
    equal to the one it was produced from.
    """
    send_mail = Action(
        type="call_tool",
        tool_call=ToolCallParams(
            tool="mail",
            endpoint="send_message",
            params={"to": "a@b.com", "subject": "hi", "body": "hello"},
        ),
    )
    inspect_calendar = Action(
        type="inspect_schema",
        inspect=InspectParams(tool="calendar"),
    )
    retry_search = Action(
        type="retry_with_variant",
        retry=RetryParams(
            tool="crm",
            endpoint="search_contacts",
            params={"email_address": "x@y.com"},
        ),
    )
    report_rename = Action(
        type="report_drift",
        report=DriftReportParams(
            tool="calendar",
            drift_kind="field_rename",
            description="attendees renamed to participants",
        ),
    )
    finish = Action(
        type="complete_task",
        complete=CompleteParams(summary="all done"),
    )

    samples = [send_mail, inspect_calendar, retry_search, report_rename, finish]
    assert len(samples) == 5

    for sample in samples:
        encoded = sample.model_dump_json()
        # The dump must be a non-empty JSON string before we try to parse it.
        assert isinstance(encoded, str)
        assert len(encoded) > 0
        decoded = Action.model_validate_json(encoded)
        assert decoded == sample
def test_observation_validation() -> None:
    """Construct a fully populated ``Observation`` and spot-check its fields.

    The observation carries a one-step history and a last response; the
    test passes if construction succeeds and the checked fields read
    back as supplied.
    """
    inspect_mail = Action(type="inspect_schema", inspect=InspectParams(tool="mail"))
    ok_response = ToolResponse(ok=True, status=200, body={"messages": []}, error=None)
    first_step = HistoryStep(
        step=0,
        action=inspect_mail,
        response=ok_response,
        reward_breakdown={"shaped_total": 0.10},
    )

    # Gather the keyword arguments first so the constructor call stays short.
    fields = {
        "episode_id": "ep-1",
        "task_id": "E1_onboard_new_hire",
        "difficulty": "easy",
        "step": 1,
        "max_steps": 8,
        "token_budget_remaining": 3900,
        "task_description": "Send welcome email and create orientation event.",
        "success_criteria": ["welcome email sent", "calendar event created"],
        "tool_schemas": {
            "mail": {"send_message": {"required": ["to", "subject", "body"]}},
        },
        "known_state": {"mail.sent_count": 0},
        "history": [first_step],
        "last_response": ok_response,
        "drift_events_visible": [{"tool": "calendar", "kind": "field_rename"}],
        "done": False,
        "feedback": "ok",
    }
    observation = Observation(**fields)

    assert observation.step == 1
    assert observation.history[0].response is not None
    assert observation.last_response is not None
    assert observation.last_response.ok is True
def test_episode_state_with_drifts() -> None:
    """Build an ``EpisodeState`` with a two-event drift plan and check it.

    Verifies that the drift plan is stored in order and that the fields
    not passed to the constructor (``step``, ``agent_state``, ``history``,
    ``cumulative_reward``, and each drift's ``detected_by_agent``) take
    the default values asserted below.
    """
    mail_deprecation = DriftEvent(
        tool="mail",
        endpoint="send_message",
        kind="endpoint_deprecation",
        fires_at_step=1,
        details={"replacement": "messages.send"},
    )
    calendar_rename = DriftEvent(
        tool="calendar",
        endpoint="create_event",
        kind="field_rename",
        fires_at_step=3,
        details={"from": "attendees", "to": "participants"},
    )

    episode = EpisodeState(
        episode_id="ep-xyz",
        task_id="E1_onboard_new_hire",
        difficulty="easy",
        max_steps=8,
        token_budget=4000,
        token_budget_remaining=4000,
        drift_plan=[mail_deprecation, calendar_rename],
        ground_truth_final_state={"mail.sent_count": 1, "calendar.events_count": 1},
    )

    # Fields that were not supplied above.
    assert episode.step == 0

    # The drift plan keeps both events, in the order given.
    plan = episode.drift_plan
    assert len(plan) == 2
    assert plan[0].tool == "mail"
    assert plan[1].fires_at_step == 3
    assert plan[0].detected_by_agent is False

    # Remaining defaulted fields.
    assert episode.agent_state == {}
    assert episode.history == []
    assert episode.cumulative_reward == 0.0
def test_reward_breakdown_defaults() -> None:
    """Check the field values of a ``RewardBreakdown`` built with no arguments.

    The two gate fields are expected to be ``1.0``; every other checked
    field is expected to be ``0.0``.
    """
    expected_defaults = {
        "task_completion": 0.0,
        "drift_detection": 0.0,
        "adaptation_quality": 0.0,
        "efficiency": 0.0,
        "catastrophic_gate": 1.0,
        "correct_final_gate": 1.0,
        "step_shaping": 0.0,
        "shaped_total": 0.0,
        "binary": 0.0,
    }
    breakdown = RewardBreakdown()
    for field_name, expected in expected_defaults.items():
        assert getattr(breakdown, field_name) == expected
def test_literal_enforcement() -> None:
    """Check that out-of-vocabulary values raise ``ValidationError``.

    Three constructions are attempted, each with one unsupported value:
    an ``Action`` type, an ``InspectParams`` tool, and a
    ``DriftReportParams`` drift kind.
    """
    # Each entry is a thunk, so construction happens inside ``pytest.raises``.
    invalid_builders = (
        lambda: Action(type="teleport"),  # type: ignore[arg-type]
        lambda: InspectParams(tool="salesforce"),  # type: ignore[arg-type]
        lambda: DriftReportParams(
            tool="mail",
            drift_kind="meteor_strike",  # type: ignore[arg-type]
            description="not a real drift kind",
        ),
    )
    for build in invalid_builders:
        with pytest.raises(ValidationError):
            build()