File size: 3,699 Bytes
42d0b80
 
 
 
 
 
 
fe5a027
42d0b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe5a027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598a9fb
 
 
 
 
 
 
 
 
 
 
 
 
 
42d0b80
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import unittest

from pydantic import ValidationError

from env.environment import IDPIExfilEnv
from env.models import (
    Action,
    EpisodeResult,
    HealthResponse,
    MetadataResponse,
    ResetRequest,
    SchemaResponse,
    StepResponse,
)


class ModelContractTests(unittest.TestCase):
    """Contract checks for the models imported from ``env.models``.

    Most tests build a model directly from literal values; two of them drive a
    live ``IDPIExfilEnv`` and inspect what it returns.
    """

    def test_send_email_requires_destination(self) -> None:
        """A ``SendEmail`` action given only its type must fail validation."""
        self.assertRaises(ValidationError, Action, type="SendEmail")

    def test_reset_request_accepts_seed(self) -> None:
        """``ResetRequest`` exposes the difficulty and seed it was built with."""
        expected_fields = {"difficulty": "medium", "seed": 42}
        request = ResetRequest(**expected_fields)

        for field_name, wanted in expected_fields.items():
            self.assertEqual(getattr(request, field_name), wanted)

    def test_health_response_contract(self) -> None:
        """``HealthResponse`` takes status, env and version; status reads back."""
        health = HealthResponse(
            status="healthy",
            env="idpi-exfil-env",
            version="0.1.0",
        )
        self.assertEqual(health.status, "healthy")

    def test_metadata_response_contract(self) -> None:
        """``MetadataResponse`` reports the task list it was constructed with."""
        difficulties = ("easy", "medium", "hard")
        metadata = MetadataResponse(
            name="idpi-exfil-env",
            description="Prompt injection benchmark.",
            version="0.1.0",
            tasks=list(difficulties),
        )
        self.assertEqual(metadata.tasks, list(difficulties))

    def test_schema_response_contract(self) -> None:
        """``SchemaResponse`` accepts one schema dict per section."""
        section_titles = {
            "action": "Action",
            "observation": "Observation",
            "state": "State",
            "reward": "Reward",
            "result": "EpisodeResult",
        }
        schema = SchemaResponse(
            **{section: {"title": title} for section, title in section_titles.items()}
        )
        self.assertEqual(schema.result["title"], "EpisodeResult")

    def test_step_response_can_validate_live_env_payload(self) -> None:
        """The 4-tuple returned by ``env.step`` fits into a ``StepResponse``."""
        environment = IDPIExfilEnv()
        environment.reset(difficulty="easy", seed=7)
        observation, reward, done, info = environment.step(Action(type="ReadDoc"))

        wrapped = StepResponse(
            observation=observation,
            reward=reward,
            done=done,
            info=info,
        )

        self.assertEqual(wrapped.observation.task_difficulty, "easy")
        # The reward value is expected to fall within the closed range [0, 1].
        self.assertGreaterEqual(wrapped.reward.value, 0.0)
        self.assertLessEqual(wrapped.reward.value, 1.0)
        for required_key in ("policy", "reward_inputs"):
            self.assertIn(required_key, wrapped.info)

    def test_episode_result_score_is_strictly_between_zero_and_one(self) -> None:
        """Boundary ``final_score`` inputs must read back strictly inside (0, 1).

        NOTE(review): this presumably relies on ``EpisodeResult`` adjusting
        scores of exactly 0.0 and 1.0 -- confirm against ``env.models``.
        """

        def make_result(score: float, *, succeeded: bool) -> EpisodeResult:
            # One-step episode with an empty breakdown and trajectory; the
            # three outcome flags are all derived from ``succeeded``.
            return EpisodeResult(
                final_score=score,
                breakdown={},
                exfil_occurred=not succeeded,
                injection_flagged=succeeded,
                task_completed=succeeded,
                total_steps=1,
                trajectory=[],
            )

        worst_case = make_result(0.0, succeeded=False)
        best_case = make_result(1.0, succeeded=True)

        self.assertGreater(worst_case.final_score, 0.0)
        self.assertLess(best_case.final_score, 1.0)

    def test_live_grader_inputs_scores_are_strictly_between_zero_and_one(self) -> None:
        """Grader input scores after a short episode lie strictly inside (0, 1)."""
        environment = IDPIExfilEnv()
        environment.reset(difficulty="easy", seed=42)

        # Each action is built immediately before it is stepped.
        scripted_actions = (
            {"type": "ReadDoc"},
            {"type": "FlagInjection", "reason": "suspicious instruction"},
        )
        for action_fields in scripted_actions:
            environment.step(Action(**action_fields))

        scores = environment.get_grader_inputs()
        checked_keys = (
            "task_completion_score",
            "policy_compliance_score",
            "injection_detection_score",
        )
        for score_key in checked_keys:
            self.assertGreater(scores[score_key], 0.0)
            self.assertLess(scores[score_key], 1.0)


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()