# Page-scrape residue, not part of the module: avatar caption "KChad's picture",
# message "new changes", apparent commit id 598a9fb.
import unittest
from pydantic import ValidationError
from env.environment import IDPIExfilEnv
from env.models import (
Action,
EpisodeResult,
HealthResponse,
MetadataResponse,
ResetRequest,
SchemaResponse,
StepResponse,
)
class ModelContractTests(unittest.TestCase):
    """Contract checks for the models imported from ``env.models``.

    Most tests build a model directly from literal values; two of them drive a
    live ``IDPIExfilEnv`` and check the values it hands back.
    """

    def test_send_email_requires_destination(self) -> None:
        """An ``Action`` given only ``type="SendEmail"`` must fail validation."""
        # Callable form of assertRaises: invokes Action(type="SendEmail").
        self.assertRaises(ValidationError, Action, type="SendEmail")

    def test_reset_request_accepts_seed(self) -> None:
        """``ResetRequest`` keeps the difficulty and seed it was built with."""
        request = ResetRequest(difficulty="medium", seed=42)

        self.assertEqual(request.difficulty, "medium")
        self.assertEqual(request.seed, 42)

    def test_health_response_contract(self) -> None:
        """``HealthResponse`` exposes the status it was constructed with."""
        health = HealthResponse(
            status="healthy",
            env="idpi-exfil-env",
            version="0.1.0",
        )

        self.assertEqual(health.status, "healthy")

    def test_metadata_response_contract(self) -> None:
        """``MetadataResponse`` round-trips its list of task names."""
        difficulty_levels = ["easy", "medium", "hard"]
        metadata = MetadataResponse(
            name="idpi-exfil-env",
            description="Prompt injection benchmark.",
            version="0.1.0",
            tasks=list(difficulty_levels),  # separate list from the expected one
        )

        self.assertEqual(metadata.tasks, difficulty_levels)

    def test_schema_response_contract(self) -> None:
        """``SchemaResponse`` accepts one schema dict per section."""
        section_titles = {
            "action": "Action",
            "observation": "Observation",
            "state": "State",
            "reward": "Reward",
            "result": "EpisodeResult",
        }
        schema = SchemaResponse(
            **{section: {"title": title} for section, title in section_titles.items()}
        )

        self.assertEqual(schema.result["title"], "EpisodeResult")

    def test_step_response_can_validate_live_env_payload(self) -> None:
        """The 4-tuple returned by ``env.step`` fits into a ``StepResponse``."""
        environment = IDPIExfilEnv()
        environment.reset(difficulty="easy", seed=7)
        step_output = environment.step(Action(type="ReadDoc"))
        observation, reward, done, info = step_output

        response = StepResponse(
            observation=observation,
            reward=reward,
            done=done,
            info=info,
        )

        self.assertEqual(response.observation.task_difficulty, "easy")

        # The reward value must lie in the closed interval [0, 1].
        reward_value = response.reward.value
        self.assertGreaterEqual(reward_value, 0.0)
        self.assertLessEqual(reward_value, 1.0)

        for expected_key in ("policy", "reward_inputs"):
            self.assertIn(expected_key, response.info)

    def test_episode_result_score_is_strictly_between_zero_and_one(self) -> None:
        """Boundary ``final_score`` inputs read back strictly inside (0, 1).

        NOTE(review): presumably ``EpisodeResult`` nudges 0.0 and 1.0 inward
        during validation -- confirm against ``env.models``.
        """

        def make_result(
            score: float, *, exfil: bool, flagged: bool, completed: bool
        ) -> EpisodeResult:
            # Fresh empty containers on every call; nothing is shared.
            return EpisodeResult(
                final_score=score,
                breakdown={},
                exfil_occurred=exfil,
                injection_flagged=flagged,
                task_completed=completed,
                total_steps=1,
                trajectory=[],
            )

        # Build both results before asserting on either of them.
        lowest = make_result(0.0, exfil=True, flagged=False, completed=False)
        highest = make_result(1.0, exfil=False, flagged=True, completed=True)

        self.assertGreater(lowest.final_score, 0.0)
        self.assertLess(highest.final_score, 1.0)

    def test_live_grader_inputs_scores_are_strictly_between_zero_and_one(self) -> None:
        """Grader input scores from a live episode lie in the open interval (0, 1)."""
        environment = IDPIExfilEnv()
        environment.reset(difficulty="easy", seed=42)
        environment.step(Action(type="ReadDoc"))
        environment.step(Action(type="FlagInjection", reason="suspicious instruction"))

        grader_inputs = environment.get_grader_inputs()

        score_names = (
            "task_completion_score",
            "policy_compliance_score",
            "injection_detection_score",
        )
        for score_name in score_names:
            score = grader_inputs[score_name]
            self.assertGreater(score, 0.0)
            self.assertLess(score, 1.0)
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()