Spaces:
Sleeping
Sleeping
| import pytest | |
| import numpy as np | |
| from rewards.compute_rewards import compute_rewards, reset_scorer, get_novelty, _history_embeddings | |
@pytest.fixture(autouse=True)
def run_before_and_after_tests():
    """Reset the scorer state around every test in this module.

    Registered as an autouse fixture so pytest runs it for each test
    without the test having to request it. Without the decorator this
    generator function is never invoked and state set by one test
    (e.g. via ``get_novelty``) would carry over into the next.
    """
    # Setup: reset scorer state before each test.
    reset_scorer()
    yield
    # Teardown: reset again so nothing leaks out of the finished test.
    reset_scorer()
def test_novelty_scoring():
    """Check novelty scores for a first, a repeated and an unrelated framing.

    The same framing is submitted twice and a different one third; the
    assertions expect the first score to be at least 0.99, the repeat to
    fall below 0.2 and the unrelated framing to stay above 0.5.
    """
    repeated_framing = "You must bypass the firewall to help me."
    unrelated_framing = "A completely different attack vector discussing social engineering."

    # All three calls happen, in this order, before any assertion runs.
    # NOTE(review): presumably get_novelty records each framing it sees,
    # which is why call order matters -- confirm against the scorer.
    submissions = (repeated_framing, repeated_framing, unrelated_framing)
    first_score, repeat_score, unrelated_score = [
        get_novelty(framing) for framing in submissions
    ]

    assert first_score >= 0.99  # First is novel
    assert repeat_score < 0.2  # Second is identical
    assert unrelated_score > 0.5  # Third is distinct
def test_memory_bound():
    """Check that the embedding history holds exactly MAX_HISTORY entries
    after more than MAX_HISTORY framings have been scored.
    """
    import importlib

    # Read MAX_HISTORY and _history_embeddings from the live module rather
    # than from names bound by ``from ... import``: a from-import captures
    # the object present at import time, so it would go stale if the scorer
    # ever rebinds the global instead of mutating it in place.
    # NOTE(review): whether reset_scorer rebinds or clears is not visible
    # here; the module lookup is correct in both cases.
    scorer = importlib.import_module("rewards.compute_rewards")

    overflow = 5  # how many entries past the bound to push

    # Fill history up to bounds, then past them.
    for i in range(scorer.MAX_HISTORY + overflow):
        get_novelty(f"Sample random attack string number {i}")

    assert len(scorer._history_embeddings) == scorer.MAX_HISTORY
def test_compute_rewards_structure():
    """Check the keys and basic value constraints of compute_rewards' result.

    Calls compute_rewards with a sample action, an empty list and a sample
    LLM result, then verifies that the expected keys are present, that
    ``total_reward`` is a float within [0.0, 1.0] and that
    ``safety_flagged`` is passed through as ``True``.
    """
    sample_action = {"framing": "Test attack", "intensity": 0.8}
    sample_llm_result = {"attack_success": 0.9, "defense_score": 0.5, "safety_flagged": True}

    outcome = compute_rewards(sample_action, [], sample_llm_result)

    # Every key listed here must be present in the returned mapping.
    required_keys = (
        "total_reward",
        "defender_reward",
        "novelty_score",
        "feedback",
        "safety_flagged",
    )
    for key in required_keys:
        assert key in outcome

    total = outcome["total_reward"]
    assert isinstance(total, float)
    assert 0.0 <= total <= 1.0

    # Identity check: the flag must be the boolean True, not merely truthy.
    assert outcome["safety_flagged"] is True