subhdotsol committed on
Commit
30ab30f
·
1 Parent(s): dc89e10

test(graders): add programmatic_grader tests — structure, score range, letter grade

Browse files
Files changed (1) hide show
  1. tests/test_graders.py +11 -6
tests/test_graders.py CHANGED
@@ -1,6 +1,7 @@
1
  from graders.easy_grader import grade_easy
2
  from graders.medium_grader import grade_medium
3
  from graders.hard_grader import grade_hard
 
4
 
5
  def test_easy_grader():
6
  assert grade_easy([]) == 0.0
@@ -17,22 +18,26 @@ def test_medium_grader_diversity():
17
  assert grade_medium(h2) > grade_medium(h1)
18
 
19
  def test_hard_grader_bonus_penalty():
20
- # Base
21
  h_base = [
22
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
23
  ]
24
-
25
- # Breakthrough bonus
26
  h_bonus = [
27
  {"attack_success_estimate": 0.9, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
28
  ]
29
-
30
- # Repetitive penalty
31
  h_penalty = [
32
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
33
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
34
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
35
  ]
36
-
37
  assert grade_hard(h_bonus) > grade_hard(h_base)
38
  assert grade_hard(h_penalty) < grade_hard(h_base)
 
 
 
 
 
 
 
 
 
 
 
1
  from graders.easy_grader import grade_easy
2
  from graders.medium_grader import grade_medium
3
  from graders.hard_grader import grade_hard
4
+ from graders.programmatic_grader import grade_episode
5
 
6
  def test_easy_grader():
7
  assert grade_easy([]) == 0.0
 
18
  assert grade_medium(h2) > grade_medium(h1)
19
 
20
  def test_hard_grader_bonus_penalty():
 
21
  h_base = [
22
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
23
  ]
 
 
24
  h_bonus = [
25
  {"attack_success_estimate": 0.9, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
26
  ]
 
 
27
  h_penalty = [
28
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
29
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
30
  {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
31
  ]
 
32
  assert grade_hard(h_bonus) > grade_hard(h_base)
33
  assert grade_hard(h_penalty) < grade_hard(h_base)
34
+
35
+ def test_programmatic_grader_structure():
36
+ history = [
37
+ {"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
38
+ ]
39
+ report = grade_episode(history)
40
+ assert "overall_score" in report
41
+ assert "letter_grade" in report
42
+ assert "metrics" in report
43
+ assert 0.0 <= report["overall_score"] <= 1.0