Spaces:
Sleeping
Sleeping
Commit ·
30ab30f
1
Parent(s): dc89e10
test(graders): add programmatic_grader tests — structure, score range, letter grade
Browse files- tests/test_graders.py +11 -6
tests/test_graders.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from graders.easy_grader import grade_easy
|
| 2 |
from graders.medium_grader import grade_medium
|
| 3 |
from graders.hard_grader import grade_hard
|
|
|
|
| 4 |
|
| 5 |
def test_easy_grader():
|
| 6 |
assert grade_easy([]) == 0.0
|
|
@@ -17,22 +18,26 @@ def test_medium_grader_diversity():
|
|
| 17 |
assert grade_medium(h2) > grade_medium(h1)
|
| 18 |
|
| 19 |
def test_hard_grader_bonus_penalty():
|
| 20 |
-
# Base
|
| 21 |
h_base = [
|
| 22 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
|
| 23 |
]
|
| 24 |
-
|
| 25 |
-
# Breakthrough bonus
|
| 26 |
h_bonus = [
|
| 27 |
{"attack_success_estimate": 0.9, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
|
| 28 |
]
|
| 29 |
-
|
| 30 |
-
# Repetitive penalty
|
| 31 |
h_penalty = [
|
| 32 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
|
| 33 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
|
| 34 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
|
| 35 |
]
|
| 36 |
-
|
| 37 |
assert grade_hard(h_bonus) > grade_hard(h_base)
|
| 38 |
assert grade_hard(h_penalty) < grade_hard(h_base)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from graders.easy_grader import grade_easy
|
| 2 |
from graders.medium_grader import grade_medium
|
| 3 |
from graders.hard_grader import grade_hard
|
| 4 |
+
from graders.programmatic_grader import grade_episode
|
| 5 |
|
| 6 |
def test_easy_grader():
|
| 7 |
assert grade_easy([]) == 0.0
|
|
|
|
| 18 |
assert grade_medium(h2) > grade_medium(h1)
|
| 19 |
|
| 20 |
def test_hard_grader_bonus_penalty():
|
|
|
|
| 21 |
h_base = [
|
| 22 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
|
| 23 |
]
|
|
|
|
|
|
|
| 24 |
h_bonus = [
|
| 25 |
{"attack_success_estimate": 0.9, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
|
| 26 |
]
|
|
|
|
|
|
|
| 27 |
h_penalty = [
|
| 28 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
|
| 29 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"},
|
| 30 |
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
|
| 31 |
]
|
|
|
|
| 32 |
assert grade_hard(h_bonus) > grade_hard(h_base)
|
| 33 |
assert grade_hard(h_penalty) < grade_hard(h_base)
|
| 34 |
+
|
| 35 |
+
def test_programmatic_grader_structure():
|
| 36 |
+
history = [
|
| 37 |
+
{"attack_success_estimate": 0.5, "strategy_type": "roleplay", "novelty_score": 0.5, "target_category": "privacy"}
|
| 38 |
+
]
|
| 39 |
+
report = grade_episode(history)
|
| 40 |
+
assert "overall_score" in report
|
| 41 |
+
assert "letter_grade" in report
|
| 42 |
+
assert "metrics" in report
|
| 43 |
+
assert 0.0 <= report["overall_score"] <= 1.0
|