Spaces:

ANI00
/

content-moderation-env

Sleeping

App Files Files Community

ANI00 committed on Apr 11

Commit

0261d2e

verified ·

1 Parent(s): 1133d32

Add: Task and grader registry for validation

Browse files

Files changed (1) hide show

test/test.py +24 -148

test/test.py CHANGED Viewed

@@ -5,7 +5,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 import pytest
 from server.models import ModerationAction, ContentObservation, StepResult, ResetResult, EnvState
 from server.env import ContentModerationEnv
-from server.graders import grade_text_spam, grade_content_moderation, grade_deepfake, grade_misinformation, GRADERS
 from server.tasks import TASKS, TASK_NAMES
@@ -243,57 +243,43 @@ def test_deepfake_obs_has_image_description():
     assert obs.content_type == "multimodal"
-# ========== COMPREHENSIVE GRADER TESTS ==========
-# 5+ tests per grader (20+ total)
-# --- TEXT SPAM GRADER (5 tests) ---
 def test_text_spam_1_correct_reject():
-    """Test 1: Correct spam rejection with high confidence"""
     gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r = grade_text_spam(
         make_action_dict("reject", confidence=0.9, labels=["spam"]), gt
     )
     assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_text_spam_1_correct_reject: {r}")
 def test_text_spam_2_correct_approve():
-    """Test 2: Correct legitimate email approval"""
     gt = {"decision": "approve", "labels": [], "is_harmful": False}
     r = grade_text_spam(
         make_action_dict("approve", confidence=0.85, labels=[]), gt
     )
     assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_text_spam_2_correct_approve: {r}")
 def test_text_spam_3_wrong_decision_penalty():
-    """Test 3: Wrong decision gets penalized"""
     gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r = grade_text_spam(
         make_action_dict("approve", confidence=0.9, labels=[]), gt
     )
     assert 0.0 <= r <= 1.0
-    assert r < 0.4, f"Expected < 0.4, got {r}"
-    print(f"✓ test_text_spam_3_wrong_decision_penalty: {r}")
 def test_text_spam_4_escalate_partial_credit():
-    """Test 4: Escalate action gets partial credit"""
     gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r = grade_text_spam(
         make_action_dict("escalate", confidence=0.5, labels=["spam"]), gt
     )
     assert 0.0 <= r <= 1.0
-    assert 0.15 <= r <= 0.5, f"Expected 0.15-0.5, got {r}"
-    print(f"✓ test_text_spam_4_escalate_partial_credit: {r}")
 def test_text_spam_5_low_confidence_penalty():
-    """Test 5: Low confidence penalizes score"""
     gt_reject = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r_high = grade_text_spam(
         make_action_dict("reject", confidence=0.9, labels=["spam"]), gt_reject
@@ -303,36 +289,28 @@ def test_text_spam_5_low_confidence_penalty():
     )
     assert 0.0 <= r_high <= 1.0
     assert 0.0 <= r_low <= 1.0
-    assert r_high > r_low, f"High confidence ({r_high}) should beat low ({r_low})"
-    print(f"✓ test_text_spam_5_low_confidence_penalty: high={r_high}, low={r_low}")
-# --- CONTENT MODERATION GRADER (5 tests) ---
 def test_content_mod_1_correct_reject_multilabel():
-    """Test 1: Correct multi-label rejection"""
     gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
     r = grade_content_moderation(
         make_action_dict("reject", confidence=0.9, labels=["hate_speech", "harassment"]), gt
     )
     assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_content_mod_1_correct_reject_multilabel: {r}")
 def test_content_mod_2_correct_approve():
-    """Test 2: Correct benign content approval"""
     gt = {"decision": "approve", "labels": [], "is_harmful": False}
     r = grade_content_moderation(
         make_action_dict("approve", confidence=0.8, labels=[]), gt
     )
     assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_content_mod_2_correct_approve: {r}")
 def test_content_mod_3_partial_label_match():
-    """Test 3: Partial label match gets partial credit"""
     gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
     r_partial = grade_content_moderation(
         make_action_dict("reject", confidence=0.7, labels=["hate_speech"]), gt
@@ -342,23 +320,19 @@ def test_content_mod_3_partial_label_match():
     )
     assert 0.0 <= r_partial <= 1.0
     assert 0.0 <= r_full <= 1.0
-    assert r_full > r_partial, f"Full match ({r_full}) should beat partial ({r_partial})"
-    print(f"✓ test_content_mod_3_partial_label_match: partial={r_partial}, full={r_full}")
 def test_content_mod_4_flag_on_harmful():
-    """Test 4: Flag action on harmful content gets partial credit"""
     gt = {"decision": "reject", "labels": ["violence"], "is_harmful": True}
     r = grade_content_moderation(
         make_action_dict("flag", confidence=0.5, labels=["violence"]), gt
     )
     assert 0.0 <= r <= 1.0
-    assert 0.2 <= r <= 0.6, f"Expected 0.2-0.6 for flag, got {r}"
-    print(f"✓ test_content_mod_4_flag_on_harmful: {r}")
 def test_content_mod_5_wrong_labels_hurt_score():
-    """Test 5: Wrong labels hurt the score"""
     gt = {"decision": "reject", "labels": ["hate_speech"], "is_harmful": True}
     r_correct_labels = grade_content_moderation(
         make_action_dict("reject", confidence=0.9, labels=["hate_speech"]), gt
@@ -368,58 +342,46 @@ def test_content_mod_5_wrong_labels_hurt_score():
     )
     assert 0.0 <= r_correct_labels <= 1.0
     assert 0.0 <= r_wrong_labels <= 1.0
-    assert r_correct_labels > r_wrong_labels, f"Correct labels ({r_correct_labels}) should beat wrong ({r_wrong_labels})"
-    print(f"✓ test_content_mod_5_wrong_labels_hurt_score: correct={r_correct_labels}, wrong={r_wrong_labels}")
-# --- DEEPFAKE GRADER (5 tests) ---
 def test_deepfake_1_correct_detection():
-    """Test 1: Correct deepfake detection"""
     gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
     r = grade_deepfake(
         make_action_dict("reject", confidence=0.95, labels=["deepfake"]), gt, detector_score=0.85
     )
     assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_deepfake_1_correct_detection: {r}")
 def test_deepfake_2_correct_authentic():
-    """Test 2: Correct authentic content approval"""
     gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
     r = grade_deepfake(
         make_action_dict("approve", confidence=0.9, labels=[]), gt, detector_score=0.05
     )
     assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_deepfake_2_correct_authentic: {r}")
 def test_deepfake_3_false_positive_penalty():
-    """Test 3: False positive (flagging authentic content) gets penalized"""
     gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
     r = grade_deepfake(
         make_action_dict("reject", confidence=0.8, labels=["deepfake"]), gt, detector_score=0.1
     )
     assert 0.0 <= r <= 1.0
-    assert r < 0.4, f"Expected < 0.4 for false positive, got {r}"
-    print(f"✓ test_deepfake_3_false_positive_penalty: {r}")
 def test_deepfake_4_escalate_uncertain():
-    """Test 4: Escalate on uncertain deepfake gets partial credit"""
     gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
     r = grade_deepfake(
         make_action_dict("escalate", confidence=0.5, labels=["deepfake"]), gt, detector_score=0.5
     )
     assert 0.0 <= r <= 1.0
-    assert 0.15 <= r <= 0.5, f"Expected 0.15-0.5 for escalate, got {r}"
-    print(f"✓ test_deepfake_4_escalate_uncertain: {r}")
 def test_deepfake_5_missing_label_hurts():
-    """Test 5: Missing deepfake label hurts score"""
     gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
     r_missing_label = grade_deepfake(
         make_action_dict("reject", confidence=0.7, labels=[]), gt, detector_score=0.8
@@ -429,91 +391,21 @@ def test_deepfake_5_missing_label_hurts():
     )
     assert 0.0 <= r_missing_label <= 1.0
     assert 0.0 <= r_with_label <= 1.0
-    assert r_with_label > r_missing_label, f"With label ({r_with_label}) should beat missing ({r_missing_label})"
-    print(f"✓ test_deepfake_5_missing_label_hurts: with_label={r_with_label}, missing={r_missing_label}")
-# --- MISINFORMATION GRADER (5 tests) ---
-def test_misinformation_1_correct_rejection():
-    """Test 1: Correct misinformation detection"""
-    gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
-    r = grade_misinformation(
-        make_action_dict("reject", confidence=0.9, labels=["misinformation"]), gt
-    )
-    assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_misinformation_1_correct_rejection: {r}")
-def test_misinformation_2_correct_approval():
-    """Test 2: Correct factual content approval"""
-    gt = {"decision": "approve", "labels": [], "is_harmful": False}
-    r = grade_misinformation(
-        make_action_dict("approve", confidence=0.85, labels=[]), gt
-    )
-    assert 0.0 <= r <= 1.0
-    assert r > 0.6, f"Expected > 0.6, got {r}"
-    print(f"✓ test_misinformation_2_correct_approval: {r}")
-def test_misinformation_3_flag_on_uncertain():
-    """Test 3: Flag on uncertain misinformation gets partial credit"""
-    gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
-    r = grade_misinformation(
-        make_action_dict("flag", confidence=0.5, labels=["misinformation"]), gt
-    )
-    assert 0.0 <= r <= 1.0
-    assert 0.15 <= r <= 0.5, f"Expected 0.15-0.5 for flag, got {r}"
-    print(f"✓ test_misinformation_3_flag_on_uncertain: {r}")
-def test_misinformation_4_wrong_decision():
-    """Test 4: Wrong decision gets penalized"""
-    gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
-    r = grade_misinformation(
-        make_action_dict("approve", confidence=0.8, labels=[]), gt
-    )
-    assert 0.0 <= r <= 1.0
-    assert r < 0.4, f"Expected < 0.4 for wrong decision, got {r}"
-    print(f"✓ test_misinformation_4_wrong_decision: {r}")
-def test_misinformation_5_confidence_calibration():
-    """Test 5: High confidence on wrong answer gets extra penalty"""
-    gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
-    r_low_conf = grade_misinformation(
-        make_action_dict("approve", confidence=0.1, labels=[]), gt
-    )
-    r_high_conf = grade_misinformation(
-        make_action_dict("approve", confidence=0.95, labels=[]), gt
-    )
-    assert 0.0 <= r_low_conf <= 1.0
-    assert 0.0 <= r_high_conf <= 1.0
-    assert r_low_conf > r_high_conf, f"Low confidence ({r_low_conf}) should beat high bad confidence ({r_high_conf})"
-    print(f"✓ test_misinformation_5_confidence_calibration: low_conf={r_low_conf}, high_conf={r_high_conf}")
-# --- GRADER REGISTRY VALIDATION (5 tests) ---
-def test_registry_1_all_4_graders_exist():
-    """Test 1: All 4 graders registered"""
     assert "text_spam" in GRADERS
     assert "content_moderation" in GRADERS
     assert "deepfake_detection" in GRADERS
-    assert "misinformation_detection" in GRADERS
-    print(f"✓ test_registry_1_all_4_graders_exist: {len(GRADERS)} graders found")
 def test_registry_2_all_graders_callable():
-    """Test 2: All graders are callable functions"""
     for task_name, grader in GRADERS.items():
-        assert callable(grader), f"Grader for {task_name} is not callable"
-    print(f"✓ test_registry_2_all_graders_callable: All {len(GRADERS)} graders are callable")
 def test_registry_3_all_graders_return_valid_scores():
-    """Test 3: All graders return scores in valid range [0.0, 1.0]"""
     test_cases = {
         "text_spam": (
             {"decision": "approve", "confidence": 0.5, "labels": []},
@@ -530,11 +422,6 @@ def test_registry_3_all_graders_return_valid_scores():
             {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False},
             0.5
         ),
-        "misinformation_detection": (
-            {"decision": "approve", "confidence": 0.5, "labels": []},
-            {"decision": "approve", "labels": [], "is_harmful": False},
-            None
-        ),
     }
     for task_name, (action, ground_truth, detector_score) in test_cases.items():
@@ -543,13 +430,11 @@ def test_registry_3_all_graders_return_valid_scores():
             score = grader(action, ground_truth, detector_score)
         else:
             score = grader(action, ground_truth)
-        assert isinstance(score, (int, float)), f"{task_name} returned non-numeric score"
-        assert 0.0 <= score <= 1.0, f"{task_name} returned score outside [0.0, 1.0]: {score}"
-    print(f"✓ test_registry_3_all_graders_return_valid_scores: All scores in [0.0, 1.0]")
 def test_registry_4_graders_distinguish_performance():
-    """Test 4: Graders differentiate between good and bad actions"""
     test_pairs = {
         "text_spam": (
             ({"decision": "reject", "confidence": 0.9, "labels": ["spam"]},
@@ -569,12 +454,6 @@ def test_registry_4_graders_distinguish_performance():
             ({"decision": "approve", "confidence": 0.9, "labels": []},
              {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True})
         ),
-        "misinformation_detection": (
-            ({"decision": "reject", "confidence": 0.9, "labels": ["misinformation"]},
-             {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}),
-            ({"decision": "approve", "confidence": 0.9, "labels": []},
-             {"decision": "reject", "labels": ["misinformation"], "is_harmful": True})
-        ),
     }
     for task_name, (good_pair, bad_pair) in test_pairs.items():
@@ -589,12 +468,10 @@ def test_registry_4_graders_distinguish_performance():
             score_good = grader(good_action, good_gt)
             score_bad = grader(bad_action, bad_gt)
-        assert score_good > score_bad, f"{task_name}: good action ({score_good}) should score > bad action ({score_bad})"
-    print(f"✓ test_registry_4_graders_distinguish_performance: All graders differentiate good/bad")
 def test_registry_5_boundary_confidence_values():
-    """Test 5: Graders handle boundary confidence values (0.0, 1.0)"""
     action_0 = {"decision": "approve", "confidence": 0.0, "labels": []}
     action_100 = {"decision": "approve", "confidence": 1.0, "labels": []}
     gt = {"decision": "approve", "labels": [], "is_harmful": False}
@@ -607,8 +484,7 @@ def test_registry_5_boundary_confidence_values():
             score_0 = grader(action_0, gt)
             score_100 = grader(action_100, gt)
-        assert 0.0 <= score_0 <= 1.0, f"{task_name} failed on confidence=0.0"
-        assert 0.0 <= score_100 <= 1.0, f"{task_name} failed on confidence=1.0"
-        assert score_100 >= score_0, f"{task_name}: high confidence should >= low confidence"
-    print(f"✓ test_registry_5_boundary_confidence_values: All graders handle boundaries")

 import pytest
 from server.models import ModerationAction, ContentObservation, StepResult, ResetResult, EnvState
 from server.env import ContentModerationEnv
+from server.graders import grade_text_spam, grade_content_moderation, grade_deepfake, GRADERS
 from server.tasks import TASKS, TASK_NAMES
     assert obs.content_type == "multimodal"
 def test_text_spam_1_correct_reject():
     gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r = grade_text_spam(
         make_action_dict("reject", confidence=0.9, labels=["spam"]), gt
     )
     assert 0.0 <= r <= 1.0
+    assert r > 0.6
 def test_text_spam_2_correct_approve():
     gt = {"decision": "approve", "labels": [], "is_harmful": False}
     r = grade_text_spam(
         make_action_dict("approve", confidence=0.85, labels=[]), gt
     )
     assert 0.0 <= r <= 1.0
+    assert r > 0.6
 def test_text_spam_3_wrong_decision_penalty():
     gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r = grade_text_spam(
         make_action_dict("approve", confidence=0.9, labels=[]), gt
     )
     assert 0.0 <= r <= 1.0
+    assert r < 0.4
 def test_text_spam_4_escalate_partial_credit():
     gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r = grade_text_spam(
         make_action_dict("escalate", confidence=0.5, labels=["spam"]), gt
     )
     assert 0.0 <= r <= 1.0
+    assert 0.15 <= r <= 0.5
 def test_text_spam_5_low_confidence_penalty():
     gt_reject = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
     r_high = grade_text_spam(
         make_action_dict("reject", confidence=0.9, labels=["spam"]), gt_reject
     )
     assert 0.0 <= r_high <= 1.0
     assert 0.0 <= r_low <= 1.0
+    assert r_high > r_low
 def test_content_mod_1_correct_reject_multilabel():
     gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
     r = grade_content_moderation(
         make_action_dict("reject", confidence=0.9, labels=["hate_speech", "harassment"]), gt
     )
     assert 0.0 <= r <= 1.0
+    assert r > 0.6
 def test_content_mod_2_correct_approve():
     gt = {"decision": "approve", "labels": [], "is_harmful": False}
     r = grade_content_moderation(
         make_action_dict("approve", confidence=0.8, labels=[]), gt
     )
     assert 0.0 <= r <= 1.0
+    assert r > 0.6
 def test_content_mod_3_partial_label_match():
     gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
     r_partial = grade_content_moderation(
         make_action_dict("reject", confidence=0.7, labels=["hate_speech"]), gt
     )
     assert 0.0 <= r_partial <= 1.0
     assert 0.0 <= r_full <= 1.0
+    assert r_full > r_partial
 def test_content_mod_4_flag_on_harmful():
     gt = {"decision": "reject", "labels": ["violence"], "is_harmful": True}
     r = grade_content_moderation(
         make_action_dict("flag", confidence=0.5, labels=["violence"]), gt
     )
     assert 0.0 <= r <= 1.0
+    assert 0.2 <= r <= 0.6
 def test_content_mod_5_wrong_labels_hurt_score():
     gt = {"decision": "reject", "labels": ["hate_speech"], "is_harmful": True}
     r_correct_labels = grade_content_moderation(
         make_action_dict("reject", confidence=0.9, labels=["hate_speech"]), gt
     )
     assert 0.0 <= r_correct_labels <= 1.0
     assert 0.0 <= r_wrong_labels <= 1.0
+    assert r_correct_labels > r_wrong_labels
 def test_deepfake_1_correct_detection():
     gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
     r = grade_deepfake(
         make_action_dict("reject", confidence=0.95, labels=["deepfake"]), gt, detector_score=0.85
     )
     assert 0.0 <= r <= 1.0
+    assert r > 0.6
 def test_deepfake_2_correct_authentic():
     gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
     r = grade_deepfake(
         make_action_dict("approve", confidence=0.9, labels=[]), gt, detector_score=0.05
     )
     assert 0.0 <= r <= 1.0
+    assert r > 0.6
 def test_deepfake_3_false_positive_penalty():
     gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
     r = grade_deepfake(
         make_action_dict("reject", confidence=0.8, labels=["deepfake"]), gt, detector_score=0.1
     )
     assert 0.0 <= r <= 1.0
+    assert r < 0.4
 def test_deepfake_4_escalate_uncertain():
     gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
     r = grade_deepfake(
         make_action_dict("escalate", confidence=0.5, labels=["deepfake"]), gt, detector_score=0.5
     )
     assert 0.0 <= r <= 1.0
+    assert 0.15 <= r <= 0.5
 def test_deepfake_5_missing_label_hurts():
     gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
     r_missing_label = grade_deepfake(
         make_action_dict("reject", confidence=0.7, labels=[]), gt, detector_score=0.8
     )
     assert 0.0 <= r_missing_label <= 1.0
     assert 0.0 <= r_with_label <= 1.0
+    assert r_with_label > r_missing_label
+def test_registry_1_all_3_graders_exist():
     assert "text_spam" in GRADERS
     assert "content_moderation" in GRADERS
     assert "deepfake_detection" in GRADERS
 def test_registry_2_all_graders_callable():
     for task_name, grader in GRADERS.items():
+        assert callable(grader)
 def test_registry_3_all_graders_return_valid_scores():
     test_cases = {
         "text_spam": (
             {"decision": "approve", "confidence": 0.5, "labels": []},
             {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False},
             0.5
         ),
     }
     for task_name, (action, ground_truth, detector_score) in test_cases.items():
             score = grader(action, ground_truth, detector_score)
         else:
             score = grader(action, ground_truth)
+        assert isinstance(score, (int, float))
+        assert 0.0 <= score <= 1.0
 def test_registry_4_graders_distinguish_performance():
     test_pairs = {
         "text_spam": (
             ({"decision": "reject", "confidence": 0.9, "labels": ["spam"]},
             ({"decision": "approve", "confidence": 0.9, "labels": []},
              {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True})
         ),
     }
     for task_name, (good_pair, bad_pair) in test_pairs.items():
             score_good = grader(good_action, good_gt)
             score_bad = grader(bad_action, bad_gt)
+        assert score_good > score_bad
 def test_registry_5_boundary_confidence_values():
     action_0 = {"decision": "approve", "confidence": 0.0, "labels": []}
     action_100 = {"decision": "approve", "confidence": 1.0, "labels": []}
     gt = {"decision": "approve", "labels": [], "is_harmful": False}
             score_0 = grader(action_0, gt)
             score_100 = grader(action_100, gt)
+        assert 0.0 <= score_0 <= 1.0
+        assert 0.0 <= score_100 <= 1.0
+        assert score_100 >= score_0