ANI00 committed on
Commit
0261d2e
·
verified ·
1 Parent(s): 1133d32

Add: Task and grader registry for validation

Browse files
Files changed (1) hide show
  1. test/test.py +24 -148
test/test.py CHANGED
@@ -5,7 +5,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
5
  import pytest
6
  from server.models import ModerationAction, ContentObservation, StepResult, ResetResult, EnvState
7
  from server.env import ContentModerationEnv
8
- from server.graders import grade_text_spam, grade_content_moderation, grade_deepfake, grade_misinformation, GRADERS
9
  from server.tasks import TASKS, TASK_NAMES
10
 
11
 
@@ -243,57 +243,43 @@ def test_deepfake_obs_has_image_description():
243
  assert obs.content_type == "multimodal"
244
 
245
 
246
- # ========== COMPREHENSIVE GRADER TESTS ==========
247
- # 5+ tests per grader (20+ total)
248
-
249
- # --- TEXT SPAM GRADER (5 tests) ---
250
-
251
  def test_text_spam_1_correct_reject():
252
- """Test 1: Correct spam rejection with high confidence"""
253
  gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
254
  r = grade_text_spam(
255
  make_action_dict("reject", confidence=0.9, labels=["spam"]), gt
256
  )
257
  assert 0.0 <= r <= 1.0
258
- assert r > 0.6, f"Expected > 0.6, got {r}"
259
- print(f"✓ test_text_spam_1_correct_reject: {r}")
260
 
261
 
262
  def test_text_spam_2_correct_approve():
263
- """Test 2: Correct legitimate email approval"""
264
  gt = {"decision": "approve", "labels": [], "is_harmful": False}
265
  r = grade_text_spam(
266
  make_action_dict("approve", confidence=0.85, labels=[]), gt
267
  )
268
  assert 0.0 <= r <= 1.0
269
- assert r > 0.6, f"Expected > 0.6, got {r}"
270
- print(f"✓ test_text_spam_2_correct_approve: {r}")
271
 
272
 
273
  def test_text_spam_3_wrong_decision_penalty():
274
- """Test 3: Wrong decision gets penalized"""
275
  gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
276
  r = grade_text_spam(
277
  make_action_dict("approve", confidence=0.9, labels=[]), gt
278
  )
279
  assert 0.0 <= r <= 1.0
280
- assert r < 0.4, f"Expected < 0.4, got {r}"
281
- print(f"✓ test_text_spam_3_wrong_decision_penalty: {r}")
282
 
283
 
284
  def test_text_spam_4_escalate_partial_credit():
285
- """Test 4: Escalate action gets partial credit"""
286
  gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
287
  r = grade_text_spam(
288
  make_action_dict("escalate", confidence=0.5, labels=["spam"]), gt
289
  )
290
  assert 0.0 <= r <= 1.0
291
- assert 0.15 <= r <= 0.5, f"Expected 0.15-0.5, got {r}"
292
- print(f"✓ test_text_spam_4_escalate_partial_credit: {r}")
293
 
294
 
295
  def test_text_spam_5_low_confidence_penalty():
296
- """Test 5: Low confidence penalizes score"""
297
  gt_reject = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
298
  r_high = grade_text_spam(
299
  make_action_dict("reject", confidence=0.9, labels=["spam"]), gt_reject
@@ -303,36 +289,28 @@ def test_text_spam_5_low_confidence_penalty():
303
  )
304
  assert 0.0 <= r_high <= 1.0
305
  assert 0.0 <= r_low <= 1.0
306
- assert r_high > r_low, f"High confidence ({r_high}) should beat low ({r_low})"
307
- print(f"✓ test_text_spam_5_low_confidence_penalty: high={r_high}, low={r_low}")
308
-
309
 
310
- # --- CONTENT MODERATION GRADER (5 tests) ---
311
 
312
  def test_content_mod_1_correct_reject_multilabel():
313
- """Test 1: Correct multi-label rejection"""
314
  gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
315
  r = grade_content_moderation(
316
  make_action_dict("reject", confidence=0.9, labels=["hate_speech", "harassment"]), gt
317
  )
318
  assert 0.0 <= r <= 1.0
319
- assert r > 0.6, f"Expected > 0.6, got {r}"
320
- print(f"✓ test_content_mod_1_correct_reject_multilabel: {r}")
321
 
322
 
323
  def test_content_mod_2_correct_approve():
324
- """Test 2: Correct benign content approval"""
325
  gt = {"decision": "approve", "labels": [], "is_harmful": False}
326
  r = grade_content_moderation(
327
  make_action_dict("approve", confidence=0.8, labels=[]), gt
328
  )
329
  assert 0.0 <= r <= 1.0
330
- assert r > 0.6, f"Expected > 0.6, got {r}"
331
- print(f"✓ test_content_mod_2_correct_approve: {r}")
332
 
333
 
334
  def test_content_mod_3_partial_label_match():
335
- """Test 3: Partial label match gets partial credit"""
336
  gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
337
  r_partial = grade_content_moderation(
338
  make_action_dict("reject", confidence=0.7, labels=["hate_speech"]), gt
@@ -342,23 +320,19 @@ def test_content_mod_3_partial_label_match():
342
  )
343
  assert 0.0 <= r_partial <= 1.0
344
  assert 0.0 <= r_full <= 1.0
345
- assert r_full > r_partial, f"Full match ({r_full}) should beat partial ({r_partial})"
346
- print(f"✓ test_content_mod_3_partial_label_match: partial={r_partial}, full={r_full}")
347
 
348
 
349
  def test_content_mod_4_flag_on_harmful():
350
- """Test 4: Flag action on harmful content gets partial credit"""
351
  gt = {"decision": "reject", "labels": ["violence"], "is_harmful": True}
352
  r = grade_content_moderation(
353
  make_action_dict("flag", confidence=0.5, labels=["violence"]), gt
354
  )
355
  assert 0.0 <= r <= 1.0
356
- assert 0.2 <= r <= 0.6, f"Expected 0.2-0.6 for flag, got {r}"
357
- print(f"✓ test_content_mod_4_flag_on_harmful: {r}")
358
 
359
 
360
  def test_content_mod_5_wrong_labels_hurt_score():
361
- """Test 5: Wrong labels hurt the score"""
362
  gt = {"decision": "reject", "labels": ["hate_speech"], "is_harmful": True}
363
  r_correct_labels = grade_content_moderation(
364
  make_action_dict("reject", confidence=0.9, labels=["hate_speech"]), gt
@@ -368,58 +342,46 @@ def test_content_mod_5_wrong_labels_hurt_score():
368
  )
369
  assert 0.0 <= r_correct_labels <= 1.0
370
  assert 0.0 <= r_wrong_labels <= 1.0
371
- assert r_correct_labels > r_wrong_labels, f"Correct labels ({r_correct_labels}) should beat wrong ({r_wrong_labels})"
372
- print(f"✓ test_content_mod_5_wrong_labels_hurt_score: correct={r_correct_labels}, wrong={r_wrong_labels}")
373
 
374
 
375
- # --- DEEPFAKE GRADER (5 tests) ---
376
-
377
  def test_deepfake_1_correct_detection():
378
- """Test 1: Correct deepfake detection"""
379
  gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
380
  r = grade_deepfake(
381
  make_action_dict("reject", confidence=0.95, labels=["deepfake"]), gt, detector_score=0.85
382
  )
383
  assert 0.0 <= r <= 1.0
384
- assert r > 0.6, f"Expected > 0.6, got {r}"
385
- print(f"✓ test_deepfake_1_correct_detection: {r}")
386
 
387
 
388
  def test_deepfake_2_correct_authentic():
389
- """Test 2: Correct authentic content approval"""
390
  gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
391
  r = grade_deepfake(
392
  make_action_dict("approve", confidence=0.9, labels=[]), gt, detector_score=0.05
393
  )
394
  assert 0.0 <= r <= 1.0
395
- assert r > 0.6, f"Expected > 0.6, got {r}"
396
- print(f"✓ test_deepfake_2_correct_authentic: {r}")
397
 
398
 
399
  def test_deepfake_3_false_positive_penalty():
400
- """Test 3: False positive (flagging authentic content) gets penalized"""
401
  gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
402
  r = grade_deepfake(
403
  make_action_dict("reject", confidence=0.8, labels=["deepfake"]), gt, detector_score=0.1
404
  )
405
  assert 0.0 <= r <= 1.0
406
- assert r < 0.4, f"Expected < 0.4 for false positive, got {r}"
407
- print(f"✓ test_deepfake_3_false_positive_penalty: {r}")
408
 
409
 
410
  def test_deepfake_4_escalate_uncertain():
411
- """Test 4: Escalate on uncertain deepfake gets partial credit"""
412
  gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
413
  r = grade_deepfake(
414
  make_action_dict("escalate", confidence=0.5, labels=["deepfake"]), gt, detector_score=0.5
415
  )
416
  assert 0.0 <= r <= 1.0
417
- assert 0.15 <= r <= 0.5, f"Expected 0.15-0.5 for escalate, got {r}"
418
- print(f"✓ test_deepfake_4_escalate_uncertain: {r}")
419
 
420
 
421
  def test_deepfake_5_missing_label_hurts():
422
- """Test 5: Missing deepfake label hurts score"""
423
  gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
424
  r_missing_label = grade_deepfake(
425
  make_action_dict("reject", confidence=0.7, labels=[]), gt, detector_score=0.8
@@ -429,91 +391,21 @@ def test_deepfake_5_missing_label_hurts():
429
  )
430
  assert 0.0 <= r_missing_label <= 1.0
431
  assert 0.0 <= r_with_label <= 1.0
432
- assert r_with_label > r_missing_label, f"With label ({r_with_label}) should beat missing ({r_missing_label})"
433
- print(f"✓ test_deepfake_5_missing_label_hurts: with_label={r_with_label}, missing={r_missing_label}")
434
-
435
-
436
- # --- MISINFORMATION GRADER (5 tests) ---
437
-
438
- def test_misinformation_1_correct_rejection():
439
- """Test 1: Correct misinformation detection"""
440
- gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
441
- r = grade_misinformation(
442
- make_action_dict("reject", confidence=0.9, labels=["misinformation"]), gt
443
- )
444
- assert 0.0 <= r <= 1.0
445
- assert r > 0.6, f"Expected > 0.6, got {r}"
446
- print(f"✓ test_misinformation_1_correct_rejection: {r}")
447
 
448
 
449
- def test_misinformation_2_correct_approval():
450
- """Test 2: Correct factual content approval"""
451
- gt = {"decision": "approve", "labels": [], "is_harmful": False}
452
- r = grade_misinformation(
453
- make_action_dict("approve", confidence=0.85, labels=[]), gt
454
- )
455
- assert 0.0 <= r <= 1.0
456
- assert r > 0.6, f"Expected > 0.6, got {r}"
457
- print(f"✓ test_misinformation_2_correct_approval: {r}")
458
-
459
-
460
- def test_misinformation_3_flag_on_uncertain():
461
- """Test 3: Flag on uncertain misinformation gets partial credit"""
462
- gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
463
- r = grade_misinformation(
464
- make_action_dict("flag", confidence=0.5, labels=["misinformation"]), gt
465
- )
466
- assert 0.0 <= r <= 1.0
467
- assert 0.15 <= r <= 0.5, f"Expected 0.15-0.5 for flag, got {r}"
468
- print(f"✓ test_misinformation_3_flag_on_uncertain: {r}")
469
-
470
-
471
- def test_misinformation_4_wrong_decision():
472
- """Test 4: Wrong decision gets penalized"""
473
- gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
474
- r = grade_misinformation(
475
- make_action_dict("approve", confidence=0.8, labels=[]), gt
476
- )
477
- assert 0.0 <= r <= 1.0
478
- assert r < 0.4, f"Expected < 0.4 for wrong decision, got {r}"
479
- print(f"✓ test_misinformation_4_wrong_decision: {r}")
480
-
481
-
482
- def test_misinformation_5_confidence_calibration():
483
- """Test 5: High confidence on wrong answer gets extra penalty"""
484
- gt = {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}
485
- r_low_conf = grade_misinformation(
486
- make_action_dict("approve", confidence=0.1, labels=[]), gt
487
- )
488
- r_high_conf = grade_misinformation(
489
- make_action_dict("approve", confidence=0.95, labels=[]), gt
490
- )
491
- assert 0.0 <= r_low_conf <= 1.0
492
- assert 0.0 <= r_high_conf <= 1.0
493
- assert r_low_conf > r_high_conf, f"Low confidence ({r_low_conf}) should beat high bad confidence ({r_high_conf})"
494
- print(f"✓ test_misinformation_5_confidence_calibration: low_conf={r_low_conf}, high_conf={r_high_conf}")
495
-
496
-
497
- # --- GRADER REGISTRY VALIDATION (5 tests) ---
498
-
499
- def test_registry_1_all_4_graders_exist():
500
- """Test 1: All 4 graders registered"""
501
  assert "text_spam" in GRADERS
502
  assert "content_moderation" in GRADERS
503
  assert "deepfake_detection" in GRADERS
504
- assert "misinformation_detection" in GRADERS
505
- print(f"✓ test_registry_1_all_4_graders_exist: {len(GRADERS)} graders found")
506
 
507
 
508
  def test_registry_2_all_graders_callable():
509
- """Test 2: All graders are callable functions"""
510
  for task_name, grader in GRADERS.items():
511
- assert callable(grader), f"Grader for {task_name} is not callable"
512
- print(f"✓ test_registry_2_all_graders_callable: All {len(GRADERS)} graders are callable")
513
 
514
 
515
  def test_registry_3_all_graders_return_valid_scores():
516
- """Test 3: All graders return scores in valid range [0.0, 1.0]"""
517
  test_cases = {
518
  "text_spam": (
519
  {"decision": "approve", "confidence": 0.5, "labels": []},
@@ -530,11 +422,6 @@ def test_registry_3_all_graders_return_valid_scores():
530
  {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False},
531
  0.5
532
  ),
533
- "misinformation_detection": (
534
- {"decision": "approve", "confidence": 0.5, "labels": []},
535
- {"decision": "approve", "labels": [], "is_harmful": False},
536
- None
537
- ),
538
  }
539
 
540
  for task_name, (action, ground_truth, detector_score) in test_cases.items():
@@ -543,13 +430,11 @@ def test_registry_3_all_graders_return_valid_scores():
543
  score = grader(action, ground_truth, detector_score)
544
  else:
545
  score = grader(action, ground_truth)
546
- assert isinstance(score, (int, float)), f"{task_name} returned non-numeric score"
547
- assert 0.0 <= score <= 1.0, f"{task_name} returned score outside [0.0, 1.0]: {score}"
548
- print(f"✓ test_registry_3_all_graders_return_valid_scores: All scores in [0.0, 1.0]")
549
 
550
 
551
  def test_registry_4_graders_distinguish_performance():
552
- """Test 4: Graders differentiate between good and bad actions"""
553
  test_pairs = {
554
  "text_spam": (
555
  ({"decision": "reject", "confidence": 0.9, "labels": ["spam"]},
@@ -569,12 +454,6 @@ def test_registry_4_graders_distinguish_performance():
569
  ({"decision": "approve", "confidence": 0.9, "labels": []},
570
  {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True})
571
  ),
572
- "misinformation_detection": (
573
- ({"decision": "reject", "confidence": 0.9, "labels": ["misinformation"]},
574
- {"decision": "reject", "labels": ["misinformation"], "is_harmful": True}),
575
- ({"decision": "approve", "confidence": 0.9, "labels": []},
576
- {"decision": "reject", "labels": ["misinformation"], "is_harmful": True})
577
- ),
578
  }
579
 
580
  for task_name, (good_pair, bad_pair) in test_pairs.items():
@@ -589,12 +468,10 @@ def test_registry_4_graders_distinguish_performance():
589
  score_good = grader(good_action, good_gt)
590
  score_bad = grader(bad_action, bad_gt)
591
 
592
- assert score_good > score_bad, f"{task_name}: good action ({score_good}) should score > bad action ({score_bad})"
593
- print(f"✓ test_registry_4_graders_distinguish_performance: All graders differentiate good/bad")
594
 
595
 
596
  def test_registry_5_boundary_confidence_values():
597
- """Test 5: Graders handle boundary confidence values (0.0, 1.0)"""
598
  action_0 = {"decision": "approve", "confidence": 0.0, "labels": []}
599
  action_100 = {"decision": "approve", "confidence": 1.0, "labels": []}
600
  gt = {"decision": "approve", "labels": [], "is_harmful": False}
@@ -607,8 +484,7 @@ def test_registry_5_boundary_confidence_values():
607
  score_0 = grader(action_0, gt)
608
  score_100 = grader(action_100, gt)
609
 
610
- assert 0.0 <= score_0 <= 1.0, f"{task_name} failed on confidence=0.0"
611
- assert 0.0 <= score_100 <= 1.0, f"{task_name} failed on confidence=1.0"
612
- assert score_100 >= score_0, f"{task_name}: high confidence should >= low confidence"
613
- print(f"✓ test_registry_5_boundary_confidence_values: All graders handle boundaries")
614
 
 
5
  import pytest
6
  from server.models import ModerationAction, ContentObservation, StepResult, ResetResult, EnvState
7
  from server.env import ContentModerationEnv
8
+ from server.graders import grade_text_spam, grade_content_moderation, grade_deepfake, GRADERS
9
  from server.tasks import TASKS, TASK_NAMES
10
 
11
 
 
243
  assert obs.content_type == "multimodal"
244
 
245
 
 
 
 
 
 
246
  def test_text_spam_1_correct_reject():
 
247
  gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
248
  r = grade_text_spam(
249
  make_action_dict("reject", confidence=0.9, labels=["spam"]), gt
250
  )
251
  assert 0.0 <= r <= 1.0
252
+ assert r > 0.6
 
253
 
254
 
255
  def test_text_spam_2_correct_approve():
 
256
  gt = {"decision": "approve", "labels": [], "is_harmful": False}
257
  r = grade_text_spam(
258
  make_action_dict("approve", confidence=0.85, labels=[]), gt
259
  )
260
  assert 0.0 <= r <= 1.0
261
+ assert r > 0.6
 
262
 
263
 
264
  def test_text_spam_3_wrong_decision_penalty():
 
265
  gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
266
  r = grade_text_spam(
267
  make_action_dict("approve", confidence=0.9, labels=[]), gt
268
  )
269
  assert 0.0 <= r <= 1.0
270
+ assert r < 0.4
 
271
 
272
 
273
  def test_text_spam_4_escalate_partial_credit():
 
274
  gt = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
275
  r = grade_text_spam(
276
  make_action_dict("escalate", confidence=0.5, labels=["spam"]), gt
277
  )
278
  assert 0.0 <= r <= 1.0
279
+ assert 0.15 <= r <= 0.5
 
280
 
281
 
282
  def test_text_spam_5_low_confidence_penalty():
 
283
  gt_reject = {"decision": "reject", "labels": ["spam"], "is_harmful": True}
284
  r_high = grade_text_spam(
285
  make_action_dict("reject", confidence=0.9, labels=["spam"]), gt_reject
 
289
  )
290
  assert 0.0 <= r_high <= 1.0
291
  assert 0.0 <= r_low <= 1.0
292
+ assert r_high > r_low
 
 
293
 
 
294
 
295
  def test_content_mod_1_correct_reject_multilabel():
 
296
  gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
297
  r = grade_content_moderation(
298
  make_action_dict("reject", confidence=0.9, labels=["hate_speech", "harassment"]), gt
299
  )
300
  assert 0.0 <= r <= 1.0
301
+ assert r > 0.6
 
302
 
303
 
304
  def test_content_mod_2_correct_approve():
 
305
  gt = {"decision": "approve", "labels": [], "is_harmful": False}
306
  r = grade_content_moderation(
307
  make_action_dict("approve", confidence=0.8, labels=[]), gt
308
  )
309
  assert 0.0 <= r <= 1.0
310
+ assert r > 0.6
 
311
 
312
 
313
  def test_content_mod_3_partial_label_match():
 
314
  gt = {"decision": "reject", "labels": ["hate_speech", "harassment"], "is_harmful": True}
315
  r_partial = grade_content_moderation(
316
  make_action_dict("reject", confidence=0.7, labels=["hate_speech"]), gt
 
320
  )
321
  assert 0.0 <= r_partial <= 1.0
322
  assert 0.0 <= r_full <= 1.0
323
+ assert r_full > r_partial
 
324
 
325
 
326
  def test_content_mod_4_flag_on_harmful():
 
327
  gt = {"decision": "reject", "labels": ["violence"], "is_harmful": True}
328
  r = grade_content_moderation(
329
  make_action_dict("flag", confidence=0.5, labels=["violence"]), gt
330
  )
331
  assert 0.0 <= r <= 1.0
332
+ assert 0.2 <= r <= 0.6
 
333
 
334
 
335
  def test_content_mod_5_wrong_labels_hurt_score():
 
336
  gt = {"decision": "reject", "labels": ["hate_speech"], "is_harmful": True}
337
  r_correct_labels = grade_content_moderation(
338
  make_action_dict("reject", confidence=0.9, labels=["hate_speech"]), gt
 
342
  )
343
  assert 0.0 <= r_correct_labels <= 1.0
344
  assert 0.0 <= r_wrong_labels <= 1.0
345
+ assert r_correct_labels > r_wrong_labels
 
346
 
347
 
 
 
348
  def test_deepfake_1_correct_detection():
 
349
  gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
350
  r = grade_deepfake(
351
  make_action_dict("reject", confidence=0.95, labels=["deepfake"]), gt, detector_score=0.85
352
  )
353
  assert 0.0 <= r <= 1.0
354
+ assert r > 0.6
 
355
 
356
 
357
  def test_deepfake_2_correct_authentic():
 
358
  gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
359
  r = grade_deepfake(
360
  make_action_dict("approve", confidence=0.9, labels=[]), gt, detector_score=0.05
361
  )
362
  assert 0.0 <= r <= 1.0
363
+ assert r > 0.6
 
364
 
365
 
366
  def test_deepfake_3_false_positive_penalty():
 
367
  gt = {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False}
368
  r = grade_deepfake(
369
  make_action_dict("reject", confidence=0.8, labels=["deepfake"]), gt, detector_score=0.1
370
  )
371
  assert 0.0 <= r <= 1.0
372
+ assert r < 0.4
 
373
 
374
 
375
  def test_deepfake_4_escalate_uncertain():
 
376
  gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
377
  r = grade_deepfake(
378
  make_action_dict("escalate", confidence=0.5, labels=["deepfake"]), gt, detector_score=0.5
379
  )
380
  assert 0.0 <= r <= 1.0
381
+ assert 0.15 <= r <= 0.5
 
382
 
383
 
384
  def test_deepfake_5_missing_label_hurts():
 
385
  gt = {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True}
386
  r_missing_label = grade_deepfake(
387
  make_action_dict("reject", confidence=0.7, labels=[]), gt, detector_score=0.8
 
391
  )
392
  assert 0.0 <= r_missing_label <= 1.0
393
  assert 0.0 <= r_with_label <= 1.0
394
+ assert r_with_label > r_missing_label
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
 
397
+ def test_registry_1_all_3_graders_exist():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  assert "text_spam" in GRADERS
399
  assert "content_moderation" in GRADERS
400
  assert "deepfake_detection" in GRADERS
 
 
401
 
402
 
403
  def test_registry_2_all_graders_callable():
 
404
  for task_name, grader in GRADERS.items():
405
+ assert callable(grader)
 
406
 
407
 
408
  def test_registry_3_all_graders_return_valid_scores():
 
409
  test_cases = {
410
  "text_spam": (
411
  {"decision": "approve", "confidence": 0.5, "labels": []},
 
422
  {"decision": "approve", "labels": [], "is_deepfake": False, "is_harmful": False},
423
  0.5
424
  ),
 
 
 
 
 
425
  }
426
 
427
  for task_name, (action, ground_truth, detector_score) in test_cases.items():
 
430
  score = grader(action, ground_truth, detector_score)
431
  else:
432
  score = grader(action, ground_truth)
433
+ assert isinstance(score, (int, float))
434
+ assert 0.0 <= score <= 1.0
 
435
 
436
 
437
  def test_registry_4_graders_distinguish_performance():
 
438
  test_pairs = {
439
  "text_spam": (
440
  ({"decision": "reject", "confidence": 0.9, "labels": ["spam"]},
 
454
  ({"decision": "approve", "confidence": 0.9, "labels": []},
455
  {"decision": "reject", "labels": ["deepfake"], "is_deepfake": True, "is_harmful": True})
456
  ),
 
 
 
 
 
 
457
  }
458
 
459
  for task_name, (good_pair, bad_pair) in test_pairs.items():
 
468
  score_good = grader(good_action, good_gt)
469
  score_bad = grader(bad_action, bad_gt)
470
 
471
+ assert score_good > score_bad
 
472
 
473
 
474
  def test_registry_5_boundary_confidence_values():
 
475
  action_0 = {"decision": "approve", "confidence": 0.0, "labels": []}
476
  action_100 = {"decision": "approve", "confidence": 1.0, "labels": []}
477
  gt = {"decision": "approve", "labels": [], "is_harmful": False}
 
484
  score_0 = grader(action_0, gt)
485
  score_100 = grader(action_100, gt)
486
 
487
+ assert 0.0 <= score_0 <= 1.0
488
+ assert 0.0 <= score_100 <= 1.0
489
+ assert score_100 >= score_0
 
490