File size: 2,683 Bytes
0584798 53d5d9f 0584798 1519226 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 1519226 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 0584798 1519226 53d5d9f 1519226 53d5d9f 1519226 53d5d9f 1519226 0584798 53d5d9f 0584798 53d5d9f 0584798 53d5d9f 0584798 1519226 0584798 1519226 0584798 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | {
"accepted_accuracy": 0.32,
"accepted_coverage": 0.8013,
"accuracy": 0.2564,
"count": 156,
"dataset_path": "/content/agentic-intent-classifier/data/iab_benchmark.jsonl",
"difficulty_breakdown": {
"easy": {
"accepted_accuracy": 0.35,
"accepted_coverage": 0.7692,
"accuracy": 0.2692,
"count": 52,
"fallback_rate": 0.2308,
"macro_f1": 0.153
},
"hard": {
"accepted_accuracy": 0.275,
"accepted_coverage": 0.7692,
"accuracy": 0.2115,
"count": 52,
"fallback_rate": 0.2308,
"macro_f1": 0.1108
},
"medium": {
"accepted_accuracy": 0.3333,
"accepted_coverage": 0.8654,
"accuracy": 0.2885,
"count": 52,
"fallback_rate": 0.1346,
"macro_f1": 0.1491
}
},
"fallback_rate": 0.1987,
"head": "iab_content",
"macro_f1": 0.105,
"primary_source": "supervised_classifier",
"suite": "difficulty_benchmark",
"tier_metrics": {
"average_prediction_depth": 1.7564,
"error_buckets": {
"exact_match": 40,
"parent_safe_stop": 11,
"right_tier1_wrong_tier2": 58,
"wrong_deep_leaf": 1,
"wrong_tier1": 46
},
"exact_path_accuracy": 0.2564,
"parent_safe_accuracy": 0.6218,
"tier1_accuracy": 0.7051,
"tier2_accuracy": 0.3333,
"tier3_accuracy": 0.2315,
"tier4_accuracy": 0.0
},
"view_metrics": {
"classifier": {
"average_prediction_depth": 1.7564,
"error_buckets": {
"exact_match": 40,
"parent_safe_stop": 11,
"right_tier1_wrong_tier2": 58,
"wrong_deep_leaf": 1,
"wrong_tier1": 46
},
"exact_path_accuracy": 0.2564,
"parent_safe_accuracy": 0.6218,
"tier1_accuracy": 0.7051,
"tier2_accuracy": 0.3333,
"tier3_accuracy": 0.2315,
"tier4_accuracy": 0.0
},
"combined_path": {
"average_prediction_depth": 1.7564,
"error_buckets": {
"exact_match": 40,
"parent_safe_stop": 11,
"right_tier1_wrong_tier2": 58,
"wrong_deep_leaf": 1,
"wrong_tier1": 46
},
"exact_path_accuracy": 0.2564,
"fallback_overuse_count": 13,
"fallback_rate": 0.0833,
"parent_safe_accuracy": 0.6218,
"tier1_accuracy": 0.7051,
"tier2_accuracy": 0.3333,
"tier3_accuracy": 0.2315,
"tier4_accuracy": 0.0
},
"disagreements": {
"classifier_vs_combined": 0
},
"shadow_embedding_retrieval": {
"hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).",
"reason": "disabled_by_default",
"skipped": true
}
}
}
|