agentic-intent-classifier / artifacts /evaluation /latest /iab_content_difficulty_benchmark_report.json
| { | |
| "accepted_accuracy": 0.32, | |
| "accepted_coverage": 0.8013, | |
| "accuracy": 0.2564, | |
| "count": 156, | |
| "dataset_path": "/content/agentic-intent-classifier/data/iab_benchmark.jsonl", | |
| "difficulty_breakdown": { | |
| "easy": { | |
| "accepted_accuracy": 0.35, | |
| "accepted_coverage": 0.7692, | |
| "accuracy": 0.2692, | |
| "count": 52, | |
| "fallback_rate": 0.2308, | |
| "macro_f1": 0.153 | |
| }, | |
| "hard": { | |
| "accepted_accuracy": 0.275, | |
| "accepted_coverage": 0.7692, | |
| "accuracy": 0.2115, | |
| "count": 52, | |
| "fallback_rate": 0.2308, | |
| "macro_f1": 0.1108 | |
| }, | |
| "medium": { | |
| "accepted_accuracy": 0.3333, | |
| "accepted_coverage": 0.8654, | |
| "accuracy": 0.2885, | |
| "count": 52, | |
| "fallback_rate": 0.1346, | |
| "macro_f1": 0.1491 | |
| } | |
| }, | |
| "fallback_rate": 0.1987, | |
| "head": "iab_content", | |
| "macro_f1": 0.105, | |
| "primary_source": "supervised_classifier", | |
| "suite": "difficulty_benchmark", | |
| "tier_metrics": { | |
| "average_prediction_depth": 1.7564, | |
| "error_buckets": { | |
| "exact_match": 40, | |
| "parent_safe_stop": 11, | |
| "right_tier1_wrong_tier2": 58, | |
| "wrong_deep_leaf": 1, | |
| "wrong_tier1": 46 | |
| }, | |
| "exact_path_accuracy": 0.2564, | |
| "parent_safe_accuracy": 0.6218, | |
| "tier1_accuracy": 0.7051, | |
| "tier2_accuracy": 0.3333, | |
| "tier3_accuracy": 0.2315, | |
| "tier4_accuracy": 0.0 | |
| }, | |
| "view_metrics": { | |
| "classifier": { | |
| "average_prediction_depth": 1.7564, | |
| "error_buckets": { | |
| "exact_match": 40, | |
| "parent_safe_stop": 11, | |
| "right_tier1_wrong_tier2": 58, | |
| "wrong_deep_leaf": 1, | |
| "wrong_tier1": 46 | |
| }, | |
| "exact_path_accuracy": 0.2564, | |
| "parent_safe_accuracy": 0.6218, | |
| "tier1_accuracy": 0.7051, | |
| "tier2_accuracy": 0.3333, | |
| "tier3_accuracy": 0.2315, | |
| "tier4_accuracy": 0.0 | |
| }, | |
| "combined_path": { | |
| "average_prediction_depth": 1.7564, | |
| "error_buckets": { | |
| "exact_match": 40, | |
| "parent_safe_stop": 11, | |
| "right_tier1_wrong_tier2": 58, | |
| "wrong_deep_leaf": 1, | |
| "wrong_tier1": 46 | |
| }, | |
| "exact_path_accuracy": 0.2564, | |
| "fallback_overuse_count": 13, | |
| "fallback_rate": 0.0833, | |
| "parent_safe_accuracy": 0.6218, | |
| "tier1_accuracy": 0.7051, | |
| "tier2_accuracy": 0.3333, | |
| "tier3_accuracy": 0.2315, | |
| "tier4_accuracy": 0.0 | |
| }, | |
| "disagreements": { | |
| "classifier_vs_combined": 0 | |
| }, | |
| "shadow_embedding_retrieval": { | |
| "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", | |
| "reason": "disabled_by_default", | |
| "skipped": true | |
| } | |
| } | |
| } | |