{ "accepted_accuracy": 0.32, "accepted_coverage": 0.8013, "accuracy": 0.2564, "count": 156, "dataset_path": "/content/agentic-intent-classifier/data/iab_benchmark.jsonl", "difficulty_breakdown": { "easy": { "accepted_accuracy": 0.35, "accepted_coverage": 0.7692, "accuracy": 0.2692, "count": 52, "fallback_rate": 0.2308, "macro_f1": 0.153 }, "hard": { "accepted_accuracy": 0.275, "accepted_coverage": 0.7692, "accuracy": 0.2115, "count": 52, "fallback_rate": 0.2308, "macro_f1": 0.1108 }, "medium": { "accepted_accuracy": 0.3333, "accepted_coverage": 0.8654, "accuracy": 0.2885, "count": 52, "fallback_rate": 0.1346, "macro_f1": 0.1491 } }, "fallback_rate": 0.1987, "head": "iab_content", "macro_f1": 0.105, "primary_source": "supervised_classifier", "suite": "difficulty_benchmark", "tier_metrics": { "average_prediction_depth": 1.7564, "error_buckets": { "exact_match": 40, "parent_safe_stop": 11, "right_tier1_wrong_tier2": 58, "wrong_deep_leaf": 1, "wrong_tier1": 46 }, "exact_path_accuracy": 0.2564, "parent_safe_accuracy": 0.6218, "tier1_accuracy": 0.7051, "tier2_accuracy": 0.3333, "tier3_accuracy": 0.2315, "tier4_accuracy": 0.0 }, "view_metrics": { "classifier": { "average_prediction_depth": 1.7564, "error_buckets": { "exact_match": 40, "parent_safe_stop": 11, "right_tier1_wrong_tier2": 58, "wrong_deep_leaf": 1, "wrong_tier1": 46 }, "exact_path_accuracy": 0.2564, "parent_safe_accuracy": 0.6218, "tier1_accuracy": 0.7051, "tier2_accuracy": 0.3333, "tier3_accuracy": 0.2315, "tier4_accuracy": 0.0 }, "combined_path": { "average_prediction_depth": 1.7564, "error_buckets": { "exact_match": 40, "parent_safe_stop": 11, "right_tier1_wrong_tier2": 58, "wrong_deep_leaf": 1, "wrong_tier1": 46 }, "exact_path_accuracy": 0.2564, "fallback_overuse_count": 13, "fallback_rate": 0.0833, "parent_safe_accuracy": 0.6218, "tier1_accuracy": 0.7051, "tier2_accuracy": 0.3333, "tier3_accuracy": 0.2315, "tier4_accuracy": 0.0 }, "disagreements": { "classifier_vs_combined": 0 }, "shadow_embedding_retrieval": { "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", "reason": "disabled_by_default", "skipped": true } } }