agentic-intent-classifier / artifacts /evaluation /latest /iab_content_difficulty_benchmark_report.json

Upload folder using huggingface_hub

53d5d9f verified 6 days ago

2.68 kB

	{
	"accepted_accuracy": 0.32,
	"accepted_coverage": 0.8013,
	"accuracy": 0.2564,
	"count": 156,
	"dataset_path": "/content/agentic-intent-classifier/data/iab_benchmark.jsonl",
	"difficulty_breakdown": {
	"easy": {
	"accepted_accuracy": 0.35,
	"accepted_coverage": 0.7692,
	"accuracy": 0.2692,
	"count": 52,
	"fallback_rate": 0.2308,
	"macro_f1": 0.153
	},
	"hard": {
	"accepted_accuracy": 0.275,
	"accepted_coverage": 0.7692,
	"accuracy": 0.2115,
	"count": 52,
	"fallback_rate": 0.2308,
	"macro_f1": 0.1108
	},
	"medium": {
	"accepted_accuracy": 0.3333,
	"accepted_coverage": 0.8654,
	"accuracy": 0.2885,
	"count": 52,
	"fallback_rate": 0.1346,
	"macro_f1": 0.1491
	}
	},
	"fallback_rate": 0.1987,
	"head": "iab_content",
	"macro_f1": 0.105,
	"primary_source": "supervised_classifier",
	"suite": "difficulty_benchmark",
	"tier_metrics": {
	"average_prediction_depth": 1.7564,
	"error_buckets": {
	"exact_match": 40,
	"parent_safe_stop": 11,
	"right_tier1_wrong_tier2": 58,
	"wrong_deep_leaf": 1,
	"wrong_tier1": 46
	},
	"exact_path_accuracy": 0.2564,
	"parent_safe_accuracy": 0.6218,
	"tier1_accuracy": 0.7051,
	"tier2_accuracy": 0.3333,
	"tier3_accuracy": 0.2315,
	"tier4_accuracy": 0.0
	},
	"view_metrics": {
	"classifier": {
	"average_prediction_depth": 1.7564,
	"error_buckets": {
	"exact_match": 40,
	"parent_safe_stop": 11,
	"right_tier1_wrong_tier2": 58,
	"wrong_deep_leaf": 1,
	"wrong_tier1": 46
	},
	"exact_path_accuracy": 0.2564,
	"parent_safe_accuracy": 0.6218,
	"tier1_accuracy": 0.7051,
	"tier2_accuracy": 0.3333,
	"tier3_accuracy": 0.2315,
	"tier4_accuracy": 0.0
	},
	"combined_path": {
	"average_prediction_depth": 1.7564,
	"error_buckets": {
	"exact_match": 40,
	"parent_safe_stop": 11,
	"right_tier1_wrong_tier2": 58,
	"wrong_deep_leaf": 1,
	"wrong_tier1": 46
	},
	"exact_path_accuracy": 0.2564,
	"fallback_overuse_count": 13,
	"fallback_rate": 0.0833,
	"parent_safe_accuracy": 0.6218,
	"tier1_accuracy": 0.7051,
	"tier2_accuracy": 0.3333,
	"tier3_accuracy": 0.2315,
	"tier4_accuracy": 0.0
	},
	"disagreements": {
	"classifier_vs_combined": 0
	},
	"shadow_embedding_retrieval": {
	"hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).",
	"reason": "disabled_by_default",
	"skipped": true
	}
	}
	}