coding-router v6: ONNX (int8+fp32) + PyTorch weights, tokenizer, calibration, eval results

e07ba76 verified 4 days ago

6.02 kB

	{
	"per_head": {
	"complexity": {
	"accuracy": 0.6954,
	"macro_f1": 0.7011,
	"per_label": {
	"easy": {
	"precision": 0.7258,
	"recall": 0.75,
	"f1": 0.7377,
	"support": 60
	},
	"medium": {
	"precision": 0.6232,
	"recall": 0.6418,
	"f1": 0.6324,
	"support": 67
	},
	"hard": {
	"precision": 0.7674,
	"recall": 0.7021,
	"f1": 0.7333,
	"support": 47
	}
	},
	"confusion_matrix": [
	[
	45,
	15,
	0
	],
	[
	14,
	43,
	10
	],
	[
	3,
	11,
	33
	]
	]
	},
	"task_type": {
	"accuracy": 0.9138,
	"macro_f1": 0.9093,
	"per_label": {
	"bugfix": {
	"precision": 0.9565,
	"recall": 0.88,
	"f1": 0.9167,
	"support": 25
	},
	"feature": {
	"precision": 0.9302,
	"recall": 0.9524,
	"f1": 0.9412,
	"support": 42
	},
	"refactor": {
	"precision": 0.8696,
	"recall": 0.9524,
	"f1": 0.9091,
	"support": 21
	},
	"test": {
	"precision": 1.0,
	"recall": 0.8571,
	"f1": 0.9231,
	"support": 14
	},
	"design": {
	"precision": 0.9375,
	"recall": 0.9375,
	"f1": 0.9375,
	"support": 16
	},
	"docs": {
	"precision": 0.9333,
	"recall": 0.9333,
	"f1": 0.9333,
	"support": 15
	},
	"migration": {
	"precision": 0.9167,
	"recall": 1.0,
	"f1": 0.9565,
	"support": 22
	},
	"exploration": {
	"precision": 0.7778,
	"recall": 0.7368,
	"f1": 0.7568,
	"support": 19
	}
	},
	"confusion_matrix": [
	[
	22,
	1,
	0,
	0,
	0,
	0,
	0,
	2
	],
	[
	0,
	40,
	1,
	0,
	0,
	0,
	0,
	1
	],
	[
	0,
	0,
	20,
	0,
	0,
	0,
	1,
	0
	],
	[
	1,
	1,
	0,
	12,
	0,
	0,
	0,
	0
	],
	[
	0,
	0,
	1,
	0,
	15,
	0,
	0,
	0
	],
	[
	0,
	0,
	0,
	0,
	0,
	14,
	0,
	1
	],
	[
	0,
	0,
	0,
	0,
	0,
	0,
	22,
	0
	],
	[
	0,
	1,
	1,
	0,
	1,
	1,
	1,
	14
	]
	]
	},
	"risk": {
	"accuracy": 0.6954,
	"macro_f1": 0.6554,
	"per_label": {
	"low": {
	"precision": 0.8,
	"recall": 0.8276,
	"f1": 0.8136,
	"support": 87
	},
	"medium": {
	"precision": 0.5333,
	"recall": 0.6038,
	"f1": 0.5664,
	"support": 53
	},
	"high": {
	"precision": 0.7083,
	"recall": 0.5,
	"f1": 0.5862,
	"support": 34
	}
	},
	"confusion_matrix": [
	[
	72,
	13,
	2
	],
	[
	16,
	32,
	5
	],
	[
	2,
	15,
	17
	]
	]
	}
	},
	"overall": {
	"exact_match": 0.4655,
	"macro_average_f1": 0.7553,
	"automation_safe_accuracy": 0.6935,
	"automation_safe_coverage": 0.3563,
	"confidence_threshold": 0.8,
	"confidence_calibration": {
	"ece": 0.303072,
	"bins": [
	{
	"range": [
	0.5,
	0.6
	],
	"count": 5,
	"avg_confidence": 0.586,
	"accuracy": 0.0
	},
	{
	"range": [
	0.6,
	0.7
	],
	"count": 48,
	"avg_confidence": 0.6575,
	"accuracy": 0.2917
	},
	{
	"range": [
	0.7,
	0.8
	],
	"count": 59,
	"avg_confidence": 0.7473,
	"accuracy": 0.4068
	},
	{
	"range": [
	0.8,
	0.9
	],
	"count": 33,
	"avg_confidence": 0.8512,
	"accuracy": 0.5758
	},
	{
	"range": [
	0.9,
	1.0
	],
	"count": 29,
	"avg_confidence": 0.9333,
	"accuracy": 0.8276
	}
	]
	}
	},
	"temperature_scaling": {
	"method": "per_head_temperature_scaling",
	"per_head": {
	"complexity": 0.891251,
	"task_type": 0.707946,
	"risk": 1.059254
	}
	},
	"complexity_subdims": {
	"reasoning_depth": {
	"mae": 0.1069,
	"r2": 0.5888
	},
	"spec_completeness": {
	"mae": 0.1033,
	"r2": 0.3667
	},
	"scope_breadth": {
	"mae": 0.1076,
	"r2": 0.5044
	},
	"domain_knowledge": {
	"mae": 0.1036,
	"r2": 0.4405
	}
	},
	"risk_subdims": {
	"security_surface": {
	"mae": 0.1517,
	"r2": 0.1937
	},
	"data_sensitivity": {
	"mae": 0.1184,
	"r2": 0.3094
	},
	"production_exposure": {
	"mae": 0.1182,
	"r2": 0.6323
	},
	"reversal_cost": {
	"mae": 0.1045,
	"r2": 0.6316
	}
	}
	}