spawn-router / test_metrics.json

coding-router v6: ONNX (int8+fp32) + PyTorch weights, tokenizer, calibration, eval results

e07ba76 verified 4 days ago

6.02 kB

	{
	"per_head": {
	"complexity": {
	"accuracy": 0.6782,
	"macro_f1": 0.6806,
	"per_label": {
	"easy": {
	"precision": 0.5758,
	"recall": 0.8636,
	"f1": 0.6909,
	"support": 44
	},
	"medium": {
	"precision": 0.7941,
	"recall": 0.5745,
	"f1": 0.6667,
	"support": 94
	},
	"hard": {
	"precision": 0.65,
	"recall": 0.7222,
	"f1": 0.6842,
	"support": 36
	}
	},
	"confusion_matrix": [
	[
	38,
	6,
	0
	],
	[
	26,
	54,
	14
	],
	[
	2,
	8,
	26
	]
	]
	},
	"task_type": {
	"accuracy": 0.8678,
	"macro_f1": 0.8718,
	"per_label": {
	"bugfix": {
	"precision": 0.875,
	"recall": 0.84,
	"f1": 0.8571,
	"support": 25
	},
	"feature": {
	"precision": 0.875,
	"recall": 0.8333,
	"f1": 0.8537,
	"support": 42
	},
	"refactor": {
	"precision": 0.84,
	"recall": 1.0,
	"f1": 0.913,
	"support": 21
	},
	"test": {
	"precision": 1.0,
	"recall": 0.8571,
	"f1": 0.9231,
	"support": 14
	},
	"design": {
	"precision": 0.8667,
	"recall": 0.8125,
	"f1": 0.8387,
	"support": 16
	},
	"docs": {
	"precision": 0.9286,
	"recall": 0.8667,
	"f1": 0.8966,
	"support": 15
	},
	"migration": {
	"precision": 0.9524,
	"recall": 0.9091,
	"f1": 0.9302,
	"support": 22
	},
	"exploration": {
	"precision": 0.6957,
	"recall": 0.8421,
	"f1": 0.7619,
	"support": 19
	}
	},
	"confusion_matrix": [
	[
	21,
	0,
	2,
	0,
	0,
	0,
	1,
	1
	],
	[
	0,
	35,
	1,
	0,
	1,
	0,
	0,
	5
	],
	[
	0,
	0,
	21,
	0,
	0,
	0,
	0,
	0
	],
	[
	1,
	1,
	0,
	12,
	0,
	0,
	0,
	0
	],
	[
	0,
	1,
	1,
	0,
	13,
	1,
	0,
	0
	],
	[
	0,
	1,
	0,
	0,
	0,
	13,
	0,
	1
	],
	[
	1,
	1,
	0,
	0,
	0,
	0,
	20,
	0
	],
	[
	1,
	1,
	0,
	0,
	1,
	0,
	0,
	16
	]
	]
	},
	"risk": {
	"accuracy": 0.6667,
	"macro_f1": 0.6217,
	"per_label": {
	"low": {
	"precision": 0.8068,
	"recall": 0.8353,
	"f1": 0.8208,
	"support": 85
	},
	"medium": {
	"precision": 0.5102,
	"recall": 0.4545,
	"f1": 0.4808,
	"support": 55
	},
	"high": {
	"precision": 0.5405,
	"recall": 0.5882,
	"f1": 0.5634,
	"support": 34
	}
	},
	"confusion_matrix": [
	[
	71,
	11,
	3
	],
	[
	16,
	25,
	14
	],
	[
	1,
	13,
	20
	]
	]
	}
	},
	"overall": {
	"exact_match": 0.3908,
	"macro_average_f1": 0.7247,
	"automation_safe_accuracy": 0.6167,
	"automation_safe_coverage": 0.3448,
	"confidence_threshold": 0.8,
	"confidence_calibration": {
	"ece": 0.369827,
	"bins": [
	{
	"range": [
	0.5,
	0.6
	],
	"count": 2,
	"avg_confidence": 0.5761,
	"accuracy": 0.0
	},
	{
	"range": [
	0.6,
	0.7
	],
	"count": 52,
	"avg_confidence": 0.6519,
	"accuracy": 0.3077
	},
	{
	"range": [
	0.7,
	0.8
	],
	"count": 60,
	"avg_confidence": 0.7476,
	"accuracy": 0.25
	},
	{
	"range": [
	0.8,
	0.9
	],
	"count": 41,
	"avg_confidence": 0.8439,
	"accuracy": 0.5366
	},
	{
	"range": [
	0.9,
	1.0
	],
	"count": 19,
	"avg_confidence": 0.9391,
	"accuracy": 0.7895
	}
	]
	}
	},
	"temperature_scaling": {
	"method": "per_head_temperature_scaling",
	"per_head": {
	"complexity": 0.891251,
	"task_type": 0.707946,
	"risk": 1.059254
	}
	},
	"complexity_subdims": {
	"reasoning_depth": {
	"mae": 0.1117,
	"r2": 0.513
	},
	"spec_completeness": {
	"mae": 0.1165,
	"r2": 0.3356
	},
	"scope_breadth": {
	"mae": 0.1082,
	"r2": 0.4687
	},
	"domain_knowledge": {
	"mae": 0.1199,
	"r2": 0.3003
	}
	},
	"risk_subdims": {
	"security_surface": {
	"mae": 0.1627,
	"r2": 0.1921
	},
	"data_sensitivity": {
	"mae": 0.1182,
	"r2": 0.2477
	},
	"production_exposure": {
	"mae": 0.1387,
	"r2": 0.5113
	},
	"reversal_cost": {
	"mae": 0.1103,
	"r2": 0.5463
	}
	}
	}