spawn-router / test_metrics.json
pranavkarthik10's picture
coding-router v6: ONNX (int8+fp32) + PyTorch weights, tokenizer, calibration, eval results
e07ba76 verified
Raw
History Blame Contribute Delete
6.02 kB
{
"per_head": {
"complexity": {
"accuracy": 0.6782,
"macro_f1": 0.6806,
"per_label": {
"easy": {
"precision": 0.5758,
"recall": 0.8636,
"f1": 0.6909,
"support": 44
},
"medium": {
"precision": 0.7941,
"recall": 0.5745,
"f1": 0.6667,
"support": 94
},
"hard": {
"precision": 0.65,
"recall": 0.7222,
"f1": 0.6842,
"support": 36
}
},
"confusion_matrix": [
[
38,
6,
0
],
[
26,
54,
14
],
[
2,
8,
26
]
]
},
"task_type": {
"accuracy": 0.8678,
"macro_f1": 0.8718,
"per_label": {
"bugfix": {
"precision": 0.875,
"recall": 0.84,
"f1": 0.8571,
"support": 25
},
"feature": {
"precision": 0.875,
"recall": 0.8333,
"f1": 0.8537,
"support": 42
},
"refactor": {
"precision": 0.84,
"recall": 1.0,
"f1": 0.913,
"support": 21
},
"test": {
"precision": 1.0,
"recall": 0.8571,
"f1": 0.9231,
"support": 14
},
"design": {
"precision": 0.8667,
"recall": 0.8125,
"f1": 0.8387,
"support": 16
},
"docs": {
"precision": 0.9286,
"recall": 0.8667,
"f1": 0.8966,
"support": 15
},
"migration": {
"precision": 0.9524,
"recall": 0.9091,
"f1": 0.9302,
"support": 22
},
"exploration": {
"precision": 0.6957,
"recall": 0.8421,
"f1": 0.7619,
"support": 19
}
},
"confusion_matrix": [
[
21,
0,
2,
0,
0,
0,
1,
1
],
[
0,
35,
1,
0,
1,
0,
0,
5
],
[
0,
0,
21,
0,
0,
0,
0,
0
],
[
1,
1,
0,
12,
0,
0,
0,
0
],
[
0,
1,
1,
0,
13,
1,
0,
0
],
[
0,
1,
0,
0,
0,
13,
0,
1
],
[
1,
1,
0,
0,
0,
0,
20,
0
],
[
1,
1,
0,
0,
1,
0,
0,
16
]
]
},
"risk": {
"accuracy": 0.6667,
"macro_f1": 0.6217,
"per_label": {
"low": {
"precision": 0.8068,
"recall": 0.8353,
"f1": 0.8208,
"support": 85
},
"medium": {
"precision": 0.5102,
"recall": 0.4545,
"f1": 0.4808,
"support": 55
},
"high": {
"precision": 0.5405,
"recall": 0.5882,
"f1": 0.5634,
"support": 34
}
},
"confusion_matrix": [
[
71,
11,
3
],
[
16,
25,
14
],
[
1,
13,
20
]
]
}
},
"overall": {
"exact_match": 0.3908,
"macro_average_f1": 0.7247,
"automation_safe_accuracy": 0.6167,
"automation_safe_coverage": 0.3448,
"confidence_threshold": 0.8,
"confidence_calibration": {
"ece": 0.369827,
"bins": [
{
"range": [
0.5,
0.6
],
"count": 2,
"avg_confidence": 0.5761,
"accuracy": 0.0
},
{
"range": [
0.6,
0.7
],
"count": 52,
"avg_confidence": 0.6519,
"accuracy": 0.3077
},
{
"range": [
0.7,
0.8
],
"count": 60,
"avg_confidence": 0.7476,
"accuracy": 0.25
},
{
"range": [
0.8,
0.9
],
"count": 41,
"avg_confidence": 0.8439,
"accuracy": 0.5366
},
{
"range": [
0.9,
1.0
],
"count": 19,
"avg_confidence": 0.9391,
"accuracy": 0.7895
}
]
}
},
"temperature_scaling": {
"method": "per_head_temperature_scaling",
"per_head": {
"complexity": 0.891251,
"task_type": 0.707946,
"risk": 1.059254
}
},
"complexity_subdims": {
"reasoning_depth": {
"mae": 0.1117,
"r2": 0.513
},
"spec_completeness": {
"mae": 0.1165,
"r2": 0.3356
},
"scope_breadth": {
"mae": 0.1082,
"r2": 0.4687
},
"domain_knowledge": {
"mae": 0.1199,
"r2": 0.3003
}
},
"risk_subdims": {
"security_surface": {
"mae": 0.1627,
"r2": 0.1921
},
"data_sensitivity": {
"mae": 0.1182,
"r2": 0.2477
},
"production_exposure": {
"mae": 0.1387,
"r2": 0.5113
},
"reversal_cost": {
"mae": 0.1103,
"r2": 0.5463
}
}
}