spawn-router / metrics.json
pranavkarthik10's picture
coding-router v6: ONNX (int8+fp32) + PyTorch weights, tokenizer, calibration, eval results
e07ba76 verified
Raw
History Blame Contribute Delete
6.02 kB
{
"per_head": {
"complexity": {
"accuracy": 0.6954,
"macro_f1": 0.7011,
"per_label": {
"easy": {
"precision": 0.7258,
"recall": 0.75,
"f1": 0.7377,
"support": 60
},
"medium": {
"precision": 0.6232,
"recall": 0.6418,
"f1": 0.6324,
"support": 67
},
"hard": {
"precision": 0.7674,
"recall": 0.7021,
"f1": 0.7333,
"support": 47
}
},
"confusion_matrix": [
[
45,
15,
0
],
[
14,
43,
10
],
[
3,
11,
33
]
]
},
"task_type": {
"accuracy": 0.9138,
"macro_f1": 0.9093,
"per_label": {
"bugfix": {
"precision": 0.9565,
"recall": 0.88,
"f1": 0.9167,
"support": 25
},
"feature": {
"precision": 0.9302,
"recall": 0.9524,
"f1": 0.9412,
"support": 42
},
"refactor": {
"precision": 0.8696,
"recall": 0.9524,
"f1": 0.9091,
"support": 21
},
"test": {
"precision": 1.0,
"recall": 0.8571,
"f1": 0.9231,
"support": 14
},
"design": {
"precision": 0.9375,
"recall": 0.9375,
"f1": 0.9375,
"support": 16
},
"docs": {
"precision": 0.9333,
"recall": 0.9333,
"f1": 0.9333,
"support": 15
},
"migration": {
"precision": 0.9167,
"recall": 1.0,
"f1": 0.9565,
"support": 22
},
"exploration": {
"precision": 0.7778,
"recall": 0.7368,
"f1": 0.7568,
"support": 19
}
},
"confusion_matrix": [
[
22,
1,
0,
0,
0,
0,
0,
2
],
[
0,
40,
1,
0,
0,
0,
0,
1
],
[
0,
0,
20,
0,
0,
0,
1,
0
],
[
1,
1,
0,
12,
0,
0,
0,
0
],
[
0,
0,
1,
0,
15,
0,
0,
0
],
[
0,
0,
0,
0,
0,
14,
0,
1
],
[
0,
0,
0,
0,
0,
0,
22,
0
],
[
0,
1,
1,
0,
1,
1,
1,
14
]
]
},
"risk": {
"accuracy": 0.6954,
"macro_f1": 0.6554,
"per_label": {
"low": {
"precision": 0.8,
"recall": 0.8276,
"f1": 0.8136,
"support": 87
},
"medium": {
"precision": 0.5333,
"recall": 0.6038,
"f1": 0.5664,
"support": 53
},
"high": {
"precision": 0.7083,
"recall": 0.5,
"f1": 0.5862,
"support": 34
}
},
"confusion_matrix": [
[
72,
13,
2
],
[
16,
32,
5
],
[
2,
15,
17
]
]
}
},
"overall": {
"exact_match": 0.4655,
"macro_average_f1": 0.7553,
"automation_safe_accuracy": 0.6935,
"automation_safe_coverage": 0.3563,
"confidence_threshold": 0.8,
"confidence_calibration": {
"ece": 0.303072,
"bins": [
{
"range": [
0.5,
0.6
],
"count": 5,
"avg_confidence": 0.586,
"accuracy": 0.0
},
{
"range": [
0.6,
0.7
],
"count": 48,
"avg_confidence": 0.6575,
"accuracy": 0.2917
},
{
"range": [
0.7,
0.8
],
"count": 59,
"avg_confidence": 0.7473,
"accuracy": 0.4068
},
{
"range": [
0.8,
0.9
],
"count": 33,
"avg_confidence": 0.8512,
"accuracy": 0.5758
},
{
"range": [
0.9,
1.0
],
"count": 29,
"avg_confidence": 0.9333,
"accuracy": 0.8276
}
]
}
},
"temperature_scaling": {
"method": "per_head_temperature_scaling",
"per_head": {
"complexity": 0.891251,
"task_type": 0.707946,
"risk": 1.059254
}
},
"complexity_subdims": {
"reasoning_depth": {
"mae": 0.1069,
"r2": 0.5888
},
"spec_completeness": {
"mae": 0.1033,
"r2": 0.3667
},
"scope_breadth": {
"mae": 0.1076,
"r2": 0.5044
},
"domain_knowledge": {
"mae": 0.1036,
"r2": 0.4405
}
},
"risk_subdims": {
"security_surface": {
"mae": 0.1517,
"r2": 0.1937
},
"data_sensitivity": {
"mae": 0.1184,
"r2": 0.3094
},
"production_exposure": {
"mae": 0.1182,
"r2": 0.6323
},
"reversal_cost": {
"mae": 0.1045,
"r2": 0.6316
}
}
}