coding-router v6: ONNX (int8+fp32) + PyTorch weights, tokenizer, calibration, eval results
e07ba76 verified | { | |
| "per_head": { | |
| "complexity": { | |
| "accuracy": 0.6954, | |
| "macro_f1": 0.7011, | |
| "per_label": { | |
| "easy": { | |
| "precision": 0.7258, | |
| "recall": 0.75, | |
| "f1": 0.7377, | |
| "support": 60 | |
| }, | |
| "medium": { | |
| "precision": 0.6232, | |
| "recall": 0.6418, | |
| "f1": 0.6324, | |
| "support": 67 | |
| }, | |
| "hard": { | |
| "precision": 0.7674, | |
| "recall": 0.7021, | |
| "f1": 0.7333, | |
| "support": 47 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 45, | |
| 15, | |
| 0 | |
| ], | |
| [ | |
| 14, | |
| 43, | |
| 10 | |
| ], | |
| [ | |
| 3, | |
| 11, | |
| 33 | |
| ] | |
| ] | |
| }, | |
| "task_type": { | |
| "accuracy": 0.9138, | |
| "macro_f1": 0.9093, | |
| "per_label": { | |
| "bugfix": { | |
| "precision": 0.9565, | |
| "recall": 0.88, | |
| "f1": 0.9167, | |
| "support": 25 | |
| }, | |
| "feature": { | |
| "precision": 0.9302, | |
| "recall": 0.9524, | |
| "f1": 0.9412, | |
| "support": 42 | |
| }, | |
| "refactor": { | |
| "precision": 0.8696, | |
| "recall": 0.9524, | |
| "f1": 0.9091, | |
| "support": 21 | |
| }, | |
| "test": { | |
| "precision": 1.0, | |
| "recall": 0.8571, | |
| "f1": 0.9231, | |
| "support": 14 | |
| }, | |
| "design": { | |
| "precision": 0.9375, | |
| "recall": 0.9375, | |
| "f1": 0.9375, | |
| "support": 16 | |
| }, | |
| "docs": { | |
| "precision": 0.9333, | |
| "recall": 0.9333, | |
| "f1": 0.9333, | |
| "support": 15 | |
| }, | |
| "migration": { | |
| "precision": 0.9167, | |
| "recall": 1.0, | |
| "f1": 0.9565, | |
| "support": 22 | |
| }, | |
| "exploration": { | |
| "precision": 0.7778, | |
| "recall": 0.7368, | |
| "f1": 0.7568, | |
| "support": 19 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 22, | |
| 1, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 2 | |
| ], | |
| [ | |
| 0, | |
| 40, | |
| 1, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 1 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 20, | |
| 0, | |
| 0, | |
| 0, | |
| 1, | |
| 0 | |
| ], | |
| [ | |
| 1, | |
| 1, | |
| 0, | |
| 12, | |
| 0, | |
| 0, | |
| 0, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 1, | |
| 0, | |
| 15, | |
| 0, | |
| 0, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 14, | |
| 0, | |
| 1 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 22, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 1, | |
| 1, | |
| 0, | |
| 1, | |
| 1, | |
| 1, | |
| 14 | |
| ] | |
| ] | |
| }, | |
| "risk": { | |
| "accuracy": 0.6954, | |
| "macro_f1": 0.6554, | |
| "per_label": { | |
| "low": { | |
| "precision": 0.8, | |
| "recall": 0.8276, | |
| "f1": 0.8136, | |
| "support": 87 | |
| }, | |
| "medium": { | |
| "precision": 0.5333, | |
| "recall": 0.6038, | |
| "f1": 0.5664, | |
| "support": 53 | |
| }, | |
| "high": { | |
| "precision": 0.7083, | |
| "recall": 0.5, | |
| "f1": 0.5862, | |
| "support": 34 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 72, | |
| 13, | |
| 2 | |
| ], | |
| [ | |
| 16, | |
| 32, | |
| 5 | |
| ], | |
| [ | |
| 2, | |
| 15, | |
| 17 | |
| ] | |
| ] | |
| } | |
| }, | |
| "overall": { | |
| "exact_match": 0.4655, | |
| "macro_average_f1": 0.7553, | |
| "automation_safe_accuracy": 0.6935, | |
| "automation_safe_coverage": 0.3563, | |
| "confidence_threshold": 0.8, | |
| "confidence_calibration": { | |
| "ece": 0.303072, | |
| "bins": [ | |
| { | |
| "range": [ | |
| 0.5, | |
| 0.6 | |
| ], | |
| "count": 5, | |
| "avg_confidence": 0.586, | |
| "accuracy": 0.0 | |
| }, | |
| { | |
| "range": [ | |
| 0.6, | |
| 0.7 | |
| ], | |
| "count": 48, | |
| "avg_confidence": 0.6575, | |
| "accuracy": 0.2917 | |
| }, | |
| { | |
| "range": [ | |
| 0.7, | |
| 0.8 | |
| ], | |
| "count": 59, | |
| "avg_confidence": 0.7473, | |
| "accuracy": 0.4068 | |
| }, | |
| { | |
| "range": [ | |
| 0.8, | |
| 0.9 | |
| ], | |
| "count": 33, | |
| "avg_confidence": 0.8512, | |
| "accuracy": 0.5758 | |
| }, | |
| { | |
| "range": [ | |
| 0.9, | |
| 1.0 | |
| ], | |
| "count": 29, | |
| "avg_confidence": 0.9333, | |
| "accuracy": 0.8276 | |
| } | |
| ] | |
| } | |
| }, | |
| "temperature_scaling": { | |
| "method": "per_head_temperature_scaling", | |
| "per_head": { | |
| "complexity": 0.891251, | |
| "task_type": 0.707946, | |
| "risk": 1.059254 | |
| } | |
| }, | |
| "complexity_subdims": { | |
| "reasoning_depth": { | |
| "mae": 0.1069, | |
| "r2": 0.5888 | |
| }, | |
| "spec_completeness": { | |
| "mae": 0.1033, | |
| "r2": 0.3667 | |
| }, | |
| "scope_breadth": { | |
| "mae": 0.1076, | |
| "r2": 0.5044 | |
| }, | |
| "domain_knowledge": { | |
| "mae": 0.1036, | |
| "r2": 0.4405 | |
| } | |
| }, | |
| "risk_subdims": { | |
| "security_surface": { | |
| "mae": 0.1517, | |
| "r2": 0.1937 | |
| }, | |
| "data_sensitivity": { | |
| "mae": 0.1184, | |
| "r2": 0.3094 | |
| }, | |
| "production_exposure": { | |
| "mae": 0.1182, | |
| "r2": 0.6323 | |
| }, | |
| "reversal_cost": { | |
| "mae": 0.1045, | |
| "r2": 0.6316 | |
| } | |
| } | |
| } |