coding-router v6: ONNX (int8+fp32) + PyTorch weights, tokenizer, calibration, eval results
e07ba76 verified | { | |
| "per_head": { | |
| "complexity": { | |
| "accuracy": 0.6782, | |
| "macro_f1": 0.6806, | |
| "per_label": { | |
| "easy": { | |
| "precision": 0.5758, | |
| "recall": 0.8636, | |
| "f1": 0.6909, | |
| "support": 44 | |
| }, | |
| "medium": { | |
| "precision": 0.7941, | |
| "recall": 0.5745, | |
| "f1": 0.6667, | |
| "support": 94 | |
| }, | |
| "hard": { | |
| "precision": 0.65, | |
| "recall": 0.7222, | |
| "f1": 0.6842, | |
| "support": 36 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 38, | |
| 6, | |
| 0 | |
| ], | |
| [ | |
| 26, | |
| 54, | |
| 14 | |
| ], | |
| [ | |
| 2, | |
| 8, | |
| 26 | |
| ] | |
| ] | |
| }, | |
| "task_type": { | |
| "accuracy": 0.8678, | |
| "macro_f1": 0.8718, | |
| "per_label": { | |
| "bugfix": { | |
| "precision": 0.875, | |
| "recall": 0.84, | |
| "f1": 0.8571, | |
| "support": 25 | |
| }, | |
| "feature": { | |
| "precision": 0.875, | |
| "recall": 0.8333, | |
| "f1": 0.8537, | |
| "support": 42 | |
| }, | |
| "refactor": { | |
| "precision": 0.84, | |
| "recall": 1.0, | |
| "f1": 0.913, | |
| "support": 21 | |
| }, | |
| "test": { | |
| "precision": 1.0, | |
| "recall": 0.8571, | |
| "f1": 0.9231, | |
| "support": 14 | |
| }, | |
| "design": { | |
| "precision": 0.8667, | |
| "recall": 0.8125, | |
| "f1": 0.8387, | |
| "support": 16 | |
| }, | |
| "docs": { | |
| "precision": 0.9286, | |
| "recall": 0.8667, | |
| "f1": 0.8966, | |
| "support": 15 | |
| }, | |
| "migration": { | |
| "precision": 0.9524, | |
| "recall": 0.9091, | |
| "f1": 0.9302, | |
| "support": 22 | |
| }, | |
| "exploration": { | |
| "precision": 0.6957, | |
| "recall": 0.8421, | |
| "f1": 0.7619, | |
| "support": 19 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 21, | |
| 0, | |
| 2, | |
| 0, | |
| 0, | |
| 0, | |
| 1, | |
| 1 | |
| ], | |
| [ | |
| 0, | |
| 35, | |
| 1, | |
| 0, | |
| 1, | |
| 0, | |
| 0, | |
| 5 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 21, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0 | |
| ], | |
| [ | |
| 1, | |
| 1, | |
| 0, | |
| 12, | |
| 0, | |
| 0, | |
| 0, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 1, | |
| 1, | |
| 0, | |
| 13, | |
| 1, | |
| 0, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 1, | |
| 0, | |
| 0, | |
| 0, | |
| 13, | |
| 0, | |
| 1 | |
| ], | |
| [ | |
| 1, | |
| 1, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 20, | |
| 0 | |
| ], | |
| [ | |
| 1, | |
| 1, | |
| 0, | |
| 0, | |
| 1, | |
| 0, | |
| 0, | |
| 16 | |
| ] | |
| ] | |
| }, | |
| "risk": { | |
| "accuracy": 0.6667, | |
| "macro_f1": 0.6217, | |
| "per_label": { | |
| "low": { | |
| "precision": 0.8068, | |
| "recall": 0.8353, | |
| "f1": 0.8208, | |
| "support": 85 | |
| }, | |
| "medium": { | |
| "precision": 0.5102, | |
| "recall": 0.4545, | |
| "f1": 0.4808, | |
| "support": 55 | |
| }, | |
| "high": { | |
| "precision": 0.5405, | |
| "recall": 0.5882, | |
| "f1": 0.5634, | |
| "support": 34 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 71, | |
| 11, | |
| 3 | |
| ], | |
| [ | |
| 16, | |
| 25, | |
| 14 | |
| ], | |
| [ | |
| 1, | |
| 13, | |
| 20 | |
| ] | |
| ] | |
| } | |
| }, | |
| "overall": { | |
| "exact_match": 0.3908, | |
| "macro_average_f1": 0.7247, | |
| "automation_safe_accuracy": 0.6167, | |
| "automation_safe_coverage": 0.3448, | |
| "confidence_threshold": 0.8, | |
| "confidence_calibration": { | |
| "ece": 0.369827, | |
| "bins": [ | |
| { | |
| "range": [ | |
| 0.5, | |
| 0.6 | |
| ], | |
| "count": 2, | |
| "avg_confidence": 0.5761, | |
| "accuracy": 0.0 | |
| }, | |
| { | |
| "range": [ | |
| 0.6, | |
| 0.7 | |
| ], | |
| "count": 52, | |
| "avg_confidence": 0.6519, | |
| "accuracy": 0.3077 | |
| }, | |
| { | |
| "range": [ | |
| 0.7, | |
| 0.8 | |
| ], | |
| "count": 60, | |
| "avg_confidence": 0.7476, | |
| "accuracy": 0.25 | |
| }, | |
| { | |
| "range": [ | |
| 0.8, | |
| 0.9 | |
| ], | |
| "count": 41, | |
| "avg_confidence": 0.8439, | |
| "accuracy": 0.5366 | |
| }, | |
| { | |
| "range": [ | |
| 0.9, | |
| 1.0 | |
| ], | |
| "count": 19, | |
| "avg_confidence": 0.9391, | |
| "accuracy": 0.7895 | |
| } | |
| ] | |
| } | |
| }, | |
| "temperature_scaling": { | |
| "method": "per_head_temperature_scaling", | |
| "per_head": { | |
| "complexity": 0.891251, | |
| "task_type": 0.707946, | |
| "risk": 1.059254 | |
| } | |
| }, | |
| "complexity_subdims": { | |
| "reasoning_depth": { | |
| "mae": 0.1117, | |
| "r2": 0.513 | |
| }, | |
| "spec_completeness": { | |
| "mae": 0.1165, | |
| "r2": 0.3356 | |
| }, | |
| "scope_breadth": { | |
| "mae": 0.1082, | |
| "r2": 0.4687 | |
| }, | |
| "domain_knowledge": { | |
| "mae": 0.1199, | |
| "r2": 0.3003 | |
| } | |
| }, | |
| "risk_subdims": { | |
| "security_surface": { | |
| "mae": 0.1627, | |
| "r2": 0.1921 | |
| }, | |
| "data_sensitivity": { | |
| "mae": 0.1182, | |
| "r2": 0.2477 | |
| }, | |
| "production_exposure": { | |
| "mae": 0.1387, | |
| "r2": 0.5113 | |
| }, | |
| "reversal_cost": { | |
| "mae": 0.1103, | |
| "r2": 0.5463 | |
| } | |
| } | |
| } |