Spaces:
Sleeping
Sleeping
Bachstelze committed on
Commit Β·
a639edc
1
Parent(s): 2a23fe1
add time bench and viz
Browse files- A6/adaboost_classes.py +196 -0
- A6/all_classification.py +5 -0
- A6/benchmark_results/benchmark_20260310_090052.json +247 -0
- A6/benchmark_results/single_benchmark_20260310_090011.json +0 -0
- A6/benchmark_results/visualizations/accuracy_vs_inference_time.png +3 -0
- A6/benchmark_results/visualizations/compare_benchmarks.py +503 -0
- A6/benchmark_results/visualizations/inference_time_distribution.png +3 -0
- A6/benchmark_results/visualizations/mean_inference_times.png +3 -0
- A6/benchmark_results/visualizations/percentile_comparison.png +3 -0
- A6/benchmark_results/visualizations/response_time_comparison.html +234 -0
- A6/benchmark_results/visualizations/response_time_comparison.png +3 -0
- A6/benchmark_results/visualizations/standard_deviation_comparison.png +3 -0
- A6/benchmark_results/visualizations/summary_statistics.png +3 -0
- A6/benchmark_timing.md +335 -0
- A6/benchmark_timing.py +960 -0
- A6/check_svm_model.py +28 -0
- A6/test_classification_loading.py +380 -0
- A6/time_specification.md +22 -0
A6/adaboost_classes.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Helper module to import AdaBoost classes without running module-level code.
|
| 4 |
+
|
| 5 |
+
This module re-exports the AdaBoostEnsemble and WeightedDecisionTree classes
|
| 6 |
+
from classification_adaboost.py, but without triggering the module-level
|
| 7 |
+
data loading and training code.
|
| 8 |
+
"""
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sklearn.base import BaseEstimator, ClassifierMixin
|
| 11 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 12 |
+
from typing import List
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class WeightedDecisionTree(DecisionTreeClassifier):
    """
    Thin wrapper around sklearn's DecisionTreeClassifier that makes the
    sample-weight handling explicit.

    The tree is grown on weighted training data: every call to ``fit``
    forwards the per-sample weights straight to the underlying estimator,
    so the split criterion is computed from weighted errors.
    """

    def __init__(self, max_depth: int = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, random_state: int = 42):
        # Collect the hyper-parameters and delegate them unchanged to the
        # sklearn base class (which stores them as attributes, keeping
        # get_params()/clone() working).
        tree_params = {
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "random_state": random_state,
        }
        super().__init__(**tree_params)

    def fit(self, X, y, sample_weight=None):
        """Train the tree; ``sample_weight`` biases the split criterion."""
        return super().fit(X, y, sample_weight=sample_weight)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class AdaBoostEnsemble(BaseEstimator, ClassifierMixin):
    """
    AdaBoost ensemble of decision trees where each tree is grown based on
    weighted training errors. Weights are updated based on the error of
    previous trees.

    The algorithm:
    1. Initialize equal weights for all training samples
    2. For each tree in the ensemble:
       - Train a decision tree on weighted data
       - Calculate weighted error rate
       - Compute tree weight (alpha)
       - Boost the weights of misclassified samples
       - Normalize weights
    3. Make predictions using weighted voting

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (trees) to fit.
    max_depth, min_samples_split, min_samples_leaf : int
        Hyper-parameters forwarded to every WeightedDecisionTree.
    random_state : int
        Base seed; boosting round ``i`` uses ``random_state + i``.
    """

    def __init__(
        self,
        n_estimators: int = 50,
        max_depth: int = 5,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        random_state: int = 42
    ):
        # Hyper-parameters stored verbatim (sklearn get_params/clone rely
        # on __init__ arguments being kept as same-named attributes).
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        # Fitted state, populated by fit().
        self.trees: List[WeightedDecisionTree] = []
        self.tree_weights: List[float] = []
        self.n_classes: int = 0
        self.classes_: np.ndarray = None

    def _initialize_weights(self, n_samples: int) -> np.ndarray:
        """Return uniform sample weights that sum to 1."""
        return np.ones(n_samples) / n_samples

    def _update_weights(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray,
        alpha: float
    ) -> np.ndarray:
        """
        Update sample weights based on prediction errors.

        Misclassified samples are scaled by exp(alpha); correctly
        classified samples are left unscaled and the whole vector is then
        renormalized (the usual discrete-boosting update).
        """
        misclassified = y_true != y_pred
        updated_weights = weights * np.exp(alpha * misclassified.astype(float))
        # Renormalize so the weights remain a probability distribution.
        return updated_weights / updated_weights.sum()

    def _compute_weighted_error(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray
    ) -> float:
        """Return the weight-averaged misclassification rate in [0, 1]."""
        misclassified = (y_true != y_pred).astype(float)
        return np.sum(weights * misclassified) / np.sum(weights)

    def _compute_alpha(self, error: float) -> float:
        """
        Compute the voting weight (alpha) of a tree from its error.

        Clamped to +/-10 at the extremes so that a perfect (error == 0)
        or completely wrong (error == 1) tree does not produce log(0) or
        a division by zero.
        """
        if error <= 0:
            return 10.0  # Very high weight for perfect classifier
        if error >= 1:
            return -10.0  # Very negative weight for completely wrong classifier
        return 0.5 * np.log((1 - error) / error)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'AdaBoostEnsemble':
        """
        Fit the AdaBoost ensemble.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
        y : np.ndarray of shape (n_samples,)

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)

        # Fix: reset the fitted state so that calling fit() a second time
        # does not silently accumulate trees from the previous fit
        # (sklearn convention: fit must ignore any prior fit).
        self.trees = []
        self.tree_weights = []

        # Initialize sample weights
        weights = self._initialize_weights(n_samples)

        for i in range(self.n_estimators):
            # Train a tree on the current weight distribution; a distinct
            # seed per round keeps the trees from being identical.
            tree = WeightedDecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state + i
            )
            tree.fit(X, y, sample_weight=weights)

            # Evaluate this round on the training data.
            y_pred = tree.predict(X)
            error = self._compute_weighted_error(weights, y, y_pred)
            alpha = self._compute_alpha(error)

            # Re-weight the samples for the next round.
            weights = self._update_weights(weights, y, y_pred, alpha)

            # Store tree and its voting weight.
            self.trees.append(tree)
            self.tree_weights.append(alpha)

            print(f"Tree {i+1}/{self.n_estimators}: Error={error:.4f}, Alpha={alpha:.4f}")

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels via alpha-weighted majority voting."""
        # Predictions of every tree, shape (n_trees, n_samples).
        all_predictions = np.array([tree.predict(X) for tree in self.trees])
        classes = self.classes_

        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        # Each tree casts a vote of weight alpha for its predicted class.
        for tree_idx in range(len(self.trees)):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]
            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += alpha * (predictions == class_label)

        # Class with the highest weighted vote wins.
        return classes[np.argmax(weighted_votes, axis=1)]

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict pseudo-probabilities as normalized |alpha|-weighted votes.

        Each row sums to 1 because every tree votes for exactly one class
        per sample and the votes are divided by the total |alpha| mass.
        """
        all_predictions = np.array([tree.predict(X) for tree in self.trees])
        classes = self.classes_

        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        total_weight = sum(abs(w) for w in self.tree_weights)

        for tree_idx in range(len(self.trees)):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]
            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += abs(alpha) * (predictions == class_label)

        # Normalize to get probabilities
        return weighted_votes / total_weight
|
A6/all_classification.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Paths to the champion classification models produced by each assignment.
a4_rf = "../A4/models/weaklink_classifier_rf.pkl"
# NOTE: "ensemnble" is a historical typo; kept so existing imports keep working.
a5_ensemnble = "../A5/models/ensemble_classification_champion.pkl"
a5_ensemble = a5_ensemnble  # correctly spelled alias — prefer this in new code
a5b_adaboost = "../A5b/models/adaboost_classification.pkl"
a5b_bagging_tree = "../A5b/models/bagging_trees_champion.pkl"
a6_svm = "models/champion_svm.pkl"
|
A6/benchmark_results/benchmark_20260310_090052.json
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": "2026-03-10T09:00:50.070144",
|
| 3 |
+
"num_samples": 100,
|
| 4 |
+
"num_repeats": 10,
|
| 5 |
+
"models": {
|
| 6 |
+
"A4 Random Forest": {
|
| 7 |
+
"model_name": "A4 Random Forest",
|
| 8 |
+
"model_path": "../A4/models/weaklink_classifier_rf.pkl",
|
| 9 |
+
"inference_time_mean": 0.06072263170499355,
|
| 10 |
+
"inference_time_std": 0.0030473875509894866,
|
| 11 |
+
"inference_time_min": 0.058138252003118396,
|
| 12 |
+
"inference_time_max": 0.06896431901259348,
|
| 13 |
+
"inference_time_p50": 0.060211887990590185,
|
| 14 |
+
"inference_time_p95": 0.06896431901259348,
|
| 15 |
+
"inference_time_p99": 0.06896431901259348,
|
| 16 |
+
"memory_usage_mean": 360134.1,
|
| 17 |
+
"memory_usage_std": 67634.63257081308,
|
| 18 |
+
"memory_usage_peak": 512177,
|
| 19 |
+
"accuracy": 0.89,
|
| 20 |
+
"predictions_correct": 89,
|
| 21 |
+
"predictions_total": 100,
|
| 22 |
+
"model_size_bytes": 16381898,
|
| 23 |
+
"num_features": 41,
|
| 24 |
+
"num_parameters": 0,
|
| 25 |
+
"model_type": "RandomForestClassifier",
|
| 26 |
+
"feature_extraction_time_mean": 0.0,
|
| 27 |
+
"timing_samples": [
|
| 28 |
+
0.060492476040963084,
|
| 29 |
+
0.05959970800904557,
|
| 30 |
+
0.05881448305444792,
|
| 31 |
+
0.058138252003118396,
|
| 32 |
+
0.06896431901259348,
|
| 33 |
+
0.060211887990590185,
|
| 34 |
+
0.05942972801858559,
|
| 35 |
+
0.061595859995577484,
|
| 36 |
+
0.0596357659669593,
|
| 37 |
+
0.06034383695805445
|
| 38 |
+
],
|
| 39 |
+
"memory_samples": [
|
| 40 |
+
512177,
|
| 41 |
+
377303,
|
| 42 |
+
302127,
|
| 43 |
+
358391,
|
| 44 |
+
379313,
|
| 45 |
+
354423,
|
| 46 |
+
380515,
|
| 47 |
+
281588,
|
| 48 |
+
379268,
|
| 49 |
+
276236
|
| 50 |
+
],
|
| 51 |
+
"status": "SUCCESS",
|
| 52 |
+
"error_message": ""
|
| 53 |
+
},
|
| 54 |
+
"A5 Ensemble": {
|
| 55 |
+
"model_name": "A5 Ensemble",
|
| 56 |
+
"model_path": "../A5/models/ensemble_classification_champion.pkl",
|
| 57 |
+
"inference_time_mean": 0.08792474841466173,
|
| 58 |
+
"inference_time_std": 0.019674506115526187,
|
| 59 |
+
"inference_time_min": 0.067903274029959,
|
| 60 |
+
"inference_time_max": 0.13867365900659934,
|
| 61 |
+
"inference_time_p50": 0.08352956402814016,
|
| 62 |
+
"inference_time_p95": 0.13867365900659934,
|
| 63 |
+
"inference_time_p99": 0.13867365900659934,
|
| 64 |
+
"memory_usage_mean": 404756.5,
|
| 65 |
+
"memory_usage_std": 288156.9877403828,
|
| 66 |
+
"memory_usage_peak": 1210671,
|
| 67 |
+
"accuracy": 0.67,
|
| 68 |
+
"predictions_correct": 67,
|
| 69 |
+
"predictions_total": 100,
|
| 70 |
+
"model_size_bytes": 26660056,
|
| 71 |
+
"num_features": 36,
|
| 72 |
+
"num_parameters": 0,
|
| 73 |
+
"model_type": "VotingClassifier",
|
| 74 |
+
"feature_extraction_time_mean": 0.0,
|
| 75 |
+
"timing_samples": [
|
| 76 |
+
0.13867365900659934,
|
| 77 |
+
0.08352956402814016,
|
| 78 |
+
0.067903274029959,
|
| 79 |
+
0.08235391502967104,
|
| 80 |
+
0.09512816503411159,
|
| 81 |
+
0.09174130897736177,
|
| 82 |
+
0.07728461700025946,
|
| 83 |
+
0.07468455104390159,
|
| 84 |
+
0.07801902701612562,
|
| 85 |
+
0.0899294029804878
|
| 86 |
+
],
|
| 87 |
+
"memory_samples": [
|
| 88 |
+
1210671,
|
| 89 |
+
276078,
|
| 90 |
+
257244,
|
| 91 |
+
374860,
|
| 92 |
+
258411,
|
| 93 |
+
374702,
|
| 94 |
+
277252,
|
| 95 |
+
270064,
|
| 96 |
+
372458,
|
| 97 |
+
375825
|
| 98 |
+
],
|
| 99 |
+
"status": "SUCCESS",
|
| 100 |
+
"error_message": ""
|
| 101 |
+
},
|
| 102 |
+
"A5b Adaboost": {
|
| 103 |
+
"model_name": "A5b Adaboost",
|
| 104 |
+
"model_path": "../A5b/models/adaboost_classification.pkl",
|
| 105 |
+
"inference_time_mean": 0.03466975499759428,
|
| 106 |
+
"inference_time_std": 0.006925241966045739,
|
| 107 |
+
"inference_time_min": 0.030500065011437982,
|
| 108 |
+
"inference_time_max": 0.048356816987507045,
|
| 109 |
+
"inference_time_p50": 0.032038366014603525,
|
| 110 |
+
"inference_time_p95": 0.048356816987507045,
|
| 111 |
+
"inference_time_p99": 0.048356816987507045,
|
| 112 |
+
"memory_usage_mean": 204768.4,
|
| 113 |
+
"memory_usage_std": 311.91138342662504,
|
| 114 |
+
"memory_usage_peak": 205656,
|
| 115 |
+
"accuracy": 0.52,
|
| 116 |
+
"predictions_correct": 52,
|
| 117 |
+
"predictions_total": 100,
|
| 118 |
+
"model_size_bytes": 725059,
|
| 119 |
+
"num_features": 0,
|
| 120 |
+
"num_parameters": 0,
|
| 121 |
+
"model_type": "AdaBoostEnsemble",
|
| 122 |
+
"feature_extraction_time_mean": 0.0,
|
| 123 |
+
"timing_samples": [
|
| 124 |
+
0.048356816987507045,
|
| 125 |
+
0.047088092018384486,
|
| 126 |
+
0.03258101601386443,
|
| 127 |
+
0.03238268301356584,
|
| 128 |
+
0.03146621095947921,
|
| 129 |
+
0.032038366014603525,
|
| 130 |
+
0.030500065011437982,
|
| 131 |
+
0.03090687998337671,
|
| 132 |
+
0.03052046400262043,
|
| 133 |
+
0.03085695597110316
|
| 134 |
+
],
|
| 135 |
+
"memory_samples": [
|
| 136 |
+
205656,
|
| 137 |
+
204684,
|
| 138 |
+
204668,
|
| 139 |
+
204668,
|
| 140 |
+
204668,
|
| 141 |
+
204668,
|
| 142 |
+
204668,
|
| 143 |
+
204668,
|
| 144 |
+
204668,
|
| 145 |
+
204668
|
| 146 |
+
],
|
| 147 |
+
"status": "SUCCESS",
|
| 148 |
+
"error_message": ""
|
| 149 |
+
},
|
| 150 |
+
"A5b Bagging Trees": {
|
| 151 |
+
"model_name": "A5b Bagging Trees",
|
| 152 |
+
"model_path": "../A5b/models/bagging_trees_champion.pkl",
|
| 153 |
+
"inference_time_mean": 0.006075771508039907,
|
| 154 |
+
"inference_time_std": 0.0017926972777932554,
|
| 155 |
+
"inference_time_min": 0.0038332950207404792,
|
| 156 |
+
"inference_time_max": 0.00979096203809604,
|
| 157 |
+
"inference_time_p50": 0.006550171005073935,
|
| 158 |
+
"inference_time_p95": 0.00979096203809604,
|
| 159 |
+
"inference_time_p99": 0.00979096203809604,
|
| 160 |
+
"memory_usage_mean": 59716.6,
|
| 161 |
+
"memory_usage_std": 68.09176814335848,
|
| 162 |
+
"memory_usage_peak": 59866,
|
| 163 |
+
"accuracy": 0.0,
|
| 164 |
+
"predictions_correct": 0,
|
| 165 |
+
"predictions_total": 100,
|
| 166 |
+
"model_size_bytes": 6506123,
|
| 167 |
+
"num_features": 36,
|
| 168 |
+
"num_parameters": 0,
|
| 169 |
+
"model_type": "LGBMClassifier",
|
| 170 |
+
"feature_extraction_time_mean": 0.0,
|
| 171 |
+
"timing_samples": [
|
| 172 |
+
0.006550171005073935,
|
| 173 |
+
0.0061910360236652195,
|
| 174 |
+
0.0068354670074768364,
|
| 175 |
+
0.006988314969930798,
|
| 176 |
+
0.004823405994102359,
|
| 177 |
+
0.006920185987837613,
|
| 178 |
+
0.00979096203809604,
|
| 179 |
+
0.0038514090119861066,
|
| 180 |
+
0.0038332950207404792,
|
| 181 |
+
0.00497346802148968
|
| 182 |
+
],
|
| 183 |
+
"memory_samples": [
|
| 184 |
+
59866,
|
| 185 |
+
59746,
|
| 186 |
+
59746,
|
| 187 |
+
59746,
|
| 188 |
+
59746,
|
| 189 |
+
59700,
|
| 190 |
+
59654,
|
| 191 |
+
59654,
|
| 192 |
+
59654,
|
| 193 |
+
59654
|
| 194 |
+
],
|
| 195 |
+
"status": "SUCCESS",
|
| 196 |
+
"error_message": ""
|
| 197 |
+
},
|
| 198 |
+
"A6 SVM": {
|
| 199 |
+
"model_name": "A6 SVM",
|
| 200 |
+
"model_path": "models/champion_svm.pkl",
|
| 201 |
+
"inference_time_mean": 0.009102203900692985,
|
| 202 |
+
"inference_time_std": 0.0003233410993925297,
|
| 203 |
+
"inference_time_min": 0.008689811977092177,
|
| 204 |
+
"inference_time_max": 0.009627135004848242,
|
| 205 |
+
"inference_time_p50": 0.009107397985644639,
|
| 206 |
+
"inference_time_p95": 0.009627135004848242,
|
| 207 |
+
"inference_time_p99": 0.009627135004848242,
|
| 208 |
+
"memory_usage_mean": 62088.6,
|
| 209 |
+
"memory_usage_std": 193.42021036535397,
|
| 210 |
+
"memory_usage_peak": 62631,
|
| 211 |
+
"accuracy": 0.83,
|
| 212 |
+
"predictions_correct": 83,
|
| 213 |
+
"predictions_total": 100,
|
| 214 |
+
"model_size_bytes": 700346,
|
| 215 |
+
"num_features": 36,
|
| 216 |
+
"num_parameters": 0,
|
| 217 |
+
"model_type": "Pipeline",
|
| 218 |
+
"feature_extraction_time_mean": 0.0,
|
| 219 |
+
"timing_samples": [
|
| 220 |
+
0.009627135004848242,
|
| 221 |
+
0.009057053015567362,
|
| 222 |
+
0.009107397985644639,
|
| 223 |
+
0.008771255961619318,
|
| 224 |
+
0.00915416597854346,
|
| 225 |
+
0.008994235016871244,
|
| 226 |
+
0.00961044302675873,
|
| 227 |
+
0.00879047199850902,
|
| 228 |
+
0.009220069041475654,
|
| 229 |
+
0.008689811977092177
|
| 230 |
+
],
|
| 231 |
+
"memory_samples": [
|
| 232 |
+
62631,
|
| 233 |
+
62063,
|
| 234 |
+
62047,
|
| 235 |
+
62047,
|
| 236 |
+
61955,
|
| 237 |
+
62047,
|
| 238 |
+
62047,
|
| 239 |
+
62047,
|
| 240 |
+
62001,
|
| 241 |
+
62001
|
| 242 |
+
],
|
| 243 |
+
"status": "SUCCESS",
|
| 244 |
+
"error_message": ""
|
| 245 |
+
}
|
| 246 |
+
}
|
| 247 |
+
}
|
A6/benchmark_results/single_benchmark_20260310_090011.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
A6/benchmark_results/visualizations/accuracy_vs_inference_time.png
ADDED
|
Git LFS Details
|
A6/benchmark_results/visualizations/compare_benchmarks.py
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to compare response times (inference times) from two benchmark JSON files.
|
| 4 |
+
Generates a visualization comparing the models from both benchmarks.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import numpy as np
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# File paths
|
| 13 |
+
benchmark_path = Path(__file__).parent / "../benchmark_20260310_090052.json"
|
| 14 |
+
single_benchmark_path = Path(__file__).parent / "../single_benchmark_20260310_090011.json"
|
| 15 |
+
|
| 16 |
+
# Load benchmark data
|
| 17 |
+
with open(benchmark_path, 'r') as f:
|
| 18 |
+
benchmark_data = json.load(f)
|
| 19 |
+
|
| 20 |
+
with open(single_benchmark_path, 'r') as f:
|
| 21 |
+
single_benchmark_data = json.load(f)
|
| 22 |
+
|
| 23 |
+
# Extract model data
def extract_model_data(data_dict):
    """Flatten the per-model statistics of one benchmark result dict.

    Returns a mapping of model name -> dict with the keys 'mean', 'std',
    'min', 'max', 'p50', 'p95', 'p99', 'accuracy' and 'timing_samples'.
    Missing numeric fields default to 0, a missing sample list to [].
    """
    # Short key -> field name as stored in the benchmark JSON.
    timing_fields = {
        'mean': 'inference_time_mean',
        'std': 'inference_time_std',
        'min': 'inference_time_min',
        'max': 'inference_time_max',
        'p50': 'inference_time_p50',
        'p95': 'inference_time_p95',
        'p99': 'inference_time_p99',
    }
    extracted = {}
    for name, info in data_dict.get('models', {}).items():
        entry = {short: info.get(full, 0) for short, full in timing_fields.items()}
        entry['accuracy'] = info.get('accuracy', 0)
        entry['timing_samples'] = info.get('timing_samples', [])
        extracted[name] = entry
    return extracted
|
| 39 |
+
|
| 40 |
+
benchmark_models = extract_model_data(benchmark_data)
single_benchmark_models = extract_model_data(single_benchmark_data)

# Get all model names (should be the same in both benchmark files).
all_model_names = sorted(benchmark_models.keys())


def _annotate_bars(ax, bars, fmt='{:.3f}', fontsize=8):
    """Write each bar's height just above it."""
    for bar in bars:
        height = bar.get_height()
        ax.annotate(fmt.format(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=fontsize)


# Combined overview figure holding all six comparison panels.
fig = plt.figure(figsize=(16, 10))

# 1. Bar chart comparing mean inference times
ax1 = fig.add_subplot(2, 3, 1)
x = np.arange(len(all_model_names))
width = 0.35

benchmark_means = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]  # seconds -> ms
single_means = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]  # seconds -> ms

bars1 = ax1.bar(x - width/2, benchmark_means, width, label='Multi-benchmark (100 samples)', alpha=0.8)
bars2 = ax1.bar(x + width/2, single_means, width, label='Single-benchmark (10 samples)', alpha=0.8)

ax1.set_xlabel('Model')
ax1.set_ylabel('Mean Inference Time (ms)')
ax1.set_title('Comparison of Mean Inference Times')
ax1.set_xticks(x)
ax1.set_xticklabels(all_model_names, rotation=45, ha='right')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)
_annotate_bars(ax1, bars1)
_annotate_bars(ax1, bars2)

# 2. Box plot comparing timing distributions
ax2 = fig.add_subplot(2, 3, 2)
all_data = []
labels = []
colors = []
for i, model_name in enumerate(all_model_names):
    # Use only the first 10 samples of each run so the two benchmarks
    # are compared over the same number of observations.
    benchmark_ms = [s * 1000 for s in benchmark_models[model_name]['timing_samples'][:10]]
    single_ms = [s * 1000 for s in single_benchmark_models[model_name]['timing_samples'][:10]]
    all_data.append(benchmark_ms)
    all_data.append(single_ms)
    labels.append(f'{model_name}\nMulti')
    labels.append(f'{model_name}\nSingle')
    colors.extend([f'C{i}', f'C{i}'])  # same color for the model's two boxes

bp = ax2.boxplot(all_data, labels=labels, patch_artist=True, vert=True)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)

ax2.set_xlabel('Model (Benchmark Type)')
ax2.set_ylabel('Inference Time (ms)')
ax2.set_title('Distribution of Inference Times (Box Plot)')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(axis='y', alpha=0.3)

# 3. Accuracy vs mean inference time scatter plot
ax3 = fig.add_subplot(2, 3, 3)
benchmark_accs = [benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
single_accs = [single_benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
benchmark_times = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
single_times = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]

for i, model_name in enumerate(all_model_names):
    ax3.scatter([benchmark_times[i]], [benchmark_accs[i]], marker='o', s=100,
                label=f'{model_name} (Multi)', alpha=0.8, color=f'C{i}')
    ax3.scatter([single_times[i]], [single_accs[i]], marker='s', s=100,
                label=f'{model_name} (Single)', alpha=0.8, color=f'C{i}')

ax3.set_xlabel('Mean Inference Time (ms)')
ax3.set_ylabel('Accuracy (%)')
ax3.set_title('Accuracy vs Inference Time Comparison')
ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
ax3.grid(True, alpha=0.3)

# 4. Percentile comparison (P50, P95, P99)
ax4 = fig.add_subplot(2, 3, 4)
x = np.arange(len(all_model_names))
width = 0.25

benchmark_p50 = [benchmark_models[m]['p50'] * 1000 for m in all_model_names]
benchmark_p95 = [benchmark_models[m]['p95'] * 1000 for m in all_model_names]
benchmark_p99 = [benchmark_models[m]['p99'] * 1000 for m in all_model_names]
single_p50 = [single_benchmark_models[m]['p50'] * 1000 for m in all_model_names]
single_p95 = [single_benchmark_models[m]['p95'] * 1000 for m in all_model_names]
single_p99 = [single_benchmark_models[m]['p99'] * 1000 for m in all_model_names]

ax4.bar(x - width, benchmark_p50, width, label='P50 (Multi)', alpha=0.8)
ax4.bar(x, benchmark_p95, width, label='P95 (Multi)', alpha=0.8)
ax4.bar(x + width, benchmark_p99, width, label='P99 (Multi)', alpha=0.8)
# Single-benchmark percentiles are drawn slightly offset and hatched so both
# runs remain visible in the same panel.
ax4.bar(x - width + 0.05, single_p50, width*0.8, label='P50 (Single)', alpha=0.6, hatch='//')
ax4.bar(x + 0.05, single_p95, width*0.8, label='P95 (Single)', alpha=0.6, hatch='//')
ax4.bar(x + width + 0.05, single_p99, width*0.8, label='P99 (Single)', alpha=0.6, hatch='//')

ax4.set_xlabel('Model')
ax4.set_ylabel('Inference Time (ms)')
ax4.set_title('Percentile Comparison (P50, P95, P99)')
ax4.set_xticks(x)
ax4.set_xticklabels(all_model_names, rotation=45, ha='right')
ax4.legend(fontsize='small')
ax4.grid(axis='y', alpha=0.3)

# 5. Standard deviation comparison
ax5 = fig.add_subplot(2, 3, 5)
benchmark_std = [benchmark_models[m]['std'] * 1000 for m in all_model_names]
single_std = [single_benchmark_models[m]['std'] * 1000 for m in all_model_names]

x = np.arange(len(all_model_names))
width = 0.35
bars_std1 = ax5.bar(x - width/2, benchmark_std, width, label='Multi-benchmark', alpha=0.8)
bars_std2 = ax5.bar(x + width/2, single_std, width, label='Single-benchmark', alpha=0.8)

ax5.set_xlabel('Model')
ax5.set_ylabel('Standard Deviation (ms)')
ax5.set_title('Standard Deviation of Inference Times')
ax5.set_xticks(x)
ax5.set_xticklabels(all_model_names, rotation=45, ha='right')
ax5.legend()
ax5.grid(axis='y', alpha=0.3)
_annotate_bars(ax5, bars_std1, fmt='{:.4f}', fontsize=7)
_annotate_bars(ax5, bars_std2, fmt='{:.4f}', fontsize=7)

# 6. Summary statistics table
ax6 = fig.add_subplot(2, 3, 6)
ax6.axis('off')

table_data = []
for model_name in all_model_names:
    table_data.append([
        model_name,
        f"{benchmark_models[model_name]['mean']*1000:.3f} Β± {benchmark_models[model_name]['std']*1000:.3f}",
        f"{benchmark_models[model_name]['min']*1000:.3f}",
        f"{benchmark_models[model_name]['max']*1000:.3f}",
        f"{benchmark_models[model_name]['accuracy']*100:.1f}%",
        f"{single_benchmark_models[model_name]['mean']*1000:.3f} Β± {single_benchmark_models[model_name]['std']*1000:.3f}",
        f"{single_benchmark_models[model_name]['min']*1000:.3f}",
        f"{single_benchmark_models[model_name]['max']*1000:.3f}",
        f"{single_benchmark_models[model_name]['accuracy']*100:.1f}%",
    ])

columns = ['Model', 'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)',
           'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']

table = ax6.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1.1, 1.8)

# Style the table: gray = multi-benchmark columns, blue = single-benchmark.
for i in range(len(all_model_names)):
    for j in range(len(columns)):
        cell = table[(i+1, j)]
        cell.set_height(0.4)
        if j < 5:
            cell.set_facecolor('#f0f0f0')  # light gray for multi-benchmark columns
        else:
            cell.set_facecolor('#e0e0f0')  # light blue for single-benchmark columns

ax6.set_title('Summary Statistics Comparison', fontsize=12, pad=20)

# Fix: previously the combined figure was built but never written to disk,
# even though the HTML report embeds response_time_comparison.png.
plt.tight_layout()
fig.savefig(Path(__file__).parent / "response_time_comparison.png", dpi=300, bbox_inches='tight')
plt.close(fig)
print("Saved: response_time_comparison.png")
|
| 248 |
+
|
| 249 |
+
# Save each subplot as a separate PNG image.
output_dir = Path(__file__).parent


def _save_figure(fig_obj, filename):
    """Apply tight_layout, save under output_dir at 300 dpi, close, and log."""
    plt.tight_layout()
    plt.savefig(output_dir / filename, dpi=300, bbox_inches='tight')
    plt.close(fig_obj)
    print(f"Saved: {filename}")


def _label_bars(ax, bars, fmt='{:.3f}', fontsize=8):
    """Annotate each bar with its height, just above the bar top."""
    for bar in bars:
        height = bar.get_height()
        ax.annotate(fmt.format(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords="offset points",
                    ha='center', va='bottom', fontsize=fontsize)


# 1. Bar chart comparing mean inference times
fig1, ax1_single = plt.subplots(figsize=(10, 6))
x = np.arange(len(all_model_names))
width = 0.35
benchmark_means = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
single_means = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]
bars1 = ax1_single.bar(x - width/2, benchmark_means, width, label='Multi-benchmark (100 samples)', alpha=0.8)
bars2 = ax1_single.bar(x + width/2, single_means, width, label='Single-benchmark (10 samples)', alpha=0.8)
ax1_single.set_xlabel('Model')
ax1_single.set_ylabel('Mean Inference Time (ms)')
ax1_single.set_title('Comparison of Mean Inference Times')
ax1_single.set_xticks(x)
ax1_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax1_single.legend()
ax1_single.grid(axis='y', alpha=0.3)
_label_bars(ax1_single, bars1)
_label_bars(ax1_single, bars2)
_save_figure(fig1, "mean_inference_times.png")

# 2. Box plot comparing timing distributions
fig2, ax2_single = plt.subplots(figsize=(12, 6))
all_data = []
labels = []
colors = []
for i, model_name in enumerate(all_model_names):
    # First 10 samples of each run keep multi vs single comparable.
    benchmark_ms = [s * 1000 for s in benchmark_models[model_name]['timing_samples'][:10]]
    single_ms = [s * 1000 for s in single_benchmark_models[model_name]['timing_samples'][:10]]
    all_data.append(benchmark_ms)
    all_data.append(single_ms)
    labels.append(f'{model_name}\nMulti')
    labels.append(f'{model_name}\nSingle')
    colors.extend([f'C{i}', f'C{i}'])
bp = ax2_single.boxplot(all_data, labels=labels, patch_artist=True, vert=True)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)
ax2_single.set_xlabel('Model (Benchmark Type)')
ax2_single.set_ylabel('Inference Time (ms)')
ax2_single.set_title('Distribution of Inference Times (Box Plot)')
ax2_single.tick_params(axis='x', rotation=45)
ax2_single.grid(axis='y', alpha=0.3)
_save_figure(fig2, "inference_time_distribution.png")

# 3. Accuracy vs inference time scatter plot
fig3, ax3_single = plt.subplots(figsize=(10, 6))
benchmark_accs = [benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
single_accs = [single_benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
benchmark_times = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
single_times = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]
for i, model_name in enumerate(all_model_names):
    ax3_single.scatter([benchmark_times[i]], [benchmark_accs[i]], marker='o', s=100,
                       label=f'{model_name} (Multi)', alpha=0.8, color=f'C{i}')
    ax3_single.scatter([single_times[i]], [single_accs[i]], marker='s', s=100,
                       label=f'{model_name} (Single)', alpha=0.8, color=f'C{i}')
ax3_single.set_xlabel('Mean Inference Time (ms)')
ax3_single.set_ylabel('Accuracy (%)')
ax3_single.set_title('Accuracy vs Inference Time Comparison')
ax3_single.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
ax3_single.grid(True, alpha=0.3)
_save_figure(fig3, "accuracy_vs_inference_time.png")

# 4. Percentile comparison (P50, P95, P99)
fig4, ax4_single = plt.subplots(figsize=(12, 6))
x = np.arange(len(all_model_names))
width = 0.25
benchmark_p50 = [benchmark_models[m]['p50'] * 1000 for m in all_model_names]
benchmark_p95 = [benchmark_models[m]['p95'] * 1000 for m in all_model_names]
benchmark_p99 = [benchmark_models[m]['p99'] * 1000 for m in all_model_names]
single_p50 = [single_benchmark_models[m]['p50'] * 1000 for m in all_model_names]
single_p95 = [single_benchmark_models[m]['p95'] * 1000 for m in all_model_names]
single_p99 = [single_benchmark_models[m]['p99'] * 1000 for m in all_model_names]
ax4_single.bar(x - width, benchmark_p50, width, label='P50 (Multi)', alpha=0.8)
ax4_single.bar(x, benchmark_p95, width, label='P95 (Multi)', alpha=0.8)
ax4_single.bar(x + width, benchmark_p99, width, label='P99 (Multi)', alpha=0.8)
# Single-benchmark bars are offset and hatched to stay distinguishable.
ax4_single.bar(x - width + 0.05, single_p50, width*0.8, label='P50 (Single)', alpha=0.6, hatch='//')
ax4_single.bar(x + 0.05, single_p95, width*0.8, label='P95 (Single)', alpha=0.6, hatch='//')
ax4_single.bar(x + width + 0.05, single_p99, width*0.8, label='P99 (Single)', alpha=0.6, hatch='//')
ax4_single.set_xlabel('Model')
ax4_single.set_ylabel('Inference Time (ms)')
ax4_single.set_title('Percentile Comparison (P50, P95, P99)')
ax4_single.set_xticks(x)
ax4_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax4_single.legend(fontsize='small')
ax4_single.grid(axis='y', alpha=0.3)
_save_figure(fig4, "percentile_comparison.png")

# 5. Standard deviation comparison
fig5, ax5_single = plt.subplots(figsize=(10, 6))
benchmark_std = [benchmark_models[m]['std'] * 1000 for m in all_model_names]
single_std = [single_benchmark_models[m]['std'] * 1000 for m in all_model_names]
x = np.arange(len(all_model_names))
width = 0.35
bars_std1 = ax5_single.bar(x - width/2, benchmark_std, width, label='Multi-benchmark', alpha=0.8)
bars_std2 = ax5_single.bar(x + width/2, single_std, width, label='Single-benchmark', alpha=0.8)
ax5_single.set_xlabel('Model')
ax5_single.set_ylabel('Standard Deviation (ms)')
ax5_single.set_title('Standard Deviation of Inference Times')
ax5_single.set_xticks(x)
ax5_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax5_single.legend()
ax5_single.grid(axis='y', alpha=0.3)
_label_bars(ax5_single, bars_std1, fmt='{:.4f}', fontsize=7)
_label_bars(ax5_single, bars_std2, fmt='{:.4f}', fontsize=7)
_save_figure(fig5, "standard_deviation_comparison.png")

# 6. Summary statistics table
fig6, ax6_single = plt.subplots(figsize=(14, 6))
ax6_single.axis('off')
table_data = []
for model_name in all_model_names:
    table_data.append([
        model_name,
        f"{benchmark_models[model_name]['mean']*1000:.3f} Β± {benchmark_models[model_name]['std']*1000:.3f}",
        f"{benchmark_models[model_name]['min']*1000:.3f}",
        f"{benchmark_models[model_name]['max']*1000:.3f}",
        f"{benchmark_models[model_name]['accuracy']*100:.1f}%",
        f"{single_benchmark_models[model_name]['mean']*1000:.3f} Β± {single_benchmark_models[model_name]['std']*1000:.3f}",
        f"{single_benchmark_models[model_name]['min']*1000:.3f}",
        f"{single_benchmark_models[model_name]['max']*1000:.3f}",
        f"{single_benchmark_models[model_name]['accuracy']*100:.1f}%",
    ])
columns = ['Model', 'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)',
           'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']
table = ax6_single.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1.1, 1.8)
for i in range(len(all_model_names)):
    for j in range(len(columns)):
        cell = table[(i+1, j)]
        cell.set_height(0.4)
        if j < 5:
            cell.set_facecolor('#f0f0f0')  # multi-benchmark columns
        else:
            cell.set_facecolor('#e0e0f0')  # single-benchmark columns
ax6_single.set_title('Summary Statistics Comparison', fontsize=12, pad=20)
_save_figure(fig6, "summary_statistics.png")

print(f"\nAll individual visualizations saved to: {output_dir}")
|
| 418 |
+
|
| 419 |
+
# Also save as interactive HTML
html_output = Path(__file__).parent / "response_time_comparison.html"


def _pct_change(new, old):
    """Format the relative change of *new* vs *old*, e.g. '-10.9%'.

    Returns 'n/a' instead of raising ZeroDivisionError when the multi-benchmark
    baseline is 0 (possible for extracted stats that defaulted to 0).
    """
    if old == 0:
        return "n/a"
    return f"{(new - old) / old * 100:.1f}%"


with open(html_output, 'w', encoding='utf-8') as f:
    f.write(f"""<!DOCTYPE html>
<html>
<head>
    <title>Benchmark Response Time Comparison</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1 {{ text-align: center; }}
        .chart {{ max-width: 1200px; margin: 0 auto; }}
        .model-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
        .model-title {{ font-weight: bold; font-size: 1.2em; margin-bottom: 10px; }}
        table {{ width: 100%; border-collapse: collapse; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #f4f4f4; }}
    </style>
</head>
<body>
    <h1>Benchmark Response Time Comparison</h1>
    <p><strong>Multi-benchmark:</strong> {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats</p>
    <p><strong>Single-benchmark:</strong> {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats</p>
    <p><img src="response_time_comparison.png" alt="Comparison Chart" class="chart"></p>
    <h2>Detailed Statistics</h2>
""")
    for model_name in all_model_names:
        multi = benchmark_models[model_name]
        single = single_benchmark_models[model_name]
        f.write(f"""
    <div class="model-section">
        <div class="model-title">{model_name}</div>
        <table>
            <tr>
                <th>Metric</th>
                <th>Multi-benchmark</th>
                <th>Single-benchmark</th>
                <th>Change</th>
            </tr>
            <tr>
                <td>Mean (ms)</td>
                <td>{multi['mean']*1000:.4f}</td>
                <td>{single['mean']*1000:.4f}</td>
                <td>{_pct_change(single['mean'], multi['mean'])}</td>
            </tr>
            <tr>
                <td>Std (ms)</td>
                <td>{multi['std']*1000:.4f}</td>
                <td>{single['std']*1000:.4f}</td>
                <td>{_pct_change(single['std'], multi['std'])}</td>
            </tr>
            <tr>
                <td>Min (ms)</td>
                <td>{multi['min']*1000:.4f}</td>
                <td>{single['min']*1000:.4f}</td>
                <td>{_pct_change(single['min'], multi['min'])}</td>
            </tr>
            <tr>
                <td>Max (ms)</td>
                <td>{multi['max']*1000:.4f}</td>
                <td>{single['max']*1000:.4f}</td>
                <td>{_pct_change(single['max'], multi['max'])}</td>
            </tr>
            <tr>
                <td>Accuracy</td>
                <td>{multi['accuracy']*100:.1f}%</td>
                <td>{single['accuracy']*100:.1f}%</td>
                <td>{(single['accuracy'] - multi['accuracy']) * 100:.1f}pp</td>
            </tr>
        </table>
    </div>
""")
    f.write("""
</body>
</html>""")
print(f"HTML report saved to: {html_output}")
|
| 492 |
+
|
| 493 |
+
# Print summary to console
print("\n=== Summary ===")
print(f"Multi-benchmark: {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats")
print(f"Single-benchmark: {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats")
print("\nModel Comparison:")
print("-" * 80)
# One line per model: mean latency (ms) under each benchmark plus the
# relative change of the single-benchmark mean vs the multi-benchmark mean.
for name in all_model_names:
    multi_ms = 1000 * benchmark_models[name]['mean']
    single_ms = 1000 * single_benchmark_models[name]['mean']
    delta_pct = (single_ms - multi_ms) / multi_ms * 100
    print(f"{name:20s} | Multi: {multi_ms:6.3f}ms | Single: {single_ms:6.3f}ms | Change: {delta_pct:+6.1f}%")
|
A6/benchmark_results/visualizations/inference_time_distribution.png
ADDED
|
Git LFS Details
|
A6/benchmark_results/visualizations/mean_inference_times.png
ADDED
|
Git LFS Details
|
A6/benchmark_results/visualizations/percentile_comparison.png
ADDED
|
Git LFS Details
|
A6/benchmark_results/visualizations/response_time_comparison.html
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<title>Benchmark Response Time Comparison</title>
|
| 5 |
+
<style>
|
| 6 |
+
body { font-family: Arial, sans-serif; margin: 20px; }
|
| 7 |
+
h1 { text-align: center; }
|
| 8 |
+
.chart { max-width: 1200px; margin: 0 auto; }
|
| 9 |
+
.model-section { margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }
|
| 10 |
+
.model-title { font-weight: bold; font-size: 1.2em; margin-bottom: 10px; }
|
| 11 |
+
table { width: 100%; border-collapse: collapse; }
|
| 12 |
+
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
|
| 13 |
+
th { background-color: #f4f4f4; }
|
| 14 |
+
</style>
|
| 15 |
+
</head>
|
| 16 |
+
<body>
|
| 17 |
+
<h1>Benchmark Response Time Comparison</h1>
|
| 18 |
+
<p><strong>Multi-benchmark:</strong> 100 samples, 10 repeats</p>
|
| 19 |
+
<p><strong>Single-benchmark:</strong> 100 samples, 10 repeats</p>
|
| 20 |
+
<p><img src="response_time_comparison.png" alt="Comparison Chart" class="chart"></p>
|
| 21 |
+
<h2>Detailed Statistics</h2>
|
| 22 |
+
|
| 23 |
+
<div class="model-section">
|
| 24 |
+
<div class="model-title">A4 Random Forest</div>
|
| 25 |
+
<table>
|
| 26 |
+
<tr>
|
| 27 |
+
<th>Metric</th>
|
| 28 |
+
<th>Multi-benchmark</th>
|
| 29 |
+
<th>Single-benchmark</th>
|
| 30 |
+
<th>Change</th>
|
| 31 |
+
</tr>
|
| 32 |
+
<tr>
|
| 33 |
+
<td>Mean (ms)</td>
|
| 34 |
+
<td>60.7226</td>
|
| 35 |
+
<td>54.1178</td>
|
| 36 |
+
<td>-10.9%</td>
|
| 37 |
+
</tr>
|
| 38 |
+
<tr>
|
| 39 |
+
<td>Std (ms)</td>
|
| 40 |
+
<td>3.0474</td>
|
| 41 |
+
<td>8.3909</td>
|
| 42 |
+
<td>175.3%</td>
|
| 43 |
+
</tr>
|
| 44 |
+
<tr>
|
| 45 |
+
<td>Min (ms)</td>
|
| 46 |
+
<td>58.1383</td>
|
| 47 |
+
<td>41.5801</td>
|
| 48 |
+
<td>-28.5%</td>
|
| 49 |
+
</tr>
|
| 50 |
+
<tr>
|
| 51 |
+
<td>Max (ms)</td>
|
| 52 |
+
<td>68.9643</td>
|
| 53 |
+
<td>139.2800</td>
|
| 54 |
+
<td>102.0%</td>
|
| 55 |
+
</tr>
|
| 56 |
+
<tr>
|
| 57 |
+
<td>Accuracy</td>
|
| 58 |
+
<td>89.0%</td>
|
| 59 |
+
<td>89.0%</td>
|
| 60 |
+
<td>0.0pp</td>
|
| 61 |
+
</tr>
|
| 62 |
+
</table>
|
| 63 |
+
</div>
|
| 64 |
+
|
| 65 |
+
<div class="model-section">
|
| 66 |
+
<div class="model-title">A5 Ensemble</div>
|
| 67 |
+
<table>
|
| 68 |
+
<tr>
|
| 69 |
+
<th>Metric</th>
|
| 70 |
+
<th>Multi-benchmark</th>
|
| 71 |
+
<th>Single-benchmark</th>
|
| 72 |
+
<th>Change</th>
|
| 73 |
+
</tr>
|
| 74 |
+
<tr>
|
| 75 |
+
<td>Mean (ms)</td>
|
| 76 |
+
<td>87.9247</td>
|
| 77 |
+
<td>88.4395</td>
|
| 78 |
+
<td>0.6%</td>
|
| 79 |
+
</tr>
|
| 80 |
+
<tr>
|
| 81 |
+
<td>Std (ms)</td>
|
| 82 |
+
<td>19.6745</td>
|
| 83 |
+
<td>15.3584</td>
|
| 84 |
+
<td>-21.9%</td>
|
| 85 |
+
</tr>
|
| 86 |
+
<tr>
|
| 87 |
+
<td>Min (ms)</td>
|
| 88 |
+
<td>67.9033</td>
|
| 89 |
+
<td>60.6458</td>
|
| 90 |
+
<td>-10.7%</td>
|
| 91 |
+
</tr>
|
| 92 |
+
<tr>
|
| 93 |
+
<td>Max (ms)</td>
|
| 94 |
+
<td>138.6737</td>
|
| 95 |
+
<td>213.1680</td>
|
| 96 |
+
<td>53.7%</td>
|
| 97 |
+
</tr>
|
| 98 |
+
<tr>
|
| 99 |
+
<td>Accuracy</td>
|
| 100 |
+
<td>67.0%</td>
|
| 101 |
+
<td>67.0%</td>
|
| 102 |
+
<td>0.0pp</td>
|
| 103 |
+
</tr>
|
| 104 |
+
</table>
|
| 105 |
+
</div>
|
| 106 |
+
|
| 107 |
+
<div class="model-section">
|
| 108 |
+
<div class="model-title">A5b Adaboost</div>
|
| 109 |
+
<table>
|
| 110 |
+
<tr>
|
| 111 |
+
<th>Metric</th>
|
| 112 |
+
<th>Multi-benchmark</th>
|
| 113 |
+
<th>Single-benchmark</th>
|
| 114 |
+
<th>Change</th>
|
| 115 |
+
</tr>
|
| 116 |
+
<tr>
|
| 117 |
+
<td>Mean (ms)</td>
|
| 118 |
+
<td>34.6698</td>
|
| 119 |
+
<td>33.1184</td>
|
| 120 |
+
<td>-4.5%</td>
|
| 121 |
+
</tr>
|
| 122 |
+
<tr>
|
| 123 |
+
<td>Std (ms)</td>
|
| 124 |
+
<td>6.9252</td>
|
| 125 |
+
<td>3.6793</td>
|
| 126 |
+
<td>-46.9%</td>
|
| 127 |
+
</tr>
|
| 128 |
+
<tr>
|
| 129 |
+
<td>Min (ms)</td>
|
| 130 |
+
<td>30.5001</td>
|
| 131 |
+
<td>30.1910</td>
|
| 132 |
+
<td>-1.0%</td>
|
| 133 |
+
</tr>
|
| 134 |
+
<tr>
|
| 135 |
+
<td>Max (ms)</td>
|
| 136 |
+
<td>48.3568</td>
|
| 137 |
+
<td>67.5596</td>
|
| 138 |
+
<td>39.7%</td>
|
| 139 |
+
</tr>
|
| 140 |
+
<tr>
|
| 141 |
+
<td>Accuracy</td>
|
| 142 |
+
<td>52.0%</td>
|
| 143 |
+
<td>52.0%</td>
|
| 144 |
+
<td>0.0pp</td>
|
| 145 |
+
</tr>
|
| 146 |
+
</table>
|
| 147 |
+
</div>
|
| 148 |
+
|
| 149 |
+
<div class="model-section">
|
| 150 |
+
<div class="model-title">A5b Bagging Trees</div>
|
| 151 |
+
<table>
|
| 152 |
+
<tr>
|
| 153 |
+
<th>Metric</th>
|
| 154 |
+
<th>Multi-benchmark</th>
|
| 155 |
+
<th>Single-benchmark</th>
|
| 156 |
+
<th>Change</th>
|
| 157 |
+
</tr>
|
| 158 |
+
<tr>
|
| 159 |
+
<td>Mean (ms)</td>
|
| 160 |
+
<td>6.0758</td>
|
| 161 |
+
<td>3.0341</td>
|
| 162 |
+
<td>-50.1%</td>
|
| 163 |
+
</tr>
|
| 164 |
+
<tr>
|
| 165 |
+
<td>Std (ms)</td>
|
| 166 |
+
<td>1.7927</td>
|
| 167 |
+
<td>1.2043</td>
|
| 168 |
+
<td>-32.8%</td>
|
| 169 |
+
</tr>
|
| 170 |
+
<tr>
|
| 171 |
+
<td>Min (ms)</td>
|
| 172 |
+
<td>3.8333</td>
|
| 173 |
+
<td>2.4478</td>
|
| 174 |
+
<td>-36.1%</td>
|
| 175 |
+
</tr>
|
| 176 |
+
<tr>
|
| 177 |
+
<td>Max (ms)</td>
|
| 178 |
+
<td>9.7910</td>
|
| 179 |
+
<td>17.5220</td>
|
| 180 |
+
<td>79.0%</td>
|
| 181 |
+
</tr>
|
| 182 |
+
<tr>
|
| 183 |
+
<td>Accuracy</td>
|
| 184 |
+
<td>0.0%</td>
|
| 185 |
+
<td>0.0%</td>
|
| 186 |
+
<td>0.0pp</td>
|
| 187 |
+
</tr>
|
| 188 |
+
</table>
|
| 189 |
+
</div>
|
| 190 |
+
|
| 191 |
+
<div class="model-section">
|
| 192 |
+
<div class="model-title">A6 SVM</div>
|
| 193 |
+
<table>
|
| 194 |
+
<tr>
|
| 195 |
+
<th>Metric</th>
|
| 196 |
+
<th>Multi-benchmark</th>
|
| 197 |
+
<th>Single-benchmark</th>
|
| 198 |
+
<th>Change</th>
|
| 199 |
+
</tr>
|
| 200 |
+
<tr>
|
| 201 |
+
<td>Mean (ms)</td>
|
| 202 |
+
<td>9.1022</td>
|
| 203 |
+
<td>0.6455</td>
|
| 204 |
+
<td>-92.9%</td>
|
| 205 |
+
</tr>
|
| 206 |
+
<tr>
|
| 207 |
+
<td>Std (ms)</td>
|
| 208 |
+
<td>0.3233</td>
|
| 209 |
+
<td>0.0336</td>
|
| 210 |
+
<td>-89.6%</td>
|
| 211 |
+
</tr>
|
| 212 |
+
<tr>
|
| 213 |
+
<td>Min (ms)</td>
|
| 214 |
+
<td>8.6898</td>
|
| 215 |
+
<td>0.6043</td>
|
| 216 |
+
<td>-93.0%</td>
|
| 217 |
+
</tr>
|
| 218 |
+
<tr>
|
| 219 |
+
<td>Max (ms)</td>
|
| 220 |
+
<td>9.6271</td>
|
| 221 |
+
<td>1.1998</td>
|
| 222 |
+
<td>-87.5%</td>
|
| 223 |
+
</tr>
|
| 224 |
+
<tr>
|
| 225 |
+
<td>Accuracy</td>
|
| 226 |
+
<td>83.0%</td>
|
| 227 |
+
<td>83.0%</td>
|
| 228 |
+
<td>0.0pp</td>
|
| 229 |
+
</tr>
|
| 230 |
+
</table>
|
| 231 |
+
</div>
|
| 232 |
+
|
| 233 |
+
</body>
|
| 234 |
+
</html>
|
A6/benchmark_results/visualizations/response_time_comparison.png
ADDED
|
Git LFS Details
|
A6/benchmark_results/visualizations/standard_deviation_comparison.png
ADDED
|
Git LFS Details
|
A6/benchmark_results/visualizations/summary_statistics.png
ADDED
|
Git LFS Details
|
A6/benchmark_timing.md
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Standardized Timing Benchmarking Framework
|
| 2 |
+
|
| 3 |
+
A comprehensive benchmarking framework for fair and consistent comparison of classification models (A4, A5, A5b, A6).
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
This framework provides standardized metrics for model comparison:
|
| 8 |
+
|
| 9 |
+
- **Inference Time**: Mean, standard deviation, min, max, and percentiles (P50, P95, P99)
|
| 10 |
+
- **Memory Usage**: Mean, standard deviation, and peak memory consumption
|
| 11 |
+
- **Prediction Accuracy**: Correct predictions and accuracy percentage
|
| 12 |
+
- **Model Characteristics**: Model size, number of features, model type
|
| 13 |
+
- **Consistent Data Pipeline**: Uses the same data processing for all models
|
| 14 |
+
|
| 15 |
+
## Installation
|
| 16 |
+
|
| 17 |
+
No additional dependencies required. Uses existing project dependencies:
|
| 18 |
+
- `numpy`
|
| 19 |
+
- `pandas`
|
| 20 |
+
- `scikit-learn`
|
| 21 |
+
- `pickle` (standard library)
|
| 22 |
+
|
| 23 |
+
## Usage
|
| 24 |
+
|
| 25 |
+
### Basic Usage
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
python benchmark_timing.py
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
### Advanced Usage
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
# Specify number of samples and repeats
|
| 35 |
+
python benchmark_timing.py --samples 200 --repeats 20
|
| 36 |
+
|
| 37 |
+
# Save results to specific file
|
| 38 |
+
python benchmark_timing.py --output results/my_benchmark.json
|
| 39 |
+
|
| 40 |
+
# Print comparison table
|
| 41 |
+
python benchmark_timing.py --compare
|
| 42 |
+
|
| 43 |
+
# Print model recommendations
|
| 44 |
+
python benchmark_timing.py --recommend
|
| 45 |
+
|
| 46 |
+
# All options combined
|
| 47 |
+
python benchmark_timing.py -n 150 -r 15 -o results/benchmark.json -c -R
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### Command Line Arguments
|
| 51 |
+
|
| 52 |
+
| Argument | Short | Description | Default |
|
| 53 |
+
|----------|-------|-------------|---------|
|
| 54 |
+
| `--samples` | `-n` | Number of test samples | 100 |
|
| 55 |
+
| `--repeats` | `-r` | Number of repetitions per sample | 10 |
|
| 56 |
+
| `--output` | `-o` | Output file path for JSON results | Auto-generated |
|
| 57 |
+
| `--compare` | `-c` | Print comparison table | False |
|
| 58 |
+
| `--recommend` | `-R` | Print model recommendations | False |
|
| 59 |
+
|
| 60 |
+
## Output
|
| 61 |
+
|
| 62 |
+
### Console Output
|
| 63 |
+
|
| 64 |
+
The framework prints real-time progress and results:
|
| 65 |
+
|
| 66 |
+
```
|
| 67 |
+
======================================================================
|
| 68 |
+
STANDARDIZED TIMING BENCHMARKING FRAMEWORK
|
| 69 |
+
======================================================================
|
| 70 |
+
|
| 71 |
+
Configuration:
|
| 72 |
+
Number of samples: 100
|
| 73 |
+
Number of repeats per sample: 10
|
| 74 |
+
Total predictions per model: 1000
|
| 75 |
+
|
| 76 |
+
Loading data...
|
| 77 |
+
Movement features shape: (1000, 150)
|
| 78 |
+
Weak link scores shape: (1000, 20)
|
| 79 |
+
Merged dataset shape: (1000, 165)
|
| 80 |
+
Feature matrix shape: (1000, 160)
|
| 81 |
+
Number of features: 160
|
| 82 |
+
Number of classes: 14
|
| 83 |
+
|
| 84 |
+
======================================================================
|
| 85 |
+
Running Benchmarks
|
| 86 |
+
======================================================================
|
| 87 |
+
|
| 88 |
+
Benchmarking A4 Random Forest...
|
| 89 |
+
|
| 90 |
+
A4 Random Forest Results:
|
| 91 |
+
Status: SUCCESS
|
| 92 |
+
Inference Time:
|
| 93 |
+
Mean: 1.234 ms
|
| 94 |
+
Std: 0.123 ms
|
| 95 |
+
P50: 1.200 ms
|
| 96 |
+
P95: 1.500 ms
|
| 97 |
+
P99: 1.800 ms
|
| 98 |
+
Memory Usage:
|
| 99 |
+
Mean: 256.5 KB
|
| 100 |
+
Peak: 512.0 KB
|
| 101 |
+
Accuracy: 78.5% (78/100)
|
| 102 |
+
Model Size: 1250.0 KB
|
| 103 |
+
Features: 160
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### JSON Results
|
| 107 |
+
|
| 108 |
+
Results are saved to JSON format with all metrics:
|
| 109 |
+
|
| 110 |
+
```json
|
| 111 |
+
{
|
| 112 |
+
"timestamp": "2024-01-15T10:30:45.123456",
|
| 113 |
+
"num_samples": 100,
|
| 114 |
+
"num_repeats": 10,
|
| 115 |
+
"models": {
|
| 116 |
+
"A4 Random Forest": {
|
| 117 |
+
"model_name": "A4 Random Forest",
|
| 118 |
+
"model_path": "../A4/models/weaklink_classifier_rf.pkl",
|
| 119 |
+
"inference_time_mean": 0.001234,
|
| 120 |
+
"inference_time_std": 0.000123,
|
| 121 |
+
"inference_time_min": 0.001000,
|
| 122 |
+
"inference_time_max": 0.001800,
|
| 123 |
+
"inference_time_p50": 0.001200,
|
| 124 |
+
"inference_time_p95": 0.001500,
|
| 125 |
+
"inference_time_p99": 0.001800,
|
| 126 |
+
"memory_usage_mean": 262656.0,
|
| 127 |
+
"memory_usage_std": 10240.0,
|
| 128 |
+
"memory_usage_peak": 524288.0,
|
| 129 |
+
"accuracy": 0.785,
|
| 130 |
+
"predictions_correct": 78,
|
| 131 |
+
"predictions_total": 100,
|
| 132 |
+
"model_size_bytes": 1280000,
|
| 133 |
+
"num_features": 160,
|
| 134 |
+
"num_parameters": 10,
|
| 135 |
+
"model_type": "RandomForestClassifier",
|
| 136 |
+
"timing_samples": [0.0012, 0.0013, ...],
|
| 137 |
+
"memory_samples": [262144, 266240, ...],
|
| 138 |
+
"status": "SUCCESS",
|
| 139 |
+
"error_message": ""
|
| 140 |
+
}
|
| 141 |
+
}
|
| 142 |
+
}
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
## Model Comparison Table
|
| 146 |
+
|
| 147 |
+
With `--compare` flag, prints a formatted comparison:
|
| 148 |
+
|
| 149 |
+
```
|
| 150 |
+
==========================================================================
|
| 151 |
+
MODEL COMPARISON SUMMARY
|
| 152 |
+
==========================================================================
|
| 153 |
+
Model Time (ms) Std P95 Acc (%) Mem (KB) Size (KB)
|
| 154 |
+
--------------------------------------------------------------------------
|
| 155 |
+
A5b Adaboost 0.850 0.050 1.100 75.2 128.5 512.0
|
| 156 |
+
A5 Ensemble 1.100 0.080 1.350 79.8 256.3 768.0
|
| 157 |
+
A4 Random Forest 1.234 0.123 1.500 78.5 256.5 1250.0
|
| 158 |
+
A5b Bagging Trees 1.450 0.150 1.800 77.1 384.2 1024.0
|
| 159 |
+
A6 SVM 2.100 0.200 2.500 81.2 512.0 2048.0
|
| 160 |
+
==========================================================================
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
## Model Recommendations
|
| 164 |
+
|
| 165 |
+
With `--recommend` flag, provides optimal model suggestions:
|
| 166 |
+
|
| 167 |
+
```
|
| 168 |
+
======================================================================
|
| 169 |
+
MODEL RECOMMENDATIONS
|
| 170 |
+
======================================================================
|
| 171 |
+
|
| 172 |
+
Fastest Inference:
|
| 173 |
+
Model: A5b Adaboost
|
| 174 |
+
Inference Time: 0.850 ms
|
| 175 |
+
|
| 176 |
+
Highest Accuracy:
|
| 177 |
+
Model: A6 SVM
|
| 178 |
+
Accuracy: 81.2%
|
| 179 |
+
|
| 180 |
+
Lowest Memory Usage:
|
| 181 |
+
Model: A5b Adaboost
|
| 182 |
+
Memory Usage: 128.5 KB
|
| 183 |
+
|
| 184 |
+
Best Balanced Performance:
|
| 185 |
+
Model: A5 Ensemble
|
| 186 |
+
Inference Time: 1.100 ms
|
| 187 |
+
Accuracy: 79.8%
|
| 188 |
+
Memory Usage: 256.3 KB
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
## Benchmarking Metrics Explained
|
| 192 |
+
|
| 193 |
+
### Inference Time Metrics
|
| 194 |
+
|
| 195 |
+
| Metric | Description |
|
| 196 |
+
|--------|-------------|
|
| 197 |
+
| **Mean** | Average inference time across all repetitions |
|
| 198 |
+
| **Std** | Standard deviation (variability) |
|
| 199 |
+
| **Min/Max** | Fastest and slowest inference times |
|
| 200 |
+
| **P50** | Median (50th percentile) |
|
| 201 |
+
| **P95** | 95th percentile (95% of predictions are faster) |
|
| 202 |
+
| **P99** | 99th percentile (99% of predictions are faster) |
|
| 203 |
+
|
| 204 |
+
### Memory Metrics
|
| 205 |
+
|
| 206 |
+
| Metric | Description |
|
| 207 |
+
|--------|-------------|
|
| 208 |
+
| **Mean** | Average memory usage |
|
| 209 |
+
| **Std** | Standard deviation of memory usage |
|
| 210 |
+
| **Peak** | Maximum memory consumed |
|
| 211 |
+
|
| 212 |
+
### Accuracy Metrics
|
| 213 |
+
|
| 214 |
+
| Metric | Description |
|
| 215 |
+
|--------|-------------|
|
| 216 |
+
| **Accuracy** | Percentage of correct predictions |
|
| 217 |
+
| **Predictions Correct/Total** | Raw counts |
|
| 218 |
+
|
| 219 |
+
## Implementation Details
|
| 220 |
+
|
| 221 |
+
### Data Pipeline
|
| 222 |
+
|
| 223 |
+
All models use the same data loading and preprocessing pipeline:
|
| 224 |
+
1. Load movement features and weaklink scores
|
| 225 |
+
2. Create WeakestLink target column
|
| 226 |
+
3. Merge datasets
|
| 227 |
+
4. Extract features (excluding ID, WeakestLink, EstimatedScore)
|
| 228 |
+
5. Train/test split (80/20, stratified, random_state=42)
|
| 229 |
+
6. StandardScaler fitted on training data
|
| 230 |
+
|
| 231 |
+
### Feature Handling
|
| 232 |
+
|
| 233 |
+
- A4 Random Forest model was trained WITH duplicate NASM columns
|
| 234 |
+
- Other models (A5, A5b, A6) were trained WITHOUT duplicate NASM columns
|
| 235 |
+
- The framework automatically filters features based on each model's expectations
|
| 236 |
+
|
| 237 |
+
### Memory Tracking
|
| 238 |
+
|
| 239 |
+
Uses Python's `tracemalloc` module for accurate memory measurement:
|
| 240 |
+
- Tracks memory before and after each prediction
|
| 241 |
+
- Records both current and peak memory usage
|
| 242 |
+
|
| 243 |
+
### Timing Precision
|
| 244 |
+
|
| 245 |
+
Uses `time.perf_counter()` for high-resolution timing measurements.
|
| 246 |
+
|
| 247 |
+
## Extending the Framework
|
| 248 |
+
|
| 249 |
+
### Adding New Models
|
| 250 |
+
|
| 251 |
+
1. Add model path to `all_classification.py`:
|
| 252 |
+
```python
|
| 253 |
+
a7_new_model = "../A7/models/new_model.pkl"
|
| 254 |
+
```
|
| 255 |
+
|
| 256 |
+
2. Import in `benchmark_timing.py`:
|
| 257 |
+
```python
|
| 258 |
+
from all_classification import (
|
| 259 |
+
a4_rf,
|
| 260 |
+
a5_ensemnble,
|
| 261 |
+
a5b_adaboost,
|
| 262 |
+
a5b_bagging_tree,
|
| 263 |
+
a6_svm,
|
| 264 |
+
a7_new_model, # Add here
|
| 265 |
+
)
|
| 266 |
+
```
|
| 267 |
+
|
| 268 |
+
3. Add to `models_to_benchmark` list in `run_benchmark()`:
|
| 269 |
+
```python
|
| 270 |
+
models_to_benchmark = [
|
| 271 |
+
('A4 Random Forest', a4_rf),
|
| 272 |
+
('A5 Ensemble', a5_ensemnble),
|
| 273 |
+
('A5b Adaboost', a5b_adaboost),
|
| 274 |
+
('A5b Bagging Trees', a5b_bagging_tree),
|
| 275 |
+
('A6 SVM', a6_svm),
|
| 276 |
+
('A7 New Model', a7_new_model), # Add here
|
| 277 |
+
]
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
### Adding New Metrics
|
| 281 |
+
|
| 282 |
+
Extend the `ModelMetrics` dataclass:
|
| 283 |
+
|
| 284 |
+
```python
|
| 285 |
+
@dataclass
|
| 286 |
+
class ModelMetrics:
|
| 287 |
+
# ... existing fields ...
|
| 288 |
+
new_metric: float = 0.0
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
Then calculate it in `benchmark_single_model()`.
|
| 292 |
+
|
| 293 |
+
## Troubleshooting
|
| 294 |
+
|
| 295 |
+
### Model Loading Errors
|
| 296 |
+
|
| 297 |
+
If a model fails to load:
|
| 298 |
+
1. Verify the model file exists at the specified path
|
| 299 |
+
2. Check that all required dependencies are installed
|
| 300 |
+
3. Ensure custom classes (like `AdaBoostEnsemble`) are importable
|
| 301 |
+
|
| 302 |
+
### Inconsistent Results
|
| 303 |
+
|
| 304 |
+
If results vary significantly between runs:
|
| 305 |
+
1. Increase `--repeats` for better averaging
|
| 306 |
+
2. Ensure no other processes are running during benchmarking
|
| 307 |
+
3. Check system resource utilization
|
| 308 |
+
|
| 309 |
+
### Memory Issues
|
| 310 |
+
|
| 311 |
+
If running out of memory:
|
| 312 |
+
1. Reduce `--samples` count
|
| 313 |
+
2. Close other applications
|
| 314 |
+
3. Check available system memory
|
| 315 |
+
|
| 316 |
+
## License
|
| 317 |
+
|
| 318 |
+
Part of the Data Intensive Systems project.
|
| 319 |
+
|
A6/benchmark_timing.py
ADDED
|
@@ -0,0 +1,960 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Standardized Timing Benchmarking Framework for Classification Models
|
| 4 |
+
|
| 5 |
+
This framework provides fair and consistent timing benchmarks for comparing
|
| 6 |
+
classification models (A4, A5, A5b, A6) with metrics for:
|
| 7 |
+
- Inference time (mean, std, min, max, percentiles)
|
| 8 |
+
- Memory usage
|
| 9 |
+
- Prediction accuracy
|
| 10 |
+
- Model size
|
| 11 |
+
- Feature extraction time
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python benchmark_timing.py [--samples N] [--repeats M] [--output FILE]
|
| 15 |
+
|
| 16 |
+
Author: Benchmark Framework v1.0
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import pickle
|
| 22 |
+
import time
|
| 23 |
+
import tracemalloc
|
| 24 |
+
import warnings
|
| 25 |
+
import json
|
| 26 |
+
import numpy as np
|
| 27 |
+
import pandas as pd
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
from datetime import datetime
|
| 30 |
+
from sklearn.preprocessing import StandardScaler
|
| 31 |
+
from sklearn.model_selection import train_test_split
|
| 32 |
+
from typing import Dict, List, Tuple, Optional, Any
|
| 33 |
+
from dataclasses import dataclass, field, asdict
|
| 34 |
+
from collections import defaultdict
|
| 35 |
+
import statistics
|
| 36 |
+
|
| 37 |
+
# Suppress warnings for cleaner output
|
| 38 |
+
warnings.filterwarnings('ignore')
|
| 39 |
+
|
| 40 |
+
# Add project root to path
|
| 41 |
+
project_root = os.path.abspath(os.path.dirname(__file__))
|
| 42 |
+
sys.path.insert(0, project_root)
|
| 43 |
+
|
| 44 |
+
# Import model paths
|
| 45 |
+
from all_classification import (
|
| 46 |
+
a4_rf,
|
| 47 |
+
a5_ensemnble,
|
| 48 |
+
a5b_adaboost,
|
| 49 |
+
a5b_bagging_tree,
|
| 50 |
+
a6_svm
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# Import custom classes for unpickling
|
| 54 |
+
from adaboost_classes import (
|
| 55 |
+
AdaBoostEnsemble,
|
| 56 |
+
WeightedDecisionTree
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# ============================================================================
|
| 60 |
+
# Configuration
|
| 61 |
+
# ============================================================================
|
| 62 |
+
|
| 63 |
+
REPO_ROOT = os.path.abspath(os.path.join(project_root, '..'))
|
| 64 |
+
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
|
| 65 |
+
OUTPUT_DIR = os.path.join(project_root, 'benchmark_results')
|
| 66 |
+
|
| 67 |
+
# Weaklink categories (14 classes)
|
| 68 |
+
WEAKLINK_CATEGORIES = [
|
| 69 |
+
'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
|
| 70 |
+
'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
|
| 71 |
+
'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
|
| 72 |
+
'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
|
| 73 |
+
'RightKneeMovesOutward', 'RightShoulderElevation'
|
| 74 |
+
]
|
| 75 |
+
|
| 76 |
+
# Duplicate NASM columns
|
| 77 |
+
DUPLICATE_NASM_COLS = [
|
| 78 |
+
'No_1_NASM_Deviation',
|
| 79 |
+
'No_2_NASM_Deviation',
|
| 80 |
+
'No_3_NASM_Deviation',
|
| 81 |
+
'No_4_NASM_Deviation',
|
| 82 |
+
'No_5_NASM_Deviation',
|
| 83 |
+
]
|
| 84 |
+
|
| 85 |
+
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
|
| 86 |
+
EXPECTED_CLASSES = WEAKLINK_CATEGORIES.copy()
|
| 87 |
+
|
| 88 |
+
# Benchmark parameters
|
| 89 |
+
DEFAULT_NUM_SAMPLES = 100
|
| 90 |
+
DEFAULT_NUM_REPEATES = 10
|
| 91 |
+
DEFAULT_OUTPUT_FILE = None
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ============================================================================
|
| 95 |
+
# Data Classes for Results
|
| 96 |
+
# ============================================================================
|
| 97 |
+
|
| 98 |
+
@dataclass
class ModelMetrics:
    """Benchmark metrics collected for a single model.

    All timing fields are in seconds and memory fields in bytes. Every
    measured field defaults to zero/empty so that a failed benchmark can
    still be recorded (see `status` and `error_message`).
    """
    model_name: str
    model_path: str

    # Timing metrics (seconds) — distribution of per-prediction latency
    inference_time_mean: float = 0.0
    inference_time_std: float = 0.0
    inference_time_min: float = 0.0
    inference_time_max: float = 0.0
    inference_time_p50: float = 0.0  # median latency
    inference_time_p95: float = 0.0  # 95th percentile latency
    inference_time_p99: float = 0.0  # 99th percentile latency

    # Memory metrics (bytes) measured around each prediction
    memory_usage_mean: float = 0.0
    memory_usage_std: float = 0.0
    memory_usage_peak: float = 0.0

    # Prediction metrics (accuracy = predictions_correct / predictions_total)
    accuracy: float = 0.0
    predictions_correct: int = 0
    predictions_total: int = 0

    # Model characteristics
    model_size_bytes: int = 0  # size of the serialized model artifact
    num_features: int = 0
    num_parameters: int = 0  # e.g. estimator count for ensemble models
    model_type: str = ""  # class name of the underlying estimator

    # Feature extraction time (seconds)
    feature_extraction_time_mean: float = 0.0

    # Raw per-prediction samples backing the aggregate statistics above
    timing_samples: List[float] = field(default_factory=list)
    memory_samples: List[float] = field(default_factory=list)

    # Status: "SUCCESS" on a clean run; otherwise error_message holds details
    status: str = "SUCCESS"
    error_message: str = ""
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@dataclass
class BenchmarkResults:
    """Aggregated benchmark results for every model in one run."""
    timestamp: str
    num_samples: int
    num_repeats: int
    models: Dict[str, ModelMetrics] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the run and all per-model metrics into plain dicts."""
        serialized_models = {}
        for model_name, metrics in self.models.items():
            entry = asdict(metrics)
            # Materialize the raw sample sequences as plain Python lists
            # so the payload is directly JSON-serializable.
            entry['timing_samples'] = list(metrics.timing_samples)
            entry['memory_samples'] = list(metrics.memory_samples)
            serialized_models[model_name] = entry

        return {
            'timestamp': self.timestamp,
            'num_samples': self.num_samples,
            'num_repeats': self.num_repeats,
            'models': serialized_models,
        }

    def to_json(self, filepath: Optional[str] = None) -> str:
        """Render the results as JSON; also write to *filepath* if given."""
        rendered = json.dumps(self.to_dict(), indent=2, default=str)

        if filepath:
            # Make sure the destination directory exists before writing.
            target_dir = os.path.dirname(filepath) or '.'
            os.makedirs(target_dir, exist_ok=True)
            with open(filepath, 'w') as handle:
                handle.write(rendered)

        return rendered
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ============================================================================
|
| 179 |
+
# Data Loading Functions
|
| 180 |
+
# ============================================================================
|
| 181 |
+
|
| 182 |
+
def load_and_prepare_data() -> Dict[str, Any]:
    """Load and prepare data following the same pipeline as classification_baseline.py.

    Returns:
        Dictionary containing:
        - feature_columns: List of feature column names
        - scaler: StandardScaler fitted on the training split
        - X_train, X_test: Feature matrices (unscaled)
        - X_train_scaled, X_test_scaled: Scaled feature matrices
        - y_train, y_test: Target arrays
        - merged_df: Merged dataframe
    """
    # Read the two source CSVs from the shared dataset directory.
    movement_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
    weaklink_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

    print(f' Movement features shape: {movement_df.shape}')
    print(f' Weak link scores shape: {weaklink_df.shape}')

    # Target = the deviation category with the highest score on each row.
    weaklink_df['WeakestLink'] = weaklink_df[WEAKLINK_CATEGORIES].idxmax(axis=1)

    # Join the target onto the movement features by subject ID.
    targets = weaklink_df[['ID', 'WeakestLink']].copy()
    merged_df = movement_df.merge(targets, on='ID', how='inner')
    print(f' Merged dataset shape: {merged_df.shape}')

    # Every column except the bookkeeping/target columns is a feature.
    feature_columns = [col for col in merged_df.columns if col not in EXCLUDE_COLS]

    X = merged_df[feature_columns].values
    y = merged_df['WeakestLink'].values

    print(f' Feature matrix shape: {X.shape}')
    print(f' Number of features: {len(feature_columns)}')
    print(f' Number of classes: {len(np.unique(y))}')

    # Stratified 80/20 split with the fixed seed shared by all models.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale with statistics learned from the training portion only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return {
        'feature_columns': feature_columns,
        'scaler': scaler,
        'X_train': X_train,
        'X_train_scaled': X_train_scaled,
        'y_train': y_train,
        'X_test': X_test,
        'X_test_scaled': X_test_scaled,
        'y_test': y_test,
        'merged_df': merged_df,
    }
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def create_samples_from_test_data(
    data: Dict[str, Any],
    num_samples: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Pick a benchmark subset (features, labels) from the held-out test split.

    Args:
        data: Dictionary produced by load_and_prepare_data()
        num_samples: Desired number of samples; capped at the test-set size

    Returns:
        Tuple of (sample_features, true_labels), taken from the front of
        the test split so repeated runs benchmark identical rows.
    """
    features = data['X_test']
    labels = data['y_test']

    # Never request more rows than the test split actually holds.
    cutoff = min(num_samples, len(features))

    return features[:cutoff], labels[:cutoff]
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
# ============================================================================
|
| 269 |
+
# Model Loading Functions
|
| 270 |
+
# ============================================================================
|
| 271 |
+
|
| 272 |
+
def load_model(model_path: str, model_name: str) -> Tuple[Any, Optional[Any], Optional[List[str]], Any]:
    """Load a pickled model artifact and extract its model/scaler/features.

    Args:
        model_path: Path to the pickle file, relative to this script's directory
        model_name: Human-readable name used only for log messages

    Returns:
        Tuple of (model, scaler, feature_columns, artifact). All four are
        None when the file is missing or cannot be unpickled.
    """
    import re  # used below when probing pipeline feature names

    full_path = os.path.join(project_root, model_path)

    if not os.path.exists(full_path):
        print(f"  ⚠️ Model file not found: {full_path}")
        return None, None, None, None

    try:
        # NOTE(review): pickle.load can execute arbitrary code — only load
        # trusted, project-produced model files here.
        with open(full_path, 'rb') as f:
            artifact = pickle.load(f)

        # Extract model and scaler based on artifact structure
        if isinstance(artifact, dict):
            # Dict-style artifacts bundle model + scaler + feature columns.
            model = artifact.get('model')
            scaler = artifact.get('scaler')
            feature_columns = artifact.get('feature_columns')
        else:
            # Bare estimator / Pipeline artifact (e.g. the A6 SVM pipeline).
            model = artifact
            scaler = None
            feature_columns = None

            # Reuse a transformer step (has transform + n_features_in_ but
            # no predict) as the scaler, if the pipeline contains one.
            if hasattr(model, 'steps') and len(model.steps) >= 1:
                for step_name, step_obj in model.steps:
                    if hasattr(step_obj, 'transform'):
                        if hasattr(step_obj, 'n_features_in_') and not hasattr(step_obj, 'predict'):
                            scaler = step_obj
                            break

            # Try to recover real feature names from the first pipeline step.
            # Generic auto-names (x0, x1, ...) carry no information, so they
            # are deliberately ignored.
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                if hasattr(first_step, 'get_feature_names_out'):
                    try:
                        names = first_step.get_feature_names_out()
                        if not all(re.fullmatch(r'x\d+', n) for n in names):
                            feature_columns = names
                    except Exception:
                        # Feature names are optional metadata; a failure here
                        # must not abort model loading. (Was a bare `except:`,
                        # which also swallowed KeyboardInterrupt/SystemExit.)
                        pass

        print(f"  ✓ Loaded {model_name}")
        return model, scaler, feature_columns, artifact
    except Exception as e:
        print(f"  ✗ Error loading {model_name}: {e}")
        return None, None, None, None
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def get_model_info(model: Any) -> Dict[str, Any]:
    """Collect basic characteristics of a trained model for benchmarking.

    Args:
        model: The trained estimator (plain model or ensemble).

    Returns:
        Dict with 'model_type', 'num_parameters', 'num_features' and, when
        the model exposes fitted classes, 'num_classes'.
    """
    details: Dict[str, Any] = {
        'model_type': type(model).__name__,
        'num_parameters': 0,
        'num_features': 0,
    }

    # Parameter proxy: configured estimator count plus fitted estimators.
    if hasattr(model, 'n_estimators'):
        details['num_parameters'] += getattr(model, 'n_estimators', 0)
    if hasattr(model, 'estimators_'):
        details['num_parameters'] += len(getattr(model, 'estimators_', []))

    if hasattr(model, 'n_features_in_'):
        details['num_features'] = model.n_features_in_

    if hasattr(model, 'classes_'):
        details['num_classes'] = len(model.classes_)

    # Ensembles: prefer the first sub-estimator's feature count, which
    # overrides the top-level value when present.
    if hasattr(model, 'estimators_'):
        for member in getattr(model, 'estimators_', []):
            if hasattr(member, 'n_features_in_'):
                details['num_features'] = member.n_features_in_
                break

    return details
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
# ============================================================================
|
| 369 |
+
# Benchmarking Functions
|
| 370 |
+
# ============================================================================
|
| 371 |
+
|
| 372 |
+
def measure_inference_time(
    model: Any,
    scaler: Optional[Any],
    sample_features: np.ndarray,
    model_feature_columns: Optional[List[str]],
    feature_columns: List[str],
    num_repeats: int,
    single_sample_mode: bool = False
) -> Tuple[List[float], List[float], Optional[str]]:
    """Measure inference time and peak memory for a model.

    The timed region deliberately covers feature scaling (when a scaler is
    applied) plus ``model.predict``, so results reflect end-to-end
    prediction latency, not just the estimator call.

    Args:
        model: The trained model (estimator or sklearn Pipeline).
        scaler: Scaler for feature preprocessing; ignored for pipelines that
            embed their own scaler (see below).
        sample_features: Input features, shape (n_samples, n_features).
        model_feature_columns: Expected feature columns for the model, or
            None when unknown (e.g. the A6 SVM pipeline).
        feature_columns: All available feature column names, in the same
            order as the columns of ``sample_features``.
        num_repeats: Number of repetitions for averaging.
        single_sample_mode: If True, measure each sample individually (for
            single sample latency); otherwise each timing covers the batch.

    Returns:
        Tuple of (timing_samples, memory_samples, error_message): timings in
        seconds, memory as tracemalloc peak bytes; on failure both lists are
        empty and the message holds the exception text.
    """
    timing_samples = []
    memory_samples = []

    try:
        # --- Select the feature columns this model was trained on ---
        if model_feature_columns is not None:
            available_features = [f for f in model_feature_columns if f in feature_columns]
            if len(available_features) > 0:
                # Convert column names to indices for numpy array
                feature_indices = [feature_columns.index(f) for f in available_features]
                test_features = sample_features[:, feature_indices]
            else:
                test_features = sample_features
        else:
            # model_feature_columns is None - likely A6 SVM pipeline.
            # Check if we need to drop duplicate NASM columns so the column
            # count matches what the pipeline's first step expects.
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                n_expected = getattr(first_step, 'n_features_in_', None)
                if n_expected is not None:
                    # Identify indices of duplicate NASM columns
                    dup_indices = [i for i, c in enumerate(feature_columns) if c in DUPLICATE_NASM_COLS]
                    # Get all indices except duplicate NASM columns
                    valid_indices = [i for i in range(len(feature_columns)) if i not in dup_indices]
                    if len(valid_indices) == n_expected:
                        # Select only the columns that match expected features
                        test_features = sample_features[:, valid_indices]
                    else:
                        # Fallback: blindly slice to the expected width
                        test_features = sample_features[:, :n_expected]
                else:
                    test_features = sample_features
            else:
                test_features = sample_features

        # A6 SVM pipeline already scales internally; skip the external
        # scaler to avoid double scaling.
        if model_feature_columns is None and hasattr(model, 'steps'):
            scaler_to_use = None
        else:
            scaler_to_use = scaler

        # Determine how many timed predict() calls to make
        if single_sample_mode:
            # One timed call per (sample, repeat) pair
            num_predictions = num_repeats * len(test_features)
        else:
            # One timed call per repeat, each covering the whole batch
            num_predictions = num_repeats

        for i in range(num_predictions):
            # tracemalloc captures Python-level allocations made while
            # predicting; started/stopped per call so peaks don't accumulate.
            tracemalloc.start()
            start_time = time.perf_counter()

            # Make prediction (scaling included in the timed region)
            if single_sample_mode:
                # Cycle through the samples, one row per timed call
                single_sample = test_features[i % len(test_features)].reshape(1, -1)
                if scaler_to_use is not None:
                    features = scaler_to_use.transform(single_sample)
                else:
                    features = single_sample
            else:
                # Batch prediction: use all samples
                if scaler_to_use is not None:
                    features = scaler_to_use.transform(test_features)
                else:
                    features = test_features

            prediction = model.predict(features)

            end_time = time.perf_counter()
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()

            # Record measurements
            timing_samples.append(end_time - start_time)
            memory_samples.append(peak)

        return timing_samples, memory_samples, None

    except Exception as e:
        # Caller marks the model as INFERENCE_ERROR with this message
        return [], [], str(e)
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def calculate_percentiles(values: List[float]) -> Dict[str, float]:
    """Calculate the 50th/95th/99th percentiles of a list of values.

    Uses linear interpolation between closest ranks (the same method as
    numpy.percentile's default), which is well defined for any non-empty
    sample. The previous index-based approach (``sorted[int(n * q)]``) was
    biased upward — e.g. the p50 of [1, 2] came out as 2 instead of 1.5 —
    and collapsed p95/p99 onto the same element for small samples.

    Args:
        values: List of numeric values (may be empty).

    Returns:
        Dictionary with keys 'p50', 'p95', 'p99'; all 0.0 for empty input.
    """
    if not values:
        return {
            'p50': 0.0,
            'p95': 0.0,
            'p99': 0.0
        }

    ordered = sorted(values)
    n = len(ordered)

    def percentile(q: float) -> float:
        # Fractional rank in [0, n-1]; interpolate between the neighbours.
        rank = (n - 1) * q
        lo = int(rank)
        hi = min(lo + 1, n - 1)
        frac = rank - lo
        return ordered[lo] + (ordered[hi] - ordered[lo]) * frac

    return {
        'p50': percentile(0.50),
        'p95': percentile(0.95),
        'p99': percentile(0.99)
    }
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
def benchmark_single_model(
    model_name: str,
    model_path: str,
    sample_features: np.ndarray,
    true_labels: np.ndarray,
    feature_columns: List[str],
    num_repeats: int,
    single_sample_mode: bool = False
) -> ModelMetrics:
    """Benchmark a single model: load it, time inference, and score accuracy.

    Args:
        model_name: Human-readable model name (used in logs and results).
        model_path: Path to the pickled model, relative to the project root.
        sample_features: Input features for benchmarking, shape (n, d).
        true_labels: Ground-truth labels aligned with ``sample_features``.
        feature_columns: All available feature column names, in column order.
        num_repeats: Number of timing repetitions.
        single_sample_mode: If True, measure each sample individually (for
            single sample latency) instead of batch latency.

    Returns:
        ModelMetrics with status "SUCCESS", "LOAD_ERROR", or
        "INFERENCE_ERROR"; timing/memory/accuracy fields are only populated
        on success.
    """
    metrics = ModelMetrics(model_name=model_name, model_path=model_path)

    print(f"\n  Benchmarking {model_name}...")

    # Load model (returns all-None on failure)
    model, scaler, model_feature_columns, artifact = load_model(model_path, model_name)

    if model is None:
        metrics.status = "LOAD_ERROR"
        metrics.error_message = "Failed to load model"
        return metrics

    # Record basic model characteristics
    model_info = get_model_info(model)
    metrics.model_type = model_info.get('model_type', type(model).__name__)
    metrics.num_features = model_info.get('num_features', 0)

    # On-disk artifact size; non-fatal when the file cannot be stat'ed.
    try:
        model_size = os.path.getsize(os.path.join(project_root, model_path))
        metrics.model_size_bytes = model_size
    except OSError:  # was a bare `except:`; narrowed to what getsize raises
        metrics.model_size_bytes = 0

    # Run inference benchmarks
    timing_samples, memory_samples, error = measure_inference_time(
        model, scaler, sample_features, model_feature_columns,
        feature_columns, num_repeats, single_sample_mode=single_sample_mode
    )

    if error:
        metrics.status = "INFERENCE_ERROR"
        metrics.error_message = error
        return metrics

    # Store raw samples for downstream analysis and plotting
    metrics.timing_samples = timing_samples
    metrics.memory_samples = memory_samples

    # Timing statistics (seconds)
    if timing_samples:
        metrics.inference_time_mean = statistics.mean(timing_samples)
        metrics.inference_time_std = statistics.stdev(timing_samples) if len(timing_samples) > 1 else 0.0
        metrics.inference_time_min = min(timing_samples)
        metrics.inference_time_max = max(timing_samples)

        percentiles = calculate_percentiles(timing_samples)
        metrics.inference_time_p50 = percentiles['p50']
        metrics.inference_time_p95 = percentiles['p95']
        metrics.inference_time_p99 = percentiles['p99']

    # Memory statistics (tracemalloc peak bytes)
    if memory_samples:
        metrics.memory_usage_mean = statistics.mean(memory_samples)
        metrics.memory_usage_std = statistics.stdev(memory_samples) if len(memory_samples) > 1 else 0.0
        metrics.memory_usage_peak = max(memory_samples)

    # Accuracy on the same samples. The feature selection below mirrors
    # measure_inference_time — keep the two in sync.
    try:
        if model_feature_columns is not None:
            available_features = [f for f in model_feature_columns if f in feature_columns]
            if len(available_features) > 0:
                # Convert column names to indices for numpy array
                feature_indices = [feature_columns.index(f) for f in available_features]
                test_features = sample_features[:, feature_indices]
            else:
                test_features = sample_features
        else:
            # model_feature_columns is None - likely A6 SVM pipeline.
            # Drop duplicate NASM columns so the width matches the pipeline.
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                n_expected = getattr(first_step, 'n_features_in_', None)
                if n_expected is not None:
                    dup_indices = [i for i, c in enumerate(feature_columns) if c in DUPLICATE_NASM_COLS]
                    valid_indices = [i for i in range(len(feature_columns)) if i not in dup_indices]
                    if len(valid_indices) == n_expected:
                        test_features = sample_features[:, valid_indices]
                    else:
                        # Fallback: slice to expected number of features
                        test_features = sample_features[:, :n_expected]
                else:
                    test_features = sample_features
            else:
                test_features = sample_features

        # Pipelines scale internally; otherwise apply the external scaler
        if model_feature_columns is None and hasattr(model, 'steps'):
            scaler_to_use = None
        else:
            scaler_to_use = scaler

        if scaler_to_use is not None:
            features = scaler_to_use.transform(test_features)
        else:
            features = test_features

        predictions = model.predict(features)

        # Calculate accuracy
        correct = np.sum(predictions == true_labels)
        metrics.predictions_correct = int(correct)
        metrics.predictions_total = len(true_labels)
        metrics.accuracy = correct / len(true_labels)

    except Exception as e:
        # Accuracy is best-effort; the timing results above remain valid
        print(f"  ⚠️  Accuracy calculation failed: {e}")

    metrics.status = "SUCCESS"
    return metrics
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
def run_benchmark(
    num_samples: int = DEFAULT_NUM_SAMPLES,
    num_repeats: int = DEFAULT_NUM_REPEATES,
    output_file: Optional[str] = None,
    single_sample_mode: bool = False
) -> BenchmarkResults:
    """Run complete benchmark on all models.

    Loads the shared dataset once, benchmarks every registered model
    (A4/A5/A5b/A6) on identical samples, prints per-model summaries, and
    writes the aggregated results to a JSON file.

    Args:
        num_samples: Number of samples to benchmark.
        num_repeats: Number of repetitions per sample.
        output_file: Optional output file path for results; when None a
            timestamped file under OUTPUT_DIR is used.
        single_sample_mode: If True, measure each sample individually (for
            single sample latency).

    Returns:
        BenchmarkResults object with all results.
    """
    print("=" * 70)
    print("STANDARDIZED TIMING BENCHMARKING FRAMEWORK")
    print("=" * 70)
    print(f"\nConfiguration:")
    print(f"  Number of samples: {num_samples}")
    print(f"  Number of repeats per sample: {num_repeats}")
    print(f"  Total predictions per model: {num_samples * num_repeats}")
    print()

    # Load data once; every model is benchmarked on the same samples
    print("Loading data...")
    data = load_and_prepare_data()
    print()

    # Create samples
    sample_features, true_labels = create_samples_from_test_data(data, num_samples)
    print(f"Created {num_samples} test samples for benchmarking")
    print()

    # Registered models: (display name, pickle path) pairs; the paths are
    # module-level constants imported from all_classification.
    models_to_benchmark = [
        ('A4 Random Forest', a4_rf),
        ('A5 Ensemble', a5_ensemnble),
        ('A5b Adaboost', a5b_adaboost),
        ('A5b Bagging Trees', a5b_bagging_tree),
        ('A6 SVM', a6_svm),
    ]

    # Initialize results container
    results = BenchmarkResults(
        timestamp=datetime.now().isoformat(),
        num_samples=num_samples,
        num_repeats=num_repeats
    )

    # Benchmark each model
    print("=" * 70)
    print("Running Benchmarks")
    print("=" * 70)

    for model_name, model_path in models_to_benchmark:
        metrics = benchmark_single_model(
            model_name=model_name,
            model_path=model_path,
            sample_features=sample_features,
            true_labels=true_labels,
            feature_columns=data['feature_columns'],
            num_repeats=num_repeats,
            single_sample_mode=single_sample_mode
        )
        results.models[model_name] = metrics

        # Print summary for this model
        print(f"\n  {model_name} Results:")
        print(f"    Status: {metrics.status}")

        if metrics.status == "SUCCESS":
            print(f"    Inference Time:")
            print(f"      Mean: {metrics.inference_time_mean*1000:.3f} ms")
            print(f"      Std: {metrics.inference_time_std*1000:.3f} ms")
            print(f"      P50: {metrics.inference_time_p50*1000:.3f} ms")
            print(f"      P95: {metrics.inference_time_p95*1000:.3f} ms")
            print(f"      P99: {metrics.inference_time_p99*1000:.3f} ms")
            print(f"    Memory Usage:")
            print(f"      Mean: {metrics.memory_usage_mean/1024:.1f} KB")
            print(f"      Peak: {metrics.memory_usage_peak/1024:.1f} KB")
            print(f"    Accuracy: {metrics.accuracy*100:.1f}% ({metrics.predictions_correct}/{metrics.predictions_total})")
            print(f"    Model Size: {metrics.model_size_bytes/1024:.1f} KB")
            print(f"    Features: {metrics.num_features}")
        else:
            print(f"    Error: {metrics.error_message}")
        print()

    # Save results (default: timestamped JSON under OUTPUT_DIR)
    if output_file is None:
        output_file = os.path.join(OUTPUT_DIR, f"benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")

    # NOTE: to_json writes the file as a side effect; the returned string
    # is currently unused.
    json_output = results.to_json(output_file)
    print(f"Results saved to: {output_file}")

    return results
|
| 743 |
+
|
| 744 |
+
|
| 745 |
+
def run_single_sample_benchmark(
    num_samples: int = DEFAULT_NUM_SAMPLES,
    num_repeats: int = DEFAULT_NUM_REPEATES,
    output_file: Optional[str] = None
) -> BenchmarkResults:
    """Run the full benchmark, timing one prediction at a time.

    Unlike the default batch mode, every timed prediction is issued for a
    single sample, giving a more realistic view of per-request latency.

    Args:
        num_samples: Number of samples to benchmark.
        num_repeats: Number of repetitions per sample.
        output_file: Optional output file path for results.

    Returns:
        BenchmarkResults object with all results.
    """
    config = {
        'num_samples': num_samples,
        'num_repeats': num_repeats,
        'output_file': output_file,
        'single_sample_mode': True,
    }
    return run_benchmark(**config)
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
# ============================================================================
|
| 772 |
+
# Comparison and Analysis Functions
|
| 773 |
+
# ============================================================================
|
| 774 |
+
|
| 775 |
+
def print_comparison_table(results: "BenchmarkResults"):
    """Print a formatted comparison table of all models."""
    rule = "=" * 90
    print("\n" + rule)
    print("MODEL COMPARISON SUMMARY")
    print(rule)

    # Column header
    print(f"{'Model':<20} {'Time (ms)':<15} {'Std':<10} {'P95':<10} {'Acc (%)':<10} {'Mem (KB)':<12} {'Size (KB)':<12}")
    print("-" * 90)

    # Fastest model first; failed models sink to the bottom.
    def sort_key(entry):
        m = entry[1]
        return m.inference_time_mean if m.status == "SUCCESS" else float('inf')

    for name, m in sorted(results.models.items(), key=sort_key):
        if m.status == "SUCCESS":
            print(f"{name:<20} {m.inference_time_mean * 1000:<15.3f} "
                  f"{m.inference_time_std * 1000:<10.3f} "
                  f"{m.inference_time_p95 * 1000:<10.3f} "
                  f"{m.accuracy * 100:<10.1f} "
                  f"{m.memory_usage_mean / 1024:<12.1f} "
                  f"{m.model_size_bytes / 1024:<12.1f}")
        else:
            print(f"{name:<20} {'ERROR':<15} {'-':<10} {'-':<10} {'-':<10} {'-':<12} {'-':<12}")

    print(rule)
|
| 805 |
+
|
| 806 |
+
|
| 807 |
+
def find_optimal_model(results: "BenchmarkResults", priority: str = "speed"):
    """Pick the best successfully-benchmarked model for a given priority.

    Args:
        results: BenchmarkResults object.
        priority: One of "speed", "accuracy", "memory", or "balanced"; any
            other value falls back to fastest inference.

    Returns:
        (model_name, metrics) tuple, or (None, None) when no model succeeded.
    """
    candidates = {
        name: m for name, m in results.models.items()
        if m.status == "SUCCESS"
    }

    if not candidates:
        return None, None

    if priority == "accuracy":
        # Maximum accuracy
        return max(candidates.items(), key=lambda item: item[1].accuracy)

    if priority == "memory":
        # Minimum memory usage
        return min(candidates.items(), key=lambda item: item[1].memory_usage_mean)

    if priority == "balanced":
        # Weighted mix of latency, error rate, and (scaled-down) memory.
        def score(item):
            m = item[1]
            return (0.5 * m.inference_time_mean
                    + 0.3 * (1 - m.accuracy)
                    + 0.2 * (m.memory_usage_mean / 1000000))
        return min(candidates.items(), key=score)

    # "speed" and any unrecognized priority: fastest mean inference time.
    return min(candidates.items(), key=lambda item: item[1].inference_time_mean)
|
| 851 |
+
|
| 852 |
+
|
| 853 |
+
def print_recommendations(results: "BenchmarkResults"):
    """Print model recommendations based on different criteria."""
    banner = "=" * 70
    print("\n" + banner)
    print("MODEL RECOMMENDATIONS")
    print(banner)

    # One recommendation per optimization priority
    for description, priority in (
        ("Fastest Inference", "speed"),
        ("Highest Accuracy", "accuracy"),
        ("Lowest Memory Usage", "memory"),
        ("Best Balanced Performance", "balanced"),
    ):
        model_name, metrics = find_optimal_model(results, priority)
        print(f"\n{description}:")
        if not model_name:
            print("  No valid models found")
            continue
        print(f"  Model: {model_name}")
        if priority == "speed":
            print(f"  Inference Time: {metrics.inference_time_mean * 1000:.3f} ms")
        elif priority == "accuracy":
            print(f"  Accuracy: {metrics.accuracy * 100:.1f}%")
        elif priority == "memory":
            print(f"  Memory Usage: {metrics.memory_usage_mean / 1024:.1f} KB")
        elif priority == "balanced":
            print(f"  Inference Time: {metrics.inference_time_mean * 1000:.3f} ms")
            print(f"  Accuracy: {metrics.accuracy * 100:.1f}%")
            print(f"  Memory Usage: {metrics.memory_usage_mean / 1024:.1f} KB")
|
| 884 |
+
|
| 885 |
+
|
| 886 |
+
# ============================================================================
|
| 887 |
+
# Main Entry Point
|
| 888 |
+
# ============================================================================
|
| 889 |
+
|
| 890 |
+
def main():
    """Main entry point: parse CLI flags and run the benchmark suite.

    Flags:
        --samples/-n, --repeats/-r, --output/-o: benchmark configuration.
        --compare/-c: print the comparison table after benchmarking.
        --recommend/-R: print per-criterion model recommendations.
        --single-sample/-s: measure per-sample latency instead of batch.

    Returns:
        The BenchmarkResults object, for programmatic use.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Standardized Timing Benchmarking Framework for Classification Models'
    )
    parser.add_argument(
        '--samples', '-n',
        type=int,
        default=DEFAULT_NUM_SAMPLES,
        help=f'Number of samples to benchmark (default: {DEFAULT_NUM_SAMPLES})'
    )
    parser.add_argument(
        '--repeats', '-r',
        type=int,
        default=DEFAULT_NUM_REPEATES,
        help=f'Number of repeats per sample (default: {DEFAULT_NUM_REPEATES})'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        default=DEFAULT_OUTPUT_FILE,
        help='Output file for results (default: benchmark_results/timestamp.json)'
    )
    parser.add_argument(
        '--compare', '-c',
        action='store_true',
        help='Print comparison table after benchmarking'
    )
    parser.add_argument(
        '--recommend', '-R',
        action='store_true',
        help='Print model recommendations after benchmarking'
    )
    parser.add_argument(
        '--single-sample', '-s',
        action='store_true',
        help='Measure single sample prediction latency (default: batch mode)'
    )

    args = parser.parse_args()

    # Run benchmark in the requested latency mode
    if args.single_sample:
        results = run_single_sample_benchmark(
            num_samples=args.samples,
            num_repeats=args.repeats,
            output_file=args.output
        )
    else:
        results = run_benchmark(
            num_samples=args.samples,
            num_repeats=args.repeats,
            output_file=args.output
        )

    # Print comparison table if requested
    if args.compare:
        print_comparison_table(results)

    # Print recommendations if requested
    if args.recommend:
        print_recommendations(results)

    # Return results for programmatic use
    return results
|
| 957 |
+
|
| 958 |
+
|
| 959 |
+
if __name__ == "__main__":
    # CLI entry point: keep the returned results bound so they remain
    # available in interactive sessions (e.g. `python -i benchmark_timing.py`).
    results = main()
|
A6/check_svm_model.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Debug script: inspect the pickled A6 SVM artifact and print its structure
# (pipeline steps and any feature-name metadata) so the benchmarking code
# knows what layout to expect.
import pickle
import os  # NOTE(review): imported but unused here

# Check A6 SVM model
a6_path = './models/champion_svm.pkl'
# NOTE: pickle.load is only safe because this file is produced by our own
# training script.
with open(a6_path, 'rb') as f:
    artifact = pickle.load(f)
#print(artifact)
#print(artifact.get('feature_columns'))
print('A6 SVM Model Structure:')
print(f'  Type: {type(artifact)}')
print(f'  Class name: {type(artifact).__name__}')
# sklearn Pipeline layout: list each step and its feature-name metadata
if hasattr(artifact, 'steps'):
    print(f'  Steps: {[step[0] for step in artifact.steps]}')
    for step_name, step in artifact.steps:
        print(f'    {step_name}: {type(step).__name__}')
        if hasattr(step, 'feature_names_in_'):
            print(f'      feature_names_in_: {step.feature_names_in_}')
        if hasattr(step, 'get_feature_names_out'):
            try:
                fnames = step.get_feature_names_out()
                print(f'      get_feature_names_out(): {fnames}')
            except Exception as e:
                # Some transformers raise when not fitted with names
                print(f'      get_feature_names_out() error: {e}')
# dict-style artifact (A4/A5/A5b layout): show keys and feature columns
if isinstance(artifact, dict):
    print(f'  Keys: {artifact.keys()}')
    if 'feature_columns' in artifact:
        print(f'  feature_columns: {artifact["feature_columns"]}')
|
A6/test_classification_loading.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to load and execute all classification models with one sample.
|
| 4 |
+
Tests models from A4, A5, A5b, and A6.
|
| 5 |
+
|
| 6 |
+
Data loading adapted from classification_baseline.py to use the same
|
| 7 |
+
data processing pipeline for consistent feature extraction.
|
| 8 |
+
|
| 9 |
+
NOTE: A4 Random Forest model was trained WITH the 5 duplicate NASM columns
|
| 10 |
+
(No_1_NASM_Deviation through No_5_NASM_Deviation), while other models (A5, A5b, A6)
|
| 11 |
+
were trained WITHOUT them. This script loads data WITH the duplicate columns
|
| 12 |
+
to support the A4 model, and filters them out for other models as needed.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
import pickle
|
| 18 |
+
import warnings
|
| 19 |
+
import numpy as np
|
| 20 |
+
import pandas as pd
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from sklearn.preprocessing import StandardScaler
|
| 23 |
+
from sklearn.model_selection import train_test_split
|
| 24 |
+
|
| 25 |
+
# Suppress warnings for cleaner output
|
| 26 |
+
warnings.filterwarnings('ignore')
|
| 27 |
+
|
| 28 |
+
# Add project root to path so sibling modules (all_classification,
# adaboost_classes) resolve when this script runs from anywhere.
project_root = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, project_root)

# Import model paths from all_classification.py
# (FIX: removed a redundant second sys.path.insert of the same path)
from all_classification import (
    a4_rf,
    a5_ensemnble,
    a5b_adaboost,
    a5b_bagging_tree,
    a6_svm
)

# Import custom classes from A5b classification_adaboost.py.
# These names are not referenced directly below, but they MUST be importable
# for pickle to reconstruct the A5b AdaBoost model.
#sys.path.insert(0, os.path.join(project_root, '..', 'A5b'))
from adaboost_classes import (
    AdaBoostEnsemble,
    WeightedDecisionTree
)

# Data paths
REPO_ROOT = os.path.abspath(os.path.join(project_root, '..'))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
|
| 53 |
+
|
| 54 |
+
# Weaklink categories (14 classes) — the classification targets.
WEAKLINK_CATEGORIES = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation'
]

# Duplicate NASM columns to remove (as in classification_baseline.py)
# NOTE: A4 Random Forest model was trained WITH these 5 duplicate columns,
# so they must be kept in the data for A4 to work correctly
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]

# Columns to exclude when extracting features
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']

# Expected classification classes (14 weaklink categories).
# FIX: previously this was a second literal copy of the same 14 names;
# deriving it from WEAKLINK_CATEGORIES keeps the two lists from drifting.
EXPECTED_CLASSES = list(WEAKLINK_CATEGORIES)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def load_and_prepare_data():
    """Load and prepare the dataset exactly as classification_baseline.py does.

    Returns a dict holding the fitted scaler, the stratified train/test
    split (raw and scaled), the feature column names, and the merged frame.

    NOTE: the 5 duplicate NASM columns are deliberately KEPT here because
    the A4 Random Forest model was trained with them; the other models
    (A5, A5b, A6) filter them out via their saved feature_columns.
    """
    # Read the two raw CSV files.
    features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
    weaklink_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

    print('Movement features shape:', features_df.shape)
    print('Weak link scores shape:', weaklink_df.shape)

    # Duplicate NASM columns are intentionally NOT dropped (A4 compatibility).
    print('NOTE: Keeping duplicate NASM columns for A4 Random Forest model compatibility')

    # The target is the category with the highest weak-link score per row.
    weaklink_df['WeakestLink'] = weaklink_df[WEAKLINK_CATEGORIES].idxmax(axis=1)
    print('Weakest Link class distribution:')
    print(weaklink_df['WeakestLink'].value_counts())

    # Join features with the target on the sample ID.
    combined = features_df.merge(
        weaklink_df[['ID', 'WeakestLink']].copy(), on='ID', how='inner'
    )
    print('Merged dataset shape:', combined.shape)

    # Every column except the ID/target/score columns is a feature, which
    # keeps the 5 duplicate NASM columns in the matrix for A4.
    feature_columns = [c for c in combined.columns if c not in EXCLUDE_COLS]

    X = combined[feature_columns].values
    y = combined['WeakestLink'].values

    print(f'Feature matrix shape : {X.shape}')
    print(f'Number of features : {len(feature_columns)}')
    print(f'Number of classes : {len(np.unique(y))}')

    # Same stratified 80/20 split as the baseline.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Standardize using statistics fitted on the training portion only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return {
        'feature_columns': feature_columns,
        'scaler': scaler,
        'X_train': X_train,
        'X_train_scaled': X_train_scaled,
        'y_train': y_train,
        'X_test': X_test,
        'X_test_scaled': X_test_scaled,
        'y_test': y_test,
        'merged_df': combined,
    }
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def load_model(model_path, model_name):
    """Load a pickled model artifact and normalize it to a 4-tuple.

    Supports two artifact layouts:
      * a dict with 'model' / 'scaler' / 'feature_columns' keys (A4, A5, A5b)
      * a bare sklearn Pipeline (A6 SVM), from which the scaler and feature
        names are extracted heuristically.

    Returns:
        (model, scaler, feature_columns, artifact), each element None when
        the file is missing or unpickling fails.
    """
    import re  # used to detect generic x0, x1, ... feature names below

    full_path = os.path.join(project_root, model_path)

    if not os.path.exists(full_path):
        print(f" β οΈ Model file not found: {full_path}")
        return None, None, None, None

    try:
        with open(full_path, 'rb') as f:
            artifact = pickle.load(f)

        # Extract model and scaler based on artifact structure
        if isinstance(artifact, dict):
            model = artifact.get('model')
            scaler = artifact.get('scaler')
            feature_columns = artifact.get('feature_columns')
        else:
            # A6 SVM is a Pipeline object
            model = artifact
            # FIX: initialize scaler up front — previously it was only
            # assigned inside the `steps` branch, so a non-dict artifact
            # without `steps` hit a NameError (silently swallowed by the
            # outer except, returning all-None).
            scaler = None
            if hasattr(model, 'steps') and len(model.steps) >= 1:
                # Find the scaler step: transforms but does not predict.
                for step_name, step_obj in model.steps:
                    if hasattr(step_obj, 'transform'):
                        if hasattr(step_obj, 'n_features_in_') and not hasattr(step_obj, 'predict'):
                            scaler = step_obj
                            break
                # If no scaler found, fall back to the first step.
                if scaler is None and len(model.steps) > 0:
                    first_step = model.steps[0][1]
                    if hasattr(first_step, 'transform') and hasattr(first_step, 'n_features_in_'):
                        scaler = first_step
            # For a pipeline, try to recover real feature names from the
            # first step (should be the scaler).
            feature_columns = None
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                if hasattr(first_step, 'get_feature_names_out'):
                    try:
                        names = first_step.get_feature_names_out()
                        # Only use feature names if they are real column names,
                        # not generic placeholder names like x0, x1, ...
                        if not all(re.fullmatch(r'x\d+', n) for n in names):
                            feature_columns = names
                        # else: leave feature_columns = None; handled by caller
                    except Exception:
                        # FIX: was a bare `except:` that also swallowed
                        # KeyboardInterrupt / SystemExit.
                        pass

        print(f" β Loaded {model_name}")
        return model, scaler, feature_columns, artifact
    except Exception as e:
        print(f" β Error loading {model_name}: {e}")
        return None, None, None, None
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def predict_with_model(model, scaler, sample_features, model_name):
    """Run one prediction through ``model``.

    The input is copied first and, when a standalone ``scaler`` is given,
    transformed with it before prediction. Returns a tuple
    ``(prediction, prediction_proba, error)`` where ``error`` is None on
    success and the first two entries are None on failure.
    (``model_name`` is accepted for interface symmetry but unused here.)
    """
    try:
        inputs = sample_features.copy()

        # Apply the standalone scaler when provided (pipelines scale
        # internally, so their scaler is passed in as None).
        inputs = inputs if scaler is None else scaler.transform(inputs)

        predicted = model.predict(inputs)

        # Class probabilities are optional — not every estimator has them.
        probabilities = (
            model.predict_proba(inputs) if hasattr(model, 'predict_proba') else None
        )

        return predicted, probabilities, None
    except Exception as exc:
        # Report the failure as a string instead of raising so the caller
        # can continue testing the remaining models.
        return None, None, str(exc)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def create_sample_from_training_data(training_data, feature_columns, scaler):
    """Build a single-row test sample from the prepared training data.

    Returns ``(sample_df, sample_scaled)``. When ``scaler`` is None the
    same unscaled frame is returned in both positions.
    """
    # Take the very first training row as a 2-D (1, n_features) slice.
    first_row = training_data['X_train'][0:1].copy()
    frame = pd.DataFrame(first_row, columns=feature_columns)

    if scaler is None:
        return frame, frame
    return frame, scaler.transform(frame)
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def filter_features_for_model(sample_df, model_feature_columns):
    """Restrict ``sample_df`` to the columns the model was trained on.

    Falls back to every available column (with a warning) when none of the
    model's expected features are present in the sample.
    """
    selected = [name for name in model_feature_columns if name in sample_df.columns]

    if not selected:
        print(f" β οΈ No matching features found, using all available")
        selected = sample_df.columns.tolist()

    return sample_df[selected]
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def main():
    """Main function to test all models.

    Loads the shared dataset, builds one sample row, then for each saved
    model (A4 RF, A5 ensemble, A5b AdaBoost/bagging, A6 SVM): loads it,
    aligns the sample's columns with what the model expects, runs a single
    prediction, and finally prints a per-model summary.
    """
    print("=" * 60)
    print("Testing All Classification Models with One Sample")
    print("=" * 60)
    print()

    # Load and prepare data using the same pipeline as classification_baseline.py
    # NOTE: Data is loaded WITH the 5 duplicate NASM columns for A4 compatibility
    print("Loading data...")
    data = load_and_prepare_data()
    print()

    # Create sample from training data (first training row, raw + scaled).
    sample_features, sample_features_scaled = create_sample_from_training_data(
        data, data['feature_columns'], data['scaler']
    )
    print(f"Sample data shape: {sample_features.shape}")
    print(f"Number of features (including duplicates): {len(data['feature_columns'])}")
    print()

    # Define models to test: (display name, pickle path) pairs.
    models_to_test = [
        ('A4 Random Forest', a4_rf),
        ('A5 Ensemble', a5_ensemnble),
        ('A5b Adaboost', a5b_adaboost),
        ('A5b Bagging Trees', a5b_bagging_tree),
        ('A6 SVM', a6_svm),
    ]

    # Each entry: (name, status, prediction, probabilities, error message).
    results = []

    for model_name, model_path in models_to_test:
        print(f"Testing {model_name}...")

        # Load model
        model, scaler, model_feature_columns, artifact = load_model(model_path, model_name)

        if model is None:
            print(f" Skipping {model_name} due to load error")
            results.append((model_name, 'LOAD_ERROR', None, None, None))
            print()
            continue

        # Determine feature columns to use
        if model_feature_columns is not None:
            # Filter sample data to only include features the model expects
            test_features = filter_features_for_model(sample_features, model_feature_columns)
            print(f" Model expects {len(model_feature_columns)} features, using {len(test_features.columns)} available")
        elif hasattr(model, 'steps'):
            # Pipeline with generic/unknown feature names (e.g. A6 SVM trained without
            # the 5 duplicate NASM columns). Drop those duplicate columns so the number
            # of features matches what the pipeline's scaler expects.
            first_step = model.steps[0][1]
            n_expected = getattr(first_step, 'n_features_in_', None)
            cols_without_dupes = [c for c in sample_features.columns if c not in DUPLICATE_NASM_COLS]
            if n_expected is not None and len(cols_without_dupes) == n_expected:
                test_features = sample_features[cols_without_dupes]
                print(f" Pipeline expects {n_expected} features β dropped duplicate NASM cols, using {len(test_features.columns)} features")
            else:
                # Fallback: just take the first n_expected columns
                test_features = sample_features.iloc[:, :n_expected] if n_expected else sample_features
                print(f" Pipeline expects {n_expected} features, sliced sample to {len(test_features.columns)} features")
        else:
            test_features = sample_features
            print(f" Using all {len(sample_features.columns)} available features")

        # Make prediction
        # For A6 SVM pipeline, don't pass the scaler separately since it's already in the pipeline
        # For other models, pass the scaler if available
        if model_feature_columns is None and hasattr(model, 'steps'):
            # This is likely the A6 SVM pipeline - don't apply scaler separately
            scaler_to_use = None
        else:
            scaler_to_use = scaler

        prediction, prediction_proba, error = predict_with_model(
            model, scaler_to_use, test_features, model_name
        )

        if error:
            print(f" β Prediction error: {error}")
            results.append((model_name, 'PREDICTION_ERROR', None, None, error))
            print()
            continue

        # Display results
        print(f" β Prediction: {prediction[0]}")

        if prediction_proba is not None:
            print(f" β Prediction probabilities shape: {prediction_proba.shape}")
            # NOTE(review): mapping probability columns through EXPECTED_CLASSES
            # assumes every model's classes_ are in the same (alphabetical)
            # order as EXPECTED_CLASSES — confirm per model.
            top_classes_idx = np.argsort(prediction_proba[0])[-3:][::-1]
            top_classes = [EXPECTED_CLASSES[i] for i in top_classes_idx]
            top_probs = [prediction_proba[0][i] for i in top_classes_idx]
            print(f" β Top 3 classes: {list(zip(top_classes, [f'{p:.3f}' for p in top_probs]))}")

        print(f" β Model type: {type(model).__name__}")

        # Check if model has classes attribute
        if hasattr(model, 'classes_'):
            print(f" β Model classes: {list(model.classes_)}")

        results.append((model_name, 'SUCCESS', prediction, prediction_proba, None))
        print()

    # Summary
    print("=" * 60)
    print("Summary")
    print("=" * 60)

    for model_name, status, prediction, proba, error in results:
        if status == 'SUCCESS':
            pred_str = prediction[0] if prediction is not None else 'N/A'
            print(f" {model_name}: β SUCCESS - Prediction: {pred_str}")
        else:
            print(f" {model_name}: β {status} - {error}")

    print()
    print("All models tested!")
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
# Run the full test harness only when executed as a script (safe to import).
if __name__ == "__main__":
    main()
|
A6/time_specification.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Hardware Specifications
|
| 2 |
+
|
| 3 |
+
| Component | Specification |
|
| 4 |
+
|-----------|---------------|
|
| 5 |
+
| **CPU** | AMD Ryzen 5 5600U with Radeon Graphics |
|
| 6 |
+
| **CPU Cores/Threads** | 6 cores, 12 threads (2 threads per core) |
|
| 7 |
+
| **CPU Frequency** | 400 MHz - 4289 MHz (max boost) |
|
| 8 |
+
| **Architecture** | x86_64 |
|
| 9 |
+
| **RAM** | 30 GiB (15 GiB available currently) |
|
| 10 |
+
| **Swap** | 31 GiB |
|
| 11 |
+
| **Integrated GPU** | AMD Radeon Vega Mobile Series (Cezanne) |
|
| 12 |
+
| **Storage** | 469 GB NVMe SSD |
|
| 13 |
+
| **Operating System** | Linux (Ubuntu-based, kernel 6.8.0-101-lowlatency) |
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
## Software Environment
|
| 18 |
+
|
| 19 |
+
| Component | Version/Details |
|
| 20 |
+
|-----------|-----------------|
|
| 21 |
+
| **Python** | 3.12.3 |
|
| 22 |
+
| **Key Packages** | numpy 2.4.2, scikit-learn 1.8.0, pandas 2.2.3 |
|