Bachstelze commited on
Commit
a639edc
Β·
1 Parent(s): 2a23fe1

add time bench and viz

Browse files
A6/adaboost_classes.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Helper module to import AdaBoost classes without running module-level code.
4
+
5
+ This module re-exports the AdaBoostEnsemble and WeightedDecisionTree classes
6
+ from classification_adaboost.py, but without triggering the module-level
7
+ data loading and training code.
8
+ """
9
+ import numpy as np
10
+ from sklearn.base import BaseEstimator, ClassifierMixin
11
+ from sklearn.tree import DecisionTreeClassifier
12
+ from typing import List
13
+
14
+
15
class WeightedDecisionTree(DecisionTreeClassifier):
    """
    Thin wrapper over ``DecisionTreeClassifier`` whose sole purpose is to
    make the sample-weight pathway explicit: trees are grown against
    weighted training errors, which is what the AdaBoost loop requires.
    """

    def __init__(self, max_depth: int = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, random_state: int = 42):
        # Forward only the hyper-parameters the ensemble actually tunes;
        # everything else keeps the sklearn defaults.
        super().__init__(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )

    def fit(self, X, y, sample_weight=None):
        """Fit the tree; ``sample_weight`` (if given) weights each row of X."""
        return super().fit(X, y, sample_weight=sample_weight)
32
+
33
+
34
class AdaBoostEnsemble(BaseEstimator, ClassifierMixin):
    """
    AdaBoost ensemble of decision trees where each tree is grown based on
    weighted training errors. Weights are updated based on the error of
    previous trees.

    The algorithm:
    1. Initialize equal weights for all training samples
    2. For each tree in the ensemble:
       - Train a decision tree on weighted data
       - Calculate weighted error rate
       - Compute tree weight (alpha)
       - Update sample weights (increase for misclassified, decrease for correct)
       - Normalize weights
    3. Make predictions using weighted voting

    Attributes set by ``fit``:
        trees: the fitted ``WeightedDecisionTree`` instances.
        tree_weights: per-tree voting weights (alpha values), parallel to ``trees``.
        classes_: sorted unique class labels observed during fit.
        n_classes: ``len(classes_)``.
    """

    def __init__(
        self,
        n_estimators: int = 50,
        max_depth: int = 5,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        random_state: int = 42
    ):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        # Fitted state; re-initialized at the start of every fit() call.
        self.trees: List[WeightedDecisionTree] = []
        self.tree_weights: List[float] = []
        self.n_classes: int = 0
        self.classes_: np.ndarray = None

    def _initialize_weights(self, n_samples: int) -> np.ndarray:
        """Return uniform sample weights that sum to 1."""
        return np.ones(n_samples) / n_samples

    def _update_weights(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray,
        alpha: float
    ) -> np.ndarray:
        """
        Update sample weights based on prediction errors.

        Misclassified samples are multiplied by exp(alpha); correctly
        classified samples keep their weight. After renormalization this is
        equivalent to the textbook exp(+alpha)/exp(-alpha) update, since only
        the ratio between the two groups matters.
        """
        misclassified = y_true != y_pred
        updated_weights = weights * np.exp(alpha * misclassified.astype(float))
        # Normalize so the weights remain a probability distribution.
        return updated_weights / updated_weights.sum()

    def _compute_weighted_error(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray
    ) -> float:
        """Compute the weighted misclassification rate in [0, 1]."""
        misclassified = (y_true != y_pred).astype(float)
        return np.sum(weights * misclassified) / np.sum(weights)

    def _compute_alpha(self, error: float) -> float:
        """
        Compute the voting weight (alpha) of a classifier from its weighted
        error, clamping the degenerate cases to avoid log(0)/division by zero.
        """
        if error <= 0:
            return 10.0  # Very high weight for a perfect classifier
        if error >= 1:
            return -10.0  # Very negative weight for a completely wrong classifier
        return 0.5 * np.log((1 - error) / error)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'AdaBoostEnsemble':
        """
        Fit the AdaBoost ensemble on (X, y) and return ``self``.

        Each estimator gets a distinct but deterministic random_state
        (``random_state + i``) so runs are reproducible.
        """
        n_samples = X.shape[0]
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)

        # BUG FIX: reset fitted state so a second fit() call retrains from
        # scratch instead of appending new trees to the previous ensemble.
        self.trees = []
        self.tree_weights = []

        # Initialize sample weights uniformly.
        weights = self._initialize_weights(n_samples)

        for i in range(self.n_estimators):
            # Train a decision tree on the current weight distribution.
            tree = WeightedDecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state + i
            )
            tree.fit(X, y, sample_weight=weights)

            # Evaluate on the training set to drive the boosting update.
            y_pred = tree.predict(X)
            error = self._compute_weighted_error(weights, y, y_pred)
            alpha = self._compute_alpha(error)

            # Re-weight samples: misclassified points gain influence.
            weights = self._update_weights(weights, y, y_pred, alpha)

            self.trees.append(tree)
            self.tree_weights.append(alpha)

            print(f"Tree {i+1}/{self.n_estimators}: Error={error:.4f}, Alpha={alpha:.4f}")

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class labels for X using alpha-weighted majority voting.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if not self.trees:
            # Previously this crashed opaquely on classes_ being None.
            raise RuntimeError("AdaBoostEnsemble is not fitted; call fit() first.")

        all_predictions = np.array([tree.predict(X) for tree in self.trees])
        classes = self.classes_

        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        # Accumulate each tree's alpha into the column of its predicted class.
        for tree_idx in range(len(self.trees)):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]
            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += alpha * (predictions == class_label)

        # Return the class with the highest weighted vote per sample.
        return classes[np.argmax(weighted_votes, axis=1)]

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Return pseudo-probabilities of shape (n_samples, n_classes): each
        tree's |alpha| vote share per class, normalized by the total |alpha|.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if not self.trees:
            raise RuntimeError("AdaBoostEnsemble is not fitted; call fit() first.")

        all_predictions = np.array([tree.predict(X) for tree in self.trees])
        classes = self.classes_

        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        total_weight = sum(abs(w) for w in self.tree_weights)
        if total_weight == 0:
            # Degenerate ensemble (all alphas zero): no information, so
            # return uniform probabilities instead of dividing by zero.
            return np.full((n_samples, len(classes)), 1.0 / len(classes))

        for tree_idx in range(len(self.trees)):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]
            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += abs(alpha) * (predictions == class_label)

        # Normalize so each row sums to 1.
        return weighted_votes / total_weight
A6/all_classification.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Relative paths (from the A6 directory) to the serialized champion models
# of the earlier assignments, used for cross-assignment benchmarking.
a4_rf = "../A4/models/weaklink_classifier_rf.pkl"
# NOTE(review): "ensemnble" is a typo, but the name is part of this module's
# public interface — renaming would break importers; kept as-is.
a5_ensemnble = "../A5/models/ensemble_classification_champion.pkl"
a5b_adaboost = "../A5b/models/adaboost_classification.pkl"
a5b_bagging_tree = "../A5b/models/bagging_trees_champion.pkl"
a6_svm = "models/champion_svm.pkl"
A6/benchmark_results/benchmark_20260310_090052.json ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-03-10T09:00:50.070144",
3
+ "num_samples": 100,
4
+ "num_repeats": 10,
5
+ "models": {
6
+ "A4 Random Forest": {
7
+ "model_name": "A4 Random Forest",
8
+ "model_path": "../A4/models/weaklink_classifier_rf.pkl",
9
+ "inference_time_mean": 0.06072263170499355,
10
+ "inference_time_std": 0.0030473875509894866,
11
+ "inference_time_min": 0.058138252003118396,
12
+ "inference_time_max": 0.06896431901259348,
13
+ "inference_time_p50": 0.060211887990590185,
14
+ "inference_time_p95": 0.06896431901259348,
15
+ "inference_time_p99": 0.06896431901259348,
16
+ "memory_usage_mean": 360134.1,
17
+ "memory_usage_std": 67634.63257081308,
18
+ "memory_usage_peak": 512177,
19
+ "accuracy": 0.89,
20
+ "predictions_correct": 89,
21
+ "predictions_total": 100,
22
+ "model_size_bytes": 16381898,
23
+ "num_features": 41,
24
+ "num_parameters": 0,
25
+ "model_type": "RandomForestClassifier",
26
+ "feature_extraction_time_mean": 0.0,
27
+ "timing_samples": [
28
+ 0.060492476040963084,
29
+ 0.05959970800904557,
30
+ 0.05881448305444792,
31
+ 0.058138252003118396,
32
+ 0.06896431901259348,
33
+ 0.060211887990590185,
34
+ 0.05942972801858559,
35
+ 0.061595859995577484,
36
+ 0.0596357659669593,
37
+ 0.06034383695805445
38
+ ],
39
+ "memory_samples": [
40
+ 512177,
41
+ 377303,
42
+ 302127,
43
+ 358391,
44
+ 379313,
45
+ 354423,
46
+ 380515,
47
+ 281588,
48
+ 379268,
49
+ 276236
50
+ ],
51
+ "status": "SUCCESS",
52
+ "error_message": ""
53
+ },
54
+ "A5 Ensemble": {
55
+ "model_name": "A5 Ensemble",
56
+ "model_path": "../A5/models/ensemble_classification_champion.pkl",
57
+ "inference_time_mean": 0.08792474841466173,
58
+ "inference_time_std": 0.019674506115526187,
59
+ "inference_time_min": 0.067903274029959,
60
+ "inference_time_max": 0.13867365900659934,
61
+ "inference_time_p50": 0.08352956402814016,
62
+ "inference_time_p95": 0.13867365900659934,
63
+ "inference_time_p99": 0.13867365900659934,
64
+ "memory_usage_mean": 404756.5,
65
+ "memory_usage_std": 288156.9877403828,
66
+ "memory_usage_peak": 1210671,
67
+ "accuracy": 0.67,
68
+ "predictions_correct": 67,
69
+ "predictions_total": 100,
70
+ "model_size_bytes": 26660056,
71
+ "num_features": 36,
72
+ "num_parameters": 0,
73
+ "model_type": "VotingClassifier",
74
+ "feature_extraction_time_mean": 0.0,
75
+ "timing_samples": [
76
+ 0.13867365900659934,
77
+ 0.08352956402814016,
78
+ 0.067903274029959,
79
+ 0.08235391502967104,
80
+ 0.09512816503411159,
81
+ 0.09174130897736177,
82
+ 0.07728461700025946,
83
+ 0.07468455104390159,
84
+ 0.07801902701612562,
85
+ 0.0899294029804878
86
+ ],
87
+ "memory_samples": [
88
+ 1210671,
89
+ 276078,
90
+ 257244,
91
+ 374860,
92
+ 258411,
93
+ 374702,
94
+ 277252,
95
+ 270064,
96
+ 372458,
97
+ 375825
98
+ ],
99
+ "status": "SUCCESS",
100
+ "error_message": ""
101
+ },
102
+ "A5b Adaboost": {
103
+ "model_name": "A5b Adaboost",
104
+ "model_path": "../A5b/models/adaboost_classification.pkl",
105
+ "inference_time_mean": 0.03466975499759428,
106
+ "inference_time_std": 0.006925241966045739,
107
+ "inference_time_min": 0.030500065011437982,
108
+ "inference_time_max": 0.048356816987507045,
109
+ "inference_time_p50": 0.032038366014603525,
110
+ "inference_time_p95": 0.048356816987507045,
111
+ "inference_time_p99": 0.048356816987507045,
112
+ "memory_usage_mean": 204768.4,
113
+ "memory_usage_std": 311.91138342662504,
114
+ "memory_usage_peak": 205656,
115
+ "accuracy": 0.52,
116
+ "predictions_correct": 52,
117
+ "predictions_total": 100,
118
+ "model_size_bytes": 725059,
119
+ "num_features": 0,
120
+ "num_parameters": 0,
121
+ "model_type": "AdaBoostEnsemble",
122
+ "feature_extraction_time_mean": 0.0,
123
+ "timing_samples": [
124
+ 0.048356816987507045,
125
+ 0.047088092018384486,
126
+ 0.03258101601386443,
127
+ 0.03238268301356584,
128
+ 0.03146621095947921,
129
+ 0.032038366014603525,
130
+ 0.030500065011437982,
131
+ 0.03090687998337671,
132
+ 0.03052046400262043,
133
+ 0.03085695597110316
134
+ ],
135
+ "memory_samples": [
136
+ 205656,
137
+ 204684,
138
+ 204668,
139
+ 204668,
140
+ 204668,
141
+ 204668,
142
+ 204668,
143
+ 204668,
144
+ 204668,
145
+ 204668
146
+ ],
147
+ "status": "SUCCESS",
148
+ "error_message": ""
149
+ },
150
+ "A5b Bagging Trees": {
151
+ "model_name": "A5b Bagging Trees",
152
+ "model_path": "../A5b/models/bagging_trees_champion.pkl",
153
+ "inference_time_mean": 0.006075771508039907,
154
+ "inference_time_std": 0.0017926972777932554,
155
+ "inference_time_min": 0.0038332950207404792,
156
+ "inference_time_max": 0.00979096203809604,
157
+ "inference_time_p50": 0.006550171005073935,
158
+ "inference_time_p95": 0.00979096203809604,
159
+ "inference_time_p99": 0.00979096203809604,
160
+ "memory_usage_mean": 59716.6,
161
+ "memory_usage_std": 68.09176814335848,
162
+ "memory_usage_peak": 59866,
163
+ "accuracy": 0.0,
164
+ "predictions_correct": 0,
165
+ "predictions_total": 100,
166
+ "model_size_bytes": 6506123,
167
+ "num_features": 36,
168
+ "num_parameters": 0,
169
+ "model_type": "LGBMClassifier",
170
+ "feature_extraction_time_mean": 0.0,
171
+ "timing_samples": [
172
+ 0.006550171005073935,
173
+ 0.0061910360236652195,
174
+ 0.0068354670074768364,
175
+ 0.006988314969930798,
176
+ 0.004823405994102359,
177
+ 0.006920185987837613,
178
+ 0.00979096203809604,
179
+ 0.0038514090119861066,
180
+ 0.0038332950207404792,
181
+ 0.00497346802148968
182
+ ],
183
+ "memory_samples": [
184
+ 59866,
185
+ 59746,
186
+ 59746,
187
+ 59746,
188
+ 59746,
189
+ 59700,
190
+ 59654,
191
+ 59654,
192
+ 59654,
193
+ 59654
194
+ ],
195
+ "status": "SUCCESS",
196
+ "error_message": ""
197
+ },
198
+ "A6 SVM": {
199
+ "model_name": "A6 SVM",
200
+ "model_path": "models/champion_svm.pkl",
201
+ "inference_time_mean": 0.009102203900692985,
202
+ "inference_time_std": 0.0003233410993925297,
203
+ "inference_time_min": 0.008689811977092177,
204
+ "inference_time_max": 0.009627135004848242,
205
+ "inference_time_p50": 0.009107397985644639,
206
+ "inference_time_p95": 0.009627135004848242,
207
+ "inference_time_p99": 0.009627135004848242,
208
+ "memory_usage_mean": 62088.6,
209
+ "memory_usage_std": 193.42021036535397,
210
+ "memory_usage_peak": 62631,
211
+ "accuracy": 0.83,
212
+ "predictions_correct": 83,
213
+ "predictions_total": 100,
214
+ "model_size_bytes": 700346,
215
+ "num_features": 36,
216
+ "num_parameters": 0,
217
+ "model_type": "Pipeline",
218
+ "feature_extraction_time_mean": 0.0,
219
+ "timing_samples": [
220
+ 0.009627135004848242,
221
+ 0.009057053015567362,
222
+ 0.009107397985644639,
223
+ 0.008771255961619318,
224
+ 0.00915416597854346,
225
+ 0.008994235016871244,
226
+ 0.00961044302675873,
227
+ 0.00879047199850902,
228
+ 0.009220069041475654,
229
+ 0.008689811977092177
230
+ ],
231
+ "memory_samples": [
232
+ 62631,
233
+ 62063,
234
+ 62047,
235
+ 62047,
236
+ 61955,
237
+ 62047,
238
+ 62047,
239
+ 62047,
240
+ 62001,
241
+ 62001
242
+ ],
243
+ "status": "SUCCESS",
244
+ "error_message": ""
245
+ }
246
+ }
247
+ }
A6/benchmark_results/single_benchmark_20260310_090011.json ADDED
The diff for this file is too large to render. See raw diff
 
A6/benchmark_results/visualizations/accuracy_vs_inference_time.png ADDED

Git LFS Details

  • SHA256: d9645f6a0c93cdecb0ba32f3466ea37f4230fbd203a3dd9e6e255b4e2aedb449
  • Pointer size: 131 Bytes
  • Size of remote file: 198 kB
A6/benchmark_results/visualizations/compare_benchmarks.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to compare response times (inference times) from two benchmark JSON files.
4
+ Generates a visualization comparing the models from both benchmarks.
5
+ """
6
+
7
+ import json
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ from pathlib import Path
11
+
12
+ # File paths
13
+ benchmark_path = Path(__file__).parent / "../benchmark_20260310_090052.json"
14
+ single_benchmark_path = Path(__file__).parent / "../single_benchmark_20260310_090011.json"
15
+
16
+ # Load benchmark data
17
+ with open(benchmark_path, 'r') as f:
18
+ benchmark_data = json.load(f)
19
+
20
+ with open(single_benchmark_path, 'r') as f:
21
+ single_benchmark_data = json.load(f)
22
+
23
+ # Extract model data
24
def extract_model_data(data_dict):
    """
    Collect per-model inference statistics from a benchmark JSON dict.

    Returns a mapping of model name -> dict with short stat keys
    ('mean', 'std', 'min', 'max', 'p50', 'p95', 'p99', 'accuracy',
    'timing_samples'); missing numeric fields default to 0 and missing
    timing samples to an empty list.
    """
    # Short key -> full JSON field name for the timing statistics.
    stat_fields = {
        'mean': 'inference_time_mean',
        'std': 'inference_time_std',
        'min': 'inference_time_min',
        'max': 'inference_time_max',
        'p50': 'inference_time_p50',
        'p95': 'inference_time_p95',
        'p99': 'inference_time_p99',
    }

    extracted = {}
    for name, info in data_dict.get('models', {}).items():
        entry = {short: info.get(full, 0) for short, full in stat_fields.items()}
        entry['accuracy'] = info.get('accuracy', 0)
        entry['timing_samples'] = info.get('timing_samples', [])
        extracted[name] = entry
    return extracted
39
+
40
+ benchmark_models = extract_model_data(benchmark_data)
41
+ single_benchmark_models = extract_model_data(single_benchmark_data)
42
+
43
+ # Get all model names (should be the same in both)
44
+ all_model_names = sorted(benchmark_models.keys())
45
+
46
+ # Create figure with subplots
47
+ fig = plt.figure(figsize=(16, 10))
48
+
49
+ # 1. Bar chart comparing mean inference times
50
+ ax1 = fig.add_subplot(2, 3, 1)
51
+ x = np.arange(len(all_model_names))
52
+ width = 0.35
53
+
54
+ benchmark_means = [benchmark_models[m]['mean'] * 1000 for m in all_model_names] # Convert to ms
55
+ single_means = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names] # Convert to ms
56
+
57
+ bars1 = ax1.bar(x - width/2, benchmark_means, width, label='Multi-benchmark (100 samples)', alpha=0.8)
58
+ bars2 = ax1.bar(x + width/2, single_means, width, label='Single-benchmark (10 samples)', alpha=0.8)
59
+
60
+ ax1.set_xlabel('Model')
61
+ ax1.set_ylabel('Mean Inference Time (ms)')
62
+ ax1.set_title('Comparison of Mean Inference Times')
63
+ ax1.set_xticks(x)
64
+ ax1.set_xticklabels(all_model_names, rotation=45, ha='right')
65
+ ax1.legend()
66
+ ax1.grid(axis='y', alpha=0.3)
67
+
68
+ # Add value labels on bars
69
+ for bar in bars1:
70
+ height = bar.get_height()
71
+ ax1.annotate(f'{height:.3f}',
72
+ xy=(bar.get_x() + bar.get_width() / 2, height),
73
+ xytext=(0, 3),
74
+ textcoords="offset points",
75
+ ha='center', va='bottom', fontsize=8)
76
+
77
+ for bar in bars2:
78
+ height = bar.get_height()
79
+ ax1.annotate(f'{height:.3f}',
80
+ xy=(bar.get_x() + bar.get_width() / 2, height),
81
+ xytext=(0, 3),
82
+ textcoords="offset points",
83
+ ha='center', va='bottom', fontsize=8)
84
+
85
+ # 2. Box plot comparing timing distributions
86
+ ax2 = fig.add_subplot(2, 3, 2)
87
+
88
+ # Prepare data for box plot
89
+ all_data = []
90
+ labels = []
91
+ colors = []
92
+
93
+ for i, model_name in enumerate(all_model_names):
94
+ benchmark_samples = benchmark_models[model_name]['timing_samples'][:10] # Use first 10 for comparison
95
+ single_samples = single_benchmark_models[model_name]['timing_samples'][:10] # Use first 10 for comparison
96
+
97
+ # Convert to ms
98
+ benchmark_ms = [s * 1000 for s in benchmark_samples]
99
+ single_ms = [s * 1000 for s in single_samples]
100
+
101
+ all_data.append(benchmark_ms)
102
+ all_data.append(single_ms)
103
+ labels.append(f'{model_name}\nMulti')
104
+ labels.append(f'{model_name}\nSingle')
105
+ colors.extend([f'C{i}', f'C{i}'])
106
+
107
+ bp = ax2.boxplot(all_data, labels=labels, patch_artist=True, vert=True)
108
+ for patch, color in zip(bp['boxes'], colors):
109
+ patch.set_facecolor(color)
110
+ patch.set_alpha(0.6)
111
+
112
+ ax2.set_xlabel('Model (Benchmark Type)')
113
+ ax2.set_ylabel('Inference Time (ms)')
114
+ ax2.set_title('Distribution of Inference Times (Box Plot)')
115
+ ax2.tick_params(axis='x', rotation=45)
116
+ ax2.grid(axis='y', alpha=0.3)
117
+
118
+ # 3. Comparison scatter plot with accuracy
119
+ ax3 = fig.add_subplot(2, 3, 3)
120
+
121
+ benchmark_accs = [benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
122
+ single_accs = [single_benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
123
+ benchmark_times = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
124
+ single_times = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]
125
+
126
+ # Create scatter plot
127
+ for i, model_name in enumerate(all_model_names):
128
+ ax3.scatter([benchmark_times[i]], [benchmark_accs[i]], marker='o', s=100,
129
+ label=f'{model_name} (Multi)', alpha=0.8, color=f'C{i}')
130
+ ax3.scatter([single_times[i]], [single_accs[i]], marker='s', s=100,
131
+ label=f'{model_name} (Single)', alpha=0.8, color=f'C{i}')
132
+
133
+ ax3.set_xlabel('Mean Inference Time (ms)')
134
+ ax3.set_ylabel('Accuracy (%)')
135
+ ax3.set_title('Accuracy vs Inference Time Comparison')
136
+ ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
137
+ ax3.grid(True, alpha=0.3)
138
+
139
+ # 4. Percentile comparison
140
+ ax4 = fig.add_subplot(2, 3, 4)
141
+
142
+ x = np.arange(len(all_model_names))
143
+ width = 0.25
144
+
145
+ benchmark_p50 = [benchmark_models[m]['p50'] * 1000 for m in all_model_names]
146
+ benchmark_p95 = [benchmark_models[m]['p95'] * 1000 for m in all_model_names]
147
+ benchmark_p99 = [benchmark_models[m]['p99'] * 1000 for m in all_model_names]
148
+
149
+ single_p50 = [single_benchmark_models[m]['p50'] * 1000 for m in all_model_names]
150
+ single_p95 = [single_benchmark_models[m]['p95'] * 1000 for m in all_model_names]
151
+ single_p99 = [single_benchmark_models[m]['p99'] * 1000 for m in all_model_names]
152
+
153
+ bars_p50 = ax4.bar(x - width, benchmark_p50, width, label='P50 (Multi)', alpha=0.8)
154
+ bars_p95 = ax4.bar(x, benchmark_p95, width, label='P95 (Multi)', alpha=0.8)
155
+ bars_p99 = ax4.bar(x + width, benchmark_p99, width, label='P99 (Multi)', alpha=0.8)
156
+
157
+ # Single benchmark percentiles (offset)
158
+ ax4.bar(x - width + 0.05, single_p50, width*0.8, label='P50 (Single)', alpha=0.6, hatch='//')
159
+ ax4.bar(x + 0.05, single_p95, width*0.8, label='P95 (Single)', alpha=0.6, hatch='//')
160
+ ax4.bar(x + width + 0.05, single_p99, width*0.8, label='P99 (Single)', alpha=0.6, hatch='//')
161
+
162
+ ax4.set_xlabel('Model')
163
+ ax4.set_ylabel('Inference Time (ms)')
164
+ ax4.set_title('Percentile Comparison (P50, P95, P99)')
165
+ ax4.set_xticks(x)
166
+ ax4.set_xticklabels(all_model_names, rotation=45, ha='right')
167
+ ax4.legend(fontsize='small')
168
+ ax4.grid(axis='y', alpha=0.3)
169
+
170
+ # 5. Standard deviation comparison
171
+ ax5 = fig.add_subplot(2, 3, 5)
172
+
173
+ benchmark_std = [benchmark_models[m]['std'] * 1000 for m in all_model_names]
174
+ single_std = [single_benchmark_models[m]['std'] * 1000 for m in all_model_names]
175
+
176
+ x = np.arange(len(all_model_names))
177
+ width = 0.35
178
+
179
+ bars_std1 = ax5.bar(x - width/2, benchmark_std, width, label='Multi-benchmark', alpha=0.8)
180
+ bars_std2 = ax5.bar(x + width/2, single_std, width, label='Single-benchmark', alpha=0.8)
181
+
182
+ ax5.set_xlabel('Model')
183
+ ax5.set_ylabel('Standard Deviation (ms)')
184
+ ax5.set_title('Standard Deviation of Inference Times')
185
+ ax5.set_xticks(x)
186
+ ax5.set_xticklabels(all_model_names, rotation=45, ha='right')
187
+ ax5.legend()
188
+ ax5.grid(axis='y', alpha=0.3)
189
+
190
+ # Add value labels
191
+ for bar in bars_std1:
192
+ height = bar.get_height()
193
+ ax5.annotate(f'{height:.4f}',
194
+ xy=(bar.get_x() + bar.get_width() / 2, height),
195
+ xytext=(0, 3),
196
+ textcoords="offset points",
197
+ ha='center', va='bottom', fontsize=7)
198
+
199
+ for bar in bars_std2:
200
+ height = bar.get_height()
201
+ ax5.annotate(f'{height:.4f}',
202
+ xy=(bar.get_x() + bar.get_width() / 2, height),
203
+ xytext=(0, 3),
204
+ textcoords="offset points",
205
+ ha='center', va='bottom', fontsize=7)
206
+
207
+ # 6. Summary statistics table
208
+ ax6 = fig.add_subplot(2, 3, 6)
209
+ ax6.axis('off')
210
+
211
+ # Create table data
212
+ table_data = []
213
+ for model_name in all_model_names:
214
+ row = [
215
+ model_name,
216
+ f"{benchmark_models[model_name]['mean']*1000:.3f} Β± {benchmark_models[model_name]['std']*1000:.3f}",
217
+ f"{benchmark_models[model_name]['min']*1000:.3f}",
218
+ f"{benchmark_models[model_name]['max']*1000:.3f}",
219
+ f"{benchmark_models[model_name]['accuracy']*100:.1f}%",
220
+ f"{single_benchmark_models[model_name]['mean']*1000:.3f} Β± {single_benchmark_models[model_name]['std']*1000:.3f}",
221
+ f"{single_benchmark_models[model_name]['min']*1000:.3f}",
222
+ f"{single_benchmark_models[model_name]['max']*1000:.3f}",
223
+ f"{single_benchmark_models[model_name]['accuracy']*100:.1f}%"
224
+ ]
225
+ table_data.append(row)
226
+
227
+ columns = ['Model', 'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)',
228
+ 'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']
229
+ row_labels = ['Multi', 'Single'] * len(all_model_names)
230
+
231
+ # Create table
232
+ table = ax6.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
233
+ table.auto_set_font_size(False)
234
+ table.set_fontsize(9)
235
+ table.scale(1.1, 1.8)
236
+
237
+ # Style the table
238
+ for i in range(len(all_model_names)):
239
+ for j in range(len(columns)):
240
+ cell = table[(i+1, j)]
241
+ cell.set_height(0.4)
242
+ if j < 5:
243
+ cell.set_facecolor('#f0f0f0') # Light gray for multi-benchmark columns
244
+ else:
245
+ cell.set_facecolor('#e0e0f0') # Light blue for single-benchmark columns
246
+
247
+ ax6.set_title('Summary Statistics Comparison', fontsize=12, pad=20)
248
+
249
+ # Save each subplot as a separate PNG image
250
+ output_dir = Path(__file__).parent
251
+
252
+ # 1. Bar chart comparing mean inference times
253
+ fig1, ax1_single = plt.subplots(figsize=(10, 6))
254
+ x = np.arange(len(all_model_names))
255
+ width = 0.35
256
+ benchmark_means = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
257
+ single_means = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]
258
+ bars1 = ax1_single.bar(x - width/2, benchmark_means, width, label='Multi-benchmark (100 samples)', alpha=0.8)
259
+ bars2 = ax1_single.bar(x + width/2, single_means, width, label='Single-benchmark (10 samples)', alpha=0.8)
260
+ ax1_single.set_xlabel('Model')
261
+ ax1_single.set_ylabel('Mean Inference Time (ms)')
262
+ ax1_single.set_title('Comparison of Mean Inference Times')
263
+ ax1_single.set_xticks(x)
264
+ ax1_single.set_xticklabels(all_model_names, rotation=45, ha='right')
265
+ ax1_single.legend()
266
+ ax1_single.grid(axis='y', alpha=0.3)
267
+ for bar in bars1:
268
+ height = bar.get_height()
269
+ ax1_single.annotate(f'{height:.3f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)
270
+ for bar in bars2:
271
+ height = bar.get_height()
272
+ ax1_single.annotate(f'{height:.3f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)
273
+ plt.tight_layout()
274
+ plt.savefig(output_dir / "mean_inference_times.png", dpi=300, bbox_inches='tight')
275
+ plt.close(fig1)
276
+ print(f"Saved: mean_inference_times.png")
277
+
278
+ # 2. Box plot comparing timing distributions
279
+ fig2, ax2_single = plt.subplots(figsize=(12, 6))
280
+ all_data = []
281
+ labels = []
282
+ colors = []
283
+ for i, model_name in enumerate(all_model_names):
284
+ benchmark_samples = benchmark_models[model_name]['timing_samples'][:10]
285
+ single_samples = single_benchmark_models[model_name]['timing_samples'][:10]
286
+ benchmark_ms = [s * 1000 for s in benchmark_samples]
287
+ single_ms = [s * 1000 for s in single_samples]
288
+ all_data.append(benchmark_ms)
289
+ all_data.append(single_ms)
290
+ labels.append(f'{model_name}\nMulti')
291
+ labels.append(f'{model_name}\nSingle')
292
+ colors.extend([f'C{i}', f'C{i}'])
293
+ bp = ax2_single.boxplot(all_data, labels=labels, patch_artist=True, vert=True)
294
+ for patch, color in zip(bp['boxes'], colors):
295
+ patch.set_facecolor(color)
296
+ patch.set_alpha(0.6)
297
+ ax2_single.set_xlabel('Model (Benchmark Type)')
298
+ ax2_single.set_ylabel('Inference Time (ms)')
299
+ ax2_single.set_title('Distribution of Inference Times (Box Plot)')
300
+ ax2_single.tick_params(axis='x', rotation=45)
301
+ ax2_single.grid(axis='y', alpha=0.3)
302
+ plt.tight_layout()
303
+ plt.savefig(output_dir / "inference_time_distribution.png", dpi=300, bbox_inches='tight')
304
+ plt.close(fig2)
305
+ print(f"Saved: inference_time_distribution.png")
306
+
307
+ # 3. Comparison scatter plot with accuracy
308
+ fig3, ax3_single = plt.subplots(figsize=(10, 6))
309
+ benchmark_accs = [benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
310
+ single_accs = [single_benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
311
+ benchmark_times = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
312
+ single_times = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]
313
+ for i, model_name in enumerate(all_model_names):
314
+ ax3_single.scatter([benchmark_times[i]], [benchmark_accs[i]], marker='o', s=100, label=f'{model_name} (Multi)', alpha=0.8, color=f'C{i}')
315
+ ax3_single.scatter([single_times[i]], [single_accs[i]], marker='s', s=100, label=f'{model_name} (Single)', alpha=0.8, color=f'C{i}')
316
+ ax3_single.set_xlabel('Mean Inference Time (ms)')
317
+ ax3_single.set_ylabel('Accuracy (%)')
318
+ ax3_single.set_title('Accuracy vs Inference Time Comparison')
319
+ ax3_single.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
320
+ ax3_single.grid(True, alpha=0.3)
321
+ plt.tight_layout()
322
+ plt.savefig(output_dir / "accuracy_vs_inference_time.png", dpi=300, bbox_inches='tight')
323
+ plt.close(fig3)
324
+ print(f"Saved: accuracy_vs_inference_time.png")
325
+
326
+ # 4. Percentile comparison
327
+ fig4, ax4_single = plt.subplots(figsize=(12, 6))
328
+ x = np.arange(len(all_model_names))
329
+ width = 0.25
330
+ benchmark_p50 = [benchmark_models[m]['p50'] * 1000 for m in all_model_names]
331
+ benchmark_p95 = [benchmark_models[m]['p95'] * 1000 for m in all_model_names]
332
+ benchmark_p99 = [benchmark_models[m]['p99'] * 1000 for m in all_model_names]
333
+ single_p50 = [single_benchmark_models[m]['p50'] * 1000 for m in all_model_names]
334
+ single_p95 = [single_benchmark_models[m]['p95'] * 1000 for m in all_model_names]
335
+ single_p99 = [single_benchmark_models[m]['p99'] * 1000 for m in all_model_names]
336
+ bars_p50 = ax4_single.bar(x - width, benchmark_p50, width, label='P50 (Multi)', alpha=0.8)
337
+ bars_p95 = ax4_single.bar(x, benchmark_p95, width, label='P95 (Multi)', alpha=0.8)
338
+ bars_p99 = ax4_single.bar(x + width, benchmark_p99, width, label='P99 (Multi)', alpha=0.8)
339
+ ax4_single.bar(x - width + 0.05, single_p50, width*0.8, label='P50 (Single)', alpha=0.6, hatch='//')
340
+ ax4_single.bar(x + 0.05, single_p95, width*0.8, label='P95 (Single)', alpha=0.6, hatch='//')
341
+ ax4_single.bar(x + width + 0.05, single_p99, width*0.8, label='P99 (Single)', alpha=0.6, hatch='//')
342
+ ax4_single.set_xlabel('Model')
343
+ ax4_single.set_ylabel('Inference Time (ms)')
344
+ ax4_single.set_title('Percentile Comparison (P50, P95, P99)')
345
+ ax4_single.set_xticks(x)
346
+ ax4_single.set_xticklabels(all_model_names, rotation=45, ha='right')
347
+ ax4_single.legend(fontsize='small')
348
+ ax4_single.grid(axis='y', alpha=0.3)
349
+ plt.tight_layout()
350
+ plt.savefig(output_dir / "percentile_comparison.png", dpi=300, bbox_inches='tight')
351
+ plt.close(fig4)
352
+ print(f"Saved: percentile_comparison.png")
353
+
354
+ # 5. Standard deviation comparison
355
+ fig5, ax5_single = plt.subplots(figsize=(10, 6))
356
+ benchmark_std = [benchmark_models[m]['std'] * 1000 for m in all_model_names]
357
+ single_std = [single_benchmark_models[m]['std'] * 1000 for m in all_model_names]
358
+ x = np.arange(len(all_model_names))
359
+ width = 0.35
360
+ bars_std1 = ax5_single.bar(x - width/2, benchmark_std, width, label='Multi-benchmark', alpha=0.8)
361
+ bars_std2 = ax5_single.bar(x + width/2, single_std, width, label='Single-benchmark', alpha=0.8)
362
+ ax5_single.set_xlabel('Model')
363
+ ax5_single.set_ylabel('Standard Deviation (ms)')
364
+ ax5_single.set_title('Standard Deviation of Inference Times')
365
+ ax5_single.set_xticks(x)
366
+ ax5_single.set_xticklabels(all_model_names, rotation=45, ha='right')
367
+ ax5_single.legend()
368
+ ax5_single.grid(axis='y', alpha=0.3)
369
+ for bar in bars_std1:
370
+ height = bar.get_height()
371
+ ax5_single.annotate(f'{height:.4f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=7)
372
+ for bar in bars_std2:
373
+ height = bar.get_height()
374
+ ax5_single.annotate(f'{height:.4f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=7)
375
+ plt.tight_layout()
376
+ plt.savefig(output_dir / "standard_deviation_comparison.png", dpi=300, bbox_inches='tight')
377
+ plt.close(fig5)
378
+ print(f"Saved: standard_deviation_comparison.png")
379
+
380
+ # 6. Summary statistics table
381
+ fig6, ax6_single = plt.subplots(figsize=(14, 6))
382
+ ax6_single.axis('off')
383
+ table_data = []
384
+ for model_name in all_model_names:
385
+ row = [
386
+ model_name,
387
+ f"{benchmark_models[model_name]['mean']*1000:.3f} Β± {benchmark_models[model_name]['std']*1000:.3f}",
388
+ f"{benchmark_models[model_name]['min']*1000:.3f}",
389
+ f"{benchmark_models[model_name]['max']*1000:.3f}",
390
+ f"{benchmark_models[model_name]['accuracy']*100:.1f}%",
391
+ f"{single_benchmark_models[model_name]['mean']*1000:.3f} Β± {single_benchmark_models[model_name]['std']*1000:.3f}",
392
+ f"{single_benchmark_models[model_name]['min']*1000:.3f}",
393
+ f"{single_benchmark_models[model_name]['max']*1000:.3f}",
394
+ f"{single_benchmark_models[model_name]['accuracy']*100:.1f}%"
395
+ ]
396
+ table_data.append(row)
397
+ columns = ['Model', 'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)',
398
+ 'Mean Β± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']
399
+ table = ax6_single.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
400
+ table.auto_set_font_size(False)
401
+ table.set_fontsize(9)
402
+ table.scale(1.1, 1.8)
403
+ for i in range(len(all_model_names)):
404
+ for j in range(len(columns)):
405
+ cell = table[(i+1, j)]
406
+ cell.set_height(0.4)
407
+ if j < 5:
408
+ cell.set_facecolor('#f0f0f0')
409
+ else:
410
+ cell.set_facecolor('#e0e0f0')
411
+ ax6_single.set_title('Summary Statistics Comparison', fontsize=12, pad=20)
412
+ plt.tight_layout()
413
+ plt.savefig(output_dir / "summary_statistics.png", dpi=300, bbox_inches='tight')
414
+ plt.close(fig6)
415
+ print(f"Saved: summary_statistics.png")
416
+
417
+ print(f"\nAll individual visualizations saved to: {output_dir}")
418
+
419
+ # Also save as interactive HTML
420
+ html_output = Path(__file__).parent / "response_time_comparison.html"
421
+ with open(html_output, 'w') as f:
422
+ f.write(f"""<!DOCTYPE html>
423
+ <html>
424
+ <head>
425
+ <title>Benchmark Response Time Comparison</title>
426
+ <style>
427
+ body {{ font-family: Arial, sans-serif; margin: 20px; }}
428
+ h1 {{ text-align: center; }}
429
+ .chart {{ max-width: 1200px; margin: 0 auto; }}
430
+ .model-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
431
+ .model-title {{ font-weight: bold; font-size: 1.2em; margin-bottom: 10px; }}
432
+ table {{ width: 100%; border-collapse: collapse; }}
433
+ th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
434
+ th {{ background-color: #f4f4f4; }}
435
+ </style>
436
+ </head>
437
+ <body>
438
+ <h1>Benchmark Response Time Comparison</h1>
439
+ <p><strong>Multi-benchmark:</strong> {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats</p>
440
+ <p><strong>Single-benchmark:</strong> {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats</p>
441
+ <p><img src="response_time_comparison.png" alt="Comparison Chart" class="chart"></p>
442
+ <h2>Detailed Statistics</h2>
443
+ """)
444
+ for model_name in all_model_names:
445
+ f.write(f"""
446
+ <div class="model-section">
447
+ <div class="model-title">{model_name}</div>
448
+ <table>
449
+ <tr>
450
+ <th>Metric</th>
451
+ <th>Multi-benchmark</th>
452
+ <th>Single-benchmark</th>
453
+ <th>Change</th>
454
+ </tr>
455
+ <tr>
456
+ <td>Mean (ms)</td>
457
+ <td>{benchmark_models[model_name]['mean']*1000:.4f}</td>
458
+ <td>{single_benchmark_models[model_name]['mean']*1000:.4f}</td>
459
+ <td>{((single_benchmark_models[model_name]['mean'] - benchmark_models[model_name]['mean']) / benchmark_models[model_name]['mean'] * 100):.1f}%</td>
460
+ </tr>
461
+ <tr>
462
+ <td>Std (ms)</td>
463
+ <td>{benchmark_models[model_name]['std']*1000:.4f}</td>
464
+ <td>{single_benchmark_models[model_name]['std']*1000:.4f}</td>
465
+ <td>{((single_benchmark_models[model_name]['std'] - benchmark_models[model_name]['std']) / benchmark_models[model_name]['std'] * 100):.1f}%</td>
466
+ </tr>
467
+ <tr>
468
+ <td>Min (ms)</td>
469
+ <td>{benchmark_models[model_name]['min']*1000:.4f}</td>
470
+ <td>{single_benchmark_models[model_name]['min']*1000:.4f}</td>
471
+ <td>{((single_benchmark_models[model_name]['min'] - benchmark_models[model_name]['min']) / benchmark_models[model_name]['min'] * 100):.1f}%</td>
472
+ </tr>
473
+ <tr>
474
+ <td>Max (ms)</td>
475
+ <td>{benchmark_models[model_name]['max']*1000:.4f}</td>
476
+ <td>{single_benchmark_models[model_name]['max']*1000:.4f}</td>
477
+ <td>{((single_benchmark_models[model_name]['max'] - benchmark_models[model_name]['max']) / benchmark_models[model_name]['max'] * 100):.1f}%</td>
478
+ </tr>
479
+ <tr>
480
+ <td>Accuracy</td>
481
+ <td>{benchmark_models[model_name]['accuracy']*100:.1f}%</td>
482
+ <td>{single_benchmark_models[model_name]['accuracy']*100:.1f}%</td>
483
+ <td>{(single_benchmark_models[model_name]['accuracy'] - benchmark_models[model_name]['accuracy']) * 100:.1f}pp</td>
484
+ </tr>
485
+ </table>
486
+ </div>
487
+ """)
488
+ f.write("""
489
+ </body>
490
+ </html>""")
491
+ print(f"HTML report saved to: {html_output}")
492
+
493
+ # Print summary to console
494
+ print("\n=== Summary ===")
495
+ print(f"Multi-benchmark: {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats")
496
+ print(f"Single-benchmark: {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats")
497
+ print("\nModel Comparison:")
498
+ print("-" * 80)
499
+ for model_name in all_model_names:
500
+ b_mean = benchmark_models[model_name]['mean'] * 1000
501
+ s_mean = single_benchmark_models[model_name]['mean'] * 1000
502
+ change = ((s_mean - b_mean) / b_mean * 100)
503
+ print(f"{model_name:20s} | Multi: {b_mean:6.3f}ms | Single: {s_mean:6.3f}ms | Change: {change:+6.1f}%")
A6/benchmark_results/visualizations/inference_time_distribution.png ADDED

Git LFS Details

  • SHA256: b7198c081720eb9d8266ccd15f6209879e4819c1ed121e01daa1925bfc1dd1dd
  • Pointer size: 131 Bytes
  • Size of remote file: 206 kB
A6/benchmark_results/visualizations/mean_inference_times.png ADDED

Git LFS Details

  • SHA256: eade239b378cec9616bca5cecb80b61b16f9dcac3faf55b1aa6b634a5f19df8f
  • Pointer size: 131 Bytes
  • Size of remote file: 207 kB
A6/benchmark_results/visualizations/percentile_comparison.png ADDED

Git LFS Details

  • SHA256: 2f8e98332885b80b4d6c62a3ba4af9211fd1dff4881fbb144b6d91e7b8894db4
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
A6/benchmark_results/visualizations/response_time_comparison.html ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Benchmark Response Time Comparison</title>
5
+ <style>
6
+ body { font-family: Arial, sans-serif; margin: 20px; }
7
+ h1 { text-align: center; }
8
+ .chart { max-width: 1200px; margin: 0 auto; }
9
+ .model-section { margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }
10
+ .model-title { font-weight: bold; font-size: 1.2em; margin-bottom: 10px; }
11
+ table { width: 100%; border-collapse: collapse; }
12
+ th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
13
+ th { background-color: #f4f4f4; }
14
+ </style>
15
+ </head>
16
+ <body>
17
+ <h1>Benchmark Response Time Comparison</h1>
18
+ <p><strong>Multi-benchmark:</strong> 100 samples, 10 repeats</p>
19
+ <p><strong>Single-benchmark:</strong> 100 samples, 10 repeats</p>
20
+ <p><img src="response_time_comparison.png" alt="Comparison Chart" class="chart"></p>
21
+ <h2>Detailed Statistics</h2>
22
+
23
+ <div class="model-section">
24
+ <div class="model-title">A4 Random Forest</div>
25
+ <table>
26
+ <tr>
27
+ <th>Metric</th>
28
+ <th>Multi-benchmark</th>
29
+ <th>Single-benchmark</th>
30
+ <th>Change</th>
31
+ </tr>
32
+ <tr>
33
+ <td>Mean (ms)</td>
34
+ <td>60.7226</td>
35
+ <td>54.1178</td>
36
+ <td>-10.9%</td>
37
+ </tr>
38
+ <tr>
39
+ <td>Std (ms)</td>
40
+ <td>3.0474</td>
41
+ <td>8.3909</td>
42
+ <td>175.3%</td>
43
+ </tr>
44
+ <tr>
45
+ <td>Min (ms)</td>
46
+ <td>58.1383</td>
47
+ <td>41.5801</td>
48
+ <td>-28.5%</td>
49
+ </tr>
50
+ <tr>
51
+ <td>Max (ms)</td>
52
+ <td>68.9643</td>
53
+ <td>139.2800</td>
54
+ <td>102.0%</td>
55
+ </tr>
56
+ <tr>
57
+ <td>Accuracy</td>
58
+ <td>89.0%</td>
59
+ <td>89.0%</td>
60
+ <td>0.0pp</td>
61
+ </tr>
62
+ </table>
63
+ </div>
64
+
65
+ <div class="model-section">
66
+ <div class="model-title">A5 Ensemble</div>
67
+ <table>
68
+ <tr>
69
+ <th>Metric</th>
70
+ <th>Multi-benchmark</th>
71
+ <th>Single-benchmark</th>
72
+ <th>Change</th>
73
+ </tr>
74
+ <tr>
75
+ <td>Mean (ms)</td>
76
+ <td>87.9247</td>
77
+ <td>88.4395</td>
78
+ <td>0.6%</td>
79
+ </tr>
80
+ <tr>
81
+ <td>Std (ms)</td>
82
+ <td>19.6745</td>
83
+ <td>15.3584</td>
84
+ <td>-21.9%</td>
85
+ </tr>
86
+ <tr>
87
+ <td>Min (ms)</td>
88
+ <td>67.9033</td>
89
+ <td>60.6458</td>
90
+ <td>-10.7%</td>
91
+ </tr>
92
+ <tr>
93
+ <td>Max (ms)</td>
94
+ <td>138.6737</td>
95
+ <td>213.1680</td>
96
+ <td>53.7%</td>
97
+ </tr>
98
+ <tr>
99
+ <td>Accuracy</td>
100
+ <td>67.0%</td>
101
+ <td>67.0%</td>
102
+ <td>0.0pp</td>
103
+ </tr>
104
+ </table>
105
+ </div>
106
+
107
+ <div class="model-section">
108
+ <div class="model-title">A5b Adaboost</div>
109
+ <table>
110
+ <tr>
111
+ <th>Metric</th>
112
+ <th>Multi-benchmark</th>
113
+ <th>Single-benchmark</th>
114
+ <th>Change</th>
115
+ </tr>
116
+ <tr>
117
+ <td>Mean (ms)</td>
118
+ <td>34.6698</td>
119
+ <td>33.1184</td>
120
+ <td>-4.5%</td>
121
+ </tr>
122
+ <tr>
123
+ <td>Std (ms)</td>
124
+ <td>6.9252</td>
125
+ <td>3.6793</td>
126
+ <td>-46.9%</td>
127
+ </tr>
128
+ <tr>
129
+ <td>Min (ms)</td>
130
+ <td>30.5001</td>
131
+ <td>30.1910</td>
132
+ <td>-1.0%</td>
133
+ </tr>
134
+ <tr>
135
+ <td>Max (ms)</td>
136
+ <td>48.3568</td>
137
+ <td>67.5596</td>
138
+ <td>39.7%</td>
139
+ </tr>
140
+ <tr>
141
+ <td>Accuracy</td>
142
+ <td>52.0%</td>
143
+ <td>52.0%</td>
144
+ <td>0.0pp</td>
145
+ </tr>
146
+ </table>
147
+ </div>
148
+
149
+ <div class="model-section">
150
+ <div class="model-title">A5b Bagging Trees</div>
151
+ <table>
152
+ <tr>
153
+ <th>Metric</th>
154
+ <th>Multi-benchmark</th>
155
+ <th>Single-benchmark</th>
156
+ <th>Change</th>
157
+ </tr>
158
+ <tr>
159
+ <td>Mean (ms)</td>
160
+ <td>6.0758</td>
161
+ <td>3.0341</td>
162
+ <td>-50.1%</td>
163
+ </tr>
164
+ <tr>
165
+ <td>Std (ms)</td>
166
+ <td>1.7927</td>
167
+ <td>1.2043</td>
168
+ <td>-32.8%</td>
169
+ </tr>
170
+ <tr>
171
+ <td>Min (ms)</td>
172
+ <td>3.8333</td>
173
+ <td>2.4478</td>
174
+ <td>-36.1%</td>
175
+ </tr>
176
+ <tr>
177
+ <td>Max (ms)</td>
178
+ <td>9.7910</td>
179
+ <td>17.5220</td>
180
+ <td>79.0%</td>
181
+ </tr>
182
+ <tr>
183
+ <td>Accuracy</td>
184
+ <td>0.0%</td>
185
+ <td>0.0%</td>
186
+ <td>0.0pp</td>
187
+ </tr>
188
+ </table>
189
+ </div>
190
+
191
+ <div class="model-section">
192
+ <div class="model-title">A6 SVM</div>
193
+ <table>
194
+ <tr>
195
+ <th>Metric</th>
196
+ <th>Multi-benchmark</th>
197
+ <th>Single-benchmark</th>
198
+ <th>Change</th>
199
+ </tr>
200
+ <tr>
201
+ <td>Mean (ms)</td>
202
+ <td>9.1022</td>
203
+ <td>0.6455</td>
204
+ <td>-92.9%</td>
205
+ </tr>
206
+ <tr>
207
+ <td>Std (ms)</td>
208
+ <td>0.3233</td>
209
+ <td>0.0336</td>
210
+ <td>-89.6%</td>
211
+ </tr>
212
+ <tr>
213
+ <td>Min (ms)</td>
214
+ <td>8.6898</td>
215
+ <td>0.6043</td>
216
+ <td>-93.0%</td>
217
+ </tr>
218
+ <tr>
219
+ <td>Max (ms)</td>
220
+ <td>9.6271</td>
221
+ <td>1.1998</td>
222
+ <td>-87.5%</td>
223
+ </tr>
224
+ <tr>
225
+ <td>Accuracy</td>
226
+ <td>83.0%</td>
227
+ <td>83.0%</td>
228
+ <td>0.0pp</td>
229
+ </tr>
230
+ </table>
231
+ </div>
232
+
233
+ </body>
234
+ </html>
A6/benchmark_results/visualizations/response_time_comparison.png ADDED

Git LFS Details

  • SHA256: 0ec363043b93e8d327675bec5ae0946b321bd1cea2a28bcccad4ce98c63fcfe2
  • Pointer size: 131 Bytes
  • Size of remote file: 960 kB
A6/benchmark_results/visualizations/standard_deviation_comparison.png ADDED

Git LFS Details

  • SHA256: 1a9479a4d33aa87e5c7e8ae7aba6cdae32b3d794f2e8e456bd9a415923a325a8
  • Pointer size: 131 Bytes
  • Size of remote file: 200 kB
A6/benchmark_results/visualizations/summary_statistics.png ADDED

Git LFS Details

  • SHA256: 2e6904ca54eed3c36235aede9e33c43db26a03a495a2c766c0900ce1cf0acd99
  • Pointer size: 131 Bytes
  • Size of remote file: 212 kB
A6/benchmark_timing.md ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Standardized Timing Benchmarking Framework
2
+
3
+ A comprehensive benchmarking framework for fair and consistent comparison of classification models (A4, A5, A5b, A6).
4
+
5
+ ## Features
6
+
7
+ This framework provides standardized metrics for model comparison:
8
+
9
+ - **Inference Time**: Mean, standard deviation, min, max, and percentiles (P50, P95, P99)
10
+ - **Memory Usage**: Mean, standard deviation, and peak memory consumption
11
+ - **Prediction Accuracy**: Correct predictions and accuracy percentage
12
+ - **Model Characteristics**: Model size, number of features, model type
13
+ - **Consistent Data Pipeline**: Uses the same data processing for all models
14
+
15
+ ## Installation
16
+
17
+ No additional dependencies required. Uses existing project dependencies:
18
+ - `numpy`
19
+ - `pandas`
20
+ - `scikit-learn`
21
+ - `pickle` (standard library)
22
+
23
+ ## Usage
24
+
25
+ ### Basic Usage
26
+
27
+ ```bash
28
+ python benchmark_timing.py
29
+ ```
30
+
31
+ ### Advanced Usage
32
+
33
+ ```bash
34
+ # Specify number of samples and repeats
35
+ python benchmark_timing.py --samples 200 --repeats 20
36
+
37
+ # Save results to specific file
38
+ python benchmark_timing.py --output results/my_benchmark.json
39
+
40
+ # Print comparison table
41
+ python benchmark_timing.py --compare
42
+
43
+ # Print model recommendations
44
+ python benchmark_timing.py --recommend
45
+
46
+ # All options combined
47
+ python benchmark_timing.py -n 150 -r 15 -o results/benchmark.json -c -R
48
+ ```
49
+
50
+ ### Command Line Arguments
51
+
52
+ | Argument | Short | Description | Default |
53
+ |----------|-------|-------------|---------|
54
+ | `--samples` | `-n` | Number of test samples | 100 |
55
+ | `--repeats` | `-r` | Number of repetitions per sample | 10 |
56
+ | `--output` | `-o` | Output file path for JSON results | Auto-generated |
57
+ | `--compare` | `-c` | Print comparison table | False |
58
+ | `--recommend` | `-R` | Print model recommendations | False |
59
+
60
+ ## Output
61
+
62
+ ### Console Output
63
+
64
+ The framework prints real-time progress and results:
65
+
66
+ ```
67
+ ======================================================================
68
+ STANDARDIZED TIMING BENCHMARKING FRAMEWORK
69
+ ======================================================================
70
+
71
+ Configuration:
72
+ Number of samples: 100
73
+ Number of repeats per sample: 10
74
+ Total predictions per model: 1000
75
+
76
+ Loading data...
77
+ Movement features shape: (1000, 150)
78
+ Weak link scores shape: (1000, 20)
79
+ Merged dataset shape: (1000, 165)
80
+ Feature matrix shape: (1000, 160)
81
+ Number of features: 160
82
+ Number of classes: 14
83
+
84
+ ======================================================================
85
+ Running Benchmarks
86
+ ======================================================================
87
+
88
+ Benchmarking A4 Random Forest...
89
+
90
+ A4 Random Forest Results:
91
+ Status: SUCCESS
92
+ Inference Time:
93
+ Mean: 1.234 ms
94
+ Std: 0.123 ms
95
+ P50: 1.200 ms
96
+ P95: 1.500 ms
97
+ P99: 1.800 ms
98
+ Memory Usage:
99
+ Mean: 256.5 KB
100
+ Peak: 512.0 KB
101
+ Accuracy: 78.5% (78/100)
102
+ Model Size: 1250.0 KB
103
+ Features: 160
104
+ ```
105
+
106
+ ### JSON Results
107
+
108
+ Results are saved to JSON format with all metrics:
109
+
110
+ ```json
111
+ {
112
+ "timestamp": "2024-01-15T10:30:45.123456",
113
+ "num_samples": 100,
114
+ "num_repeats": 10,
115
+ "models": {
116
+ "A4 Random Forest": {
117
+ "model_name": "A4 Random Forest",
118
+ "model_path": "../A4/models/weaklink_classifier_rf.pkl",
119
+ "inference_time_mean": 0.001234,
120
+ "inference_time_std": 0.000123,
121
+ "inference_time_min": 0.001000,
122
+ "inference_time_max": 0.001800,
123
+ "inference_time_p50": 0.001200,
124
+ "inference_time_p95": 0.001500,
125
+ "inference_time_p99": 0.001800,
126
+ "memory_usage_mean": 262656.0,
127
+ "memory_usage_std": 10240.0,
128
+ "memory_usage_peak": 524288.0,
129
+ "accuracy": 0.785,
130
+ "predictions_correct": 78,
131
+ "predictions_total": 100,
132
+ "model_size_bytes": 1280000,
133
+ "num_features": 160,
134
+ "num_parameters": 10,
135
+ "model_type": "RandomForestClassifier",
136
+ "timing_samples": [0.0012, 0.0013, ...],
137
+ "memory_samples": [262144, 266240, ...],
138
+ "status": "SUCCESS",
139
+ "error_message": ""
140
+ }
141
+ }
142
+ }
143
+ ```
144
+
145
+ ## Model Comparison Table
146
+
147
+ With `--compare` flag, prints a formatted comparison:
148
+
149
+ ```
150
+ ==========================================================================
151
+ MODEL COMPARISON SUMMARY
152
+ ==========================================================================
153
+ Model Time (ms) Std P95 Acc (%) Mem (KB) Size (KB)
154
+ --------------------------------------------------------------------------
155
+ A5b Adaboost 0.850 0.050 1.100 75.2 128.5 512.0
156
+ A5 Ensemble 1.100 0.080 1.350 79.8 256.3 768.0
157
+ A4 Random Forest 1.234 0.123 1.500 78.5 256.5 1250.0
158
+ A5b Bagging Trees 1.450 0.150 1.800 77.1 384.2 1024.0
159
+ A6 SVM 2.100 0.200 2.500 81.2 512.0 2048.0
160
+ ==========================================================================
161
+ ```
162
+
163
+ ## Model Recommendations
164
+
165
+ With `--recommend` flag, provides optimal model suggestions:
166
+
167
+ ```
168
+ ======================================================================
169
+ MODEL RECOMMENDATIONS
170
+ ======================================================================
171
+
172
+ Fastest Inference:
173
+ Model: A5b Adaboost
174
+ Inference Time: 0.850 ms
175
+
176
+ Highest Accuracy:
177
+ Model: A6 SVM
178
+ Accuracy: 81.2%
179
+
180
+ Lowest Memory Usage:
181
+ Model: A5b Adaboost
182
+ Memory Usage: 128.5 KB
183
+
184
+ Best Balanced Performance:
185
+ Model: A5 Ensemble
186
+ Inference Time: 1.100 ms
187
+ Accuracy: 79.8%
188
+ Memory Usage: 256.3 KB
189
+ ```
190
+
191
+ ## Benchmarking Metrics Explained
192
+
193
+ ### Inference Time Metrics
194
+
195
+ | Metric | Description |
196
+ |--------|-------------|
197
+ | **Mean** | Average inference time across all repetitions |
198
+ | **Std** | Standard deviation (variability) |
199
+ | **Min/Max** | Fastest and slowest inference times |
200
+ | **P50** | Median (50th percentile) |
201
+ | **P95** | 95th percentile (95% of predictions are faster) |
202
+ | **P99** | 99th percentile (99% of predictions are faster) |
203
+
204
+ ### Memory Metrics
205
+
206
+ | Metric | Description |
207
+ |--------|-------------|
208
+ | **Mean** | Average memory usage |
209
+ | **Std** | Standard deviation of memory usage |
210
+ | **Peak** | Maximum memory consumed |
211
+
212
+ ### Accuracy Metrics
213
+
214
+ | Metric | Description |
215
+ |--------|-------------|
216
+ | **Accuracy** | Percentage of correct predictions |
217
+ | **Predictions Correct/Total** | Raw counts |
218
+
219
+ ## Implementation Details
220
+
221
+ ### Data Pipeline
222
+
223
+ All models use the same data loading and preprocessing pipeline:
224
+ 1. Load movement features and weaklink scores
225
+ 2. Create WeakestLink target column
226
+ 3. Merge datasets
227
+ 4. Extract features (excluding ID, WeakestLink, EstimatedScore)
228
+ 5. Train/test split (80/20, stratified, random_state=42)
229
+ 6. StandardScaler fitted on training data
230
+
231
+ ### Feature Handling
232
+
233
+ - A4 Random Forest model was trained WITH duplicate NASM columns
234
+ - Other models (A5, A5b, A6) were trained WITHOUT duplicate NASM columns
235
+ - The framework automatically filters features based on each model's expectations
236
+
237
+ ### Memory Tracking
238
+
239
+ Uses Python's `tracemalloc` module for accurate memory measurement:
240
+ - Tracks memory before and after each prediction
241
+ - Records both current and peak memory usage
242
+
243
+ ### Timing Precision
244
+
245
+ Uses `time.perf_counter()` for high-resolution timing measurements.
246
+
247
+ ## Extending the Framework
248
+
249
+ ### Adding New Models
250
+
251
+ 1. Add model path to `all_classification.py`:
252
+ ```python
253
+ a7_new_model = "../A7/models/new_model.pkl"
254
+ ```
255
+
256
+ 2. Import in `benchmark_timing.py`:
257
+ ```python
258
+ from all_classification import (
259
+ a4_rf,
260
+ a5_ensemnble,
261
+ a5b_adaboost,
262
+ a5b_bagging_tree,
263
+ a6_svm,
264
+ a7_new_model, # Add here
265
+ )
266
+ ```
267
+
268
+ 3. Add to `models_to_benchmark` list in `run_benchmark()`:
269
+ ```python
270
+ models_to_benchmark = [
271
+ ('A4 Random Forest', a4_rf),
272
+ ('A5 Ensemble', a5_ensemnble),
273
+ ('A5b Adaboost', a5b_adaboost),
274
+ ('A5b Bagging Trees', a5b_bagging_tree),
275
+ ('A6 SVM', a6_svm),
276
+ ('A7 New Model', a7_new_model), # Add here
277
+ ]
278
+ ```
279
+
280
+ ### Adding New Metrics
281
+
282
+ Extend the `ModelMetrics` dataclass:
283
+
284
+ ```python
285
+ @dataclass
286
+ class ModelMetrics:
287
+ # ... existing fields ...
288
+ new_metric: float = 0.0
289
+ ```
290
+
291
+ Then calculate it in `benchmark_single_model()`.
292
+
293
+ ## Troubleshooting
294
+
295
+ ### Model Loading Errors
296
+
297
+ If a model fails to load:
298
+ 1. Verify the model file exists at the specified path
299
+ 2. Check that all required dependencies are installed
300
+ 3. Ensure custom classes (like `AdaBoostEnsemble`) are importable
301
+
302
+ ### Inconsistent Results
303
+
304
+ If results vary significantly between runs:
305
+ 1. Increase `--repeats` for better averaging
306
+ 2. Ensure no other processes are running during benchmarking
307
+ 3. Check system resource utilization
308
+
309
+ ### Memory Issues
310
+
311
+ If running out of memory:
312
+ 1. Reduce `--samples` count
313
+ 2. Close other applications
314
+ 3. Check available system memory
315
+
316
+ ## License
317
+
318
+ Part of the Data Intensive Systems project.
319
+ ```
320
+
321
+ Now let me also create a simple comparison script for quick analysis:
322
+
323
+ <tool_call>
324
+ <function=edit_file>
325
+ <parameter=display_description>
326
+ Create simple comparison script for quick analysis
327
+ </parameter>
328
+ <parameter=path>
329
+ /home/cyclonaut/Dokumente/Studium VΓ€xjΓΆ/data intensive project/repo_march/Data-intensive-systems/A6/compare_benchmarks.py
330
+ </parameter>
331
+ <parameter=mode>
332
+ create
333
+ </parameter>
334
+ </function>
335
+ </tool_call>
A6/benchmark_timing.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standardized Timing Benchmarking Framework for Classification Models
4
+
5
+ This framework provides fair and consistent timing benchmarks for comparing
6
+ classification models (A4, A5, A5b, A6) with metrics for:
7
+ - Inference time (mean, std, min, max, percentiles)
8
+ - Memory usage
9
+ - Prediction accuracy
10
+ - Model size
11
+ - Feature extraction time
12
+
13
+ Usage:
14
+ python benchmark_timing.py [--samples N] [--repeats M] [--output FILE]
15
+
16
+ Author: Benchmark Framework v1.0
17
+ """
18
+
19
+ import os
20
+ import sys
21
+ import pickle
22
+ import time
23
+ import tracemalloc
24
+ import warnings
25
+ import json
26
+ import numpy as np
27
+ import pandas as pd
28
+ from pathlib import Path
29
+ from datetime import datetime
30
+ from sklearn.preprocessing import StandardScaler
31
+ from sklearn.model_selection import train_test_split
32
+ from typing import Dict, List, Tuple, Optional, Any
33
+ from dataclasses import dataclass, field, asdict
34
+ from collections import defaultdict
35
+ import statistics
36
+
37
+ # Suppress warnings for cleaner output
38
+ warnings.filterwarnings('ignore')
39
+
40
+ # Add project root to path
41
+ project_root = os.path.abspath(os.path.dirname(__file__))
42
+ sys.path.insert(0, project_root)
43
+
44
+ # Import model paths
45
+ from all_classification import (
46
+ a4_rf,
47
+ a5_ensemnble,
48
+ a5b_adaboost,
49
+ a5b_bagging_tree,
50
+ a6_svm
51
+ )
52
+
53
+ # Import custom classes for unpickling
54
+ from adaboost_classes import (
55
+ AdaBoostEnsemble,
56
+ WeightedDecisionTree
57
+ )
58
+
59
+ # ============================================================================
60
+ # Configuration
61
+ # ============================================================================
62
+
63
+ REPO_ROOT = os.path.abspath(os.path.join(project_root, '..'))
64
+ DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
65
+ OUTPUT_DIR = os.path.join(project_root, 'benchmark_results')
66
+
67
+ # Weaklink categories (14 classes)
68
+ WEAKLINK_CATEGORIES = [
69
+ 'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
70
+ 'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
71
+ 'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
72
+ 'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
73
+ 'RightKneeMovesOutward', 'RightShoulderElevation'
74
+ ]
75
+
76
+ # Duplicate NASM columns
77
+ DUPLICATE_NASM_COLS = [
78
+ 'No_1_NASM_Deviation',
79
+ 'No_2_NASM_Deviation',
80
+ 'No_3_NASM_Deviation',
81
+ 'No_4_NASM_Deviation',
82
+ 'No_5_NASM_Deviation',
83
+ ]
84
+
85
+ EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
86
+ EXPECTED_CLASSES = WEAKLINK_CATEGORIES.copy()
87
+
88
+ # Benchmark parameters
89
+ DEFAULT_NUM_SAMPLES = 100
90
+ DEFAULT_NUM_REPEATES = 10
91
+ DEFAULT_OUTPUT_FILE = None
92
+
93
+
94
+ # ============================================================================
95
+ # Data Classes for Results
96
+ # ============================================================================
97
+
98
@dataclass
class ModelMetrics:
    """Metrics for a single model benchmark.

    The raw sample lists hold one entry per timed prediction; the summary
    statistics are derived from them in benchmark_single_model().
    """
    model_name: str  # human-readable name, e.g. 'A4 Random Forest'
    model_path: str  # pickle path relative to the project root

    # Timing metrics (seconds). In batch mode each sample times one
    # whole-batch predict(); in single-sample mode each times one row.
    inference_time_mean: float = 0.0
    inference_time_std: float = 0.0
    inference_time_min: float = 0.0
    inference_time_max: float = 0.0
    inference_time_p50: float = 0.0  # median
    inference_time_p95: float = 0.0
    inference_time_p99: float = 0.0

    # Memory metrics (bytes): tracemalloc peak per prediction, i.e.
    # Python-level allocations only.
    memory_usage_mean: float = 0.0
    memory_usage_std: float = 0.0
    memory_usage_peak: float = 0.0

    # Prediction metrics (accuracy over the benchmark samples)
    accuracy: float = 0.0
    predictions_correct: int = 0
    predictions_total: int = 0

    # Model characteristics
    model_size_bytes: int = 0  # pickle size on disk
    num_features: int = 0
    num_parameters: int = 0    # coarse estimator count, see get_model_info()
    model_type: str = ""       # class name of the loaded estimator

    # Feature extraction time (seconds)
    # NOTE(review): not populated anywhere in the benchmarking code visible
    # in this module — confirm whether it is still needed.
    feature_extraction_time_mean: float = 0.0

    # Raw timing/memory samples, one entry per timed prediction
    timing_samples: List[float] = field(default_factory=list)
    memory_samples: List[float] = field(default_factory=list)

    # Status: "SUCCESS", "LOAD_ERROR" or "INFERENCE_ERROR"
    status: str = "SUCCESS"
    error_message: str = ""
139
+
140
+
141
@dataclass
class BenchmarkResults:
    """Complete benchmark results for all models in one run."""
    timestamp: str    # ISO-8601 timestamp taken when the run starts
    num_samples: int  # rows benchmarked per model
    num_repeats: int  # repetitions per prediction
    models: Dict[str, ModelMetrics] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        serialized_models = {}
        for name, metrics in self.models.items():
            entry = asdict(metrics)
            # Force plain lists so the payload stays JSON-friendly.
            entry['timing_samples'] = list(metrics.timing_samples)
            entry['memory_samples'] = list(metrics.memory_samples)
            serialized_models[name] = entry
        return {
            'timestamp': self.timestamp,
            'num_samples': self.num_samples,
            'num_repeats': self.num_repeats,
            'models': serialized_models,
        }

    def to_json(self, filepath: Optional[str] = None) -> str:
        """Serialize to a JSON string, optionally writing it to *filepath*."""
        json_str = json.dumps(self.to_dict(), indent=2, default=str)
        if filepath:
            # Make sure the destination directory exists before writing.
            os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
            with open(filepath, 'w') as handle:
                handle.write(json_str)
        return json_str
176
+
177
+
178
+ # ============================================================================
179
+ # Data Loading Functions
180
+ # ============================================================================
181
+
182
def load_and_prepare_data() -> Dict[str, Any]:
    """Load and prepare data following the same pipeline as classification_baseline.py.

    Reads two CSVs from DATA_DIR, derives the WeakestLink target, merges on
    ID, and produces a stratified 80/20 train/test split plus a fitted scaler.

    Returns:
        Dictionary containing:
        - feature_columns: List of feature column names
        - scaler: StandardScaler fitted on the training split only
        - X_train, X_test: Feature matrices (unscaled)
        - X_train_scaled, X_test_scaled: Scaled feature matrices
        - y_train, y_test: Target arrays
        - merged_df: Merged dataframe
    """
    # Load datasets
    movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
    weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

    print(f' Movement features shape: {movement_features_df.shape}')
    print(f' Weak link scores shape: {weaklink_scores_df.shape}')

    # Target = name of the weak-link category with the highest score per row
    weaklink_scores_df['WeakestLink'] = (
        weaklink_scores_df[WEAKLINK_CATEGORIES].idxmax(axis=1)
    )

    # Merge datasets (inner join keeps only IDs present in both files)
    target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
    merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
    print(f' Merged dataset shape: {merged_df.shape}')

    # Extract feature columns - include ALL columns except EXCLUDE_COLS
    # (duplicate NASM columns are intentionally kept; models trained without
    # them get those columns filtered out later)
    feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]

    X = merged_df[feature_columns].values
    y = merged_df['WeakestLink'].values

    print(f' Feature matrix shape: {X.shape}')
    print(f' Number of features: {len(feature_columns)}')
    print(f' Number of classes: {len(np.unique(y))}')

    # Create train/test split (stratified for stable class proportions)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit scaler on training data only (avoids test-set leakage)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return {
        'feature_columns': feature_columns,
        'scaler': scaler,
        'X_train': X_train,
        'X_train_scaled': X_train_scaled,
        'y_train': y_train,
        'X_test': X_test,
        'X_test_scaled': X_test_scaled,
        'y_test': y_test,
        'merged_df': merged_df,
    }
241
+
242
+
243
def create_samples_from_test_data(
    data: Dict[str, Any],
    num_samples: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Select a benchmarking subset from the held-out test split.

    Args:
        data: Dictionary returned by load_and_prepare_data()
        num_samples: Upper bound on how many samples to take

    Returns:
        Tuple of (sample_features, true_labels) — the first rows of the test
        split, truncated to at most num_samples entries.
    """
    features = data['X_test']
    labels = data['y_test']

    # Never request more rows than the test split actually holds.
    take = min(num_samples, len(features))
    return features[:take], labels[:take]
266
+
267
+
268
+ # ============================================================================
269
+ # Model Loading Functions
270
+ # ============================================================================
271
+
272
def load_model(model_path: str, model_name: str) -> Tuple[Any, Optional[Any], Optional[List[str]], Any]:
    """Load a model from a pickle file.

    Two artifact layouts are supported:
      * a dict with 'model' / 'scaler' / 'feature_columns' entries, and
      * a bare sklearn Pipeline (the A6 SVM), in which case the scaler and
        feature names are recovered by inspecting the pipeline steps.

    Args:
        model_path: Path to the pickle file, relative to project_root
        model_name: Name of the model for logging

    Returns:
        Tuple of (model, scaler, feature_columns, artifact); all elements are
        None when the file is missing or cannot be unpickled.
    """
    import re  # hoisted: was re-imported inside the step loop on every call

    full_path = os.path.join(project_root, model_path)

    if not os.path.exists(full_path):
        print(f" ⚠️ Model file not found: {full_path}")
        return None, None, None, None

    try:
        # NOTE: pickle.load executes arbitrary code — only load trusted artifacts.
        with open(full_path, 'rb') as f:
            artifact = pickle.load(f)

        # Extract model and scaler based on artifact structure
        if isinstance(artifact, dict):
            model = artifact.get('model')
            scaler = artifact.get('scaler')
            feature_columns = artifact.get('feature_columns')
        else:
            # A6 SVM is a Pipeline object
            model = artifact
            scaler = None
            feature_columns = None

            # Extract scaler from pipeline if it exists: the first fitted
            # transformer step that is not itself a predictor.
            if hasattr(model, 'steps') and len(model.steps) >= 1:
                for step_name, step_obj in model.steps:
                    if hasattr(step_obj, 'transform'):
                        if hasattr(step_obj, 'n_features_in_') and not hasattr(step_obj, 'predict'):
                            scaler = step_obj
                            break

            # Extract feature columns from the first step, unless they are
            # sklearn's generic placeholder names (x0, x1, ...).
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                if hasattr(first_step, 'get_feature_names_out'):
                    try:
                        names = first_step.get_feature_names_out()
                        if not all(re.fullmatch(r'x\d+', n) for n in names):
                            feature_columns = names
                    except Exception:
                        # Best effort only; feature_columns stays None.
                        # (Was a bare `except:`, which would also swallow
                        # KeyboardInterrupt/SystemExit.)
                        pass

        print(f" βœ“ Loaded {model_name}")
        return model, scaler, feature_columns, artifact
    except Exception as e:
        print(f" βœ— Error loading {model_name}: {e}")
        return None, None, None, None
328
+
329
+
330
def get_model_info(model: Any) -> Dict[str, Any]:
    """Summarize a fitted model's characteristics for the benchmark report.

    Args:
        model: The trained estimator (plain model, ensemble or pipeline)

    Returns:
        Dict with 'model_type', 'num_parameters', 'num_features' and, when
        the model exposes classes_, a 'num_classes' entry.
    """
    details: Dict[str, Any] = {
        'model_type': type(model).__name__,
        'num_parameters': 0,
        'num_features': 0,
    }

    # "Parameters" is a coarse proxy: the declared estimator count plus the
    # number of fitted sub-estimators, whenever either attribute is present.
    if hasattr(model, 'n_estimators'):
        details['num_parameters'] += getattr(model, 'n_estimators', 0)
    if hasattr(model, 'estimators_'):
        details['num_parameters'] += len(getattr(model, 'estimators_', []))

    if hasattr(model, 'n_features_in_'):
        details['num_features'] = model.n_features_in_
    if hasattr(model, 'classes_'):
        details['num_classes'] = len(model.classes_)

    # Ensembles: prefer the feature count reported by the first fitted base
    # estimator over the top-level value.
    for member in getattr(model, 'estimators_', []):
        if hasattr(member, 'n_features_in_'):
            details['num_features'] = member.n_features_in_
            break

    return details
366
+
367
+
368
+ # ============================================================================
369
+ # Benchmarking Functions
370
+ # ============================================================================
371
+
372
def measure_inference_time(
    model: Any,
    scaler: Optional[Any],
    sample_features: np.ndarray,
    model_feature_columns: Optional[List[str]],
    feature_columns: List[str],
    num_repeats: int,
    single_sample_mode: bool = False
) -> Tuple[List[float], List[float], Optional[str]]:
    """Measure inference time for a model.

    The timed region includes any scaler.transform() preprocessing, so the
    figures reflect end-to-end prediction latency, not model.predict() alone.
    Memory is tracked per prediction with tracemalloc, which sees Python-level
    allocations only (native/C allocations are not captured).

    Args:
        model: The trained model
        scaler: Scaler for feature preprocessing
        sample_features: Input features
        model_feature_columns: Expected feature columns for the model
        feature_columns: All available feature columns
        num_repeats: Number of repetitions for averaging
        single_sample_mode: If True, measure each sample individually (for single sample latency)

    Returns:
        Tuple of (timing_samples, memory_samples, error_message); on failure
        the sample lists are empty and error_message holds str(exception).
    """
    timing_samples = []
    memory_samples = []

    try:
        # Filter features if needed
        if model_feature_columns is not None:
            available_features = [f for f in model_feature_columns if f in feature_columns]
            if len(available_features) > 0:
                # Convert column names to indices for numpy array
                feature_indices = [feature_columns.index(f) for f in available_features]
                test_features = sample_features[:, feature_indices]
            else:
                test_features = sample_features
        else:
            # model_feature_columns is None - likely A6 SVM pipeline
            # Check if we need to drop duplicate NASM columns
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                n_expected = getattr(first_step, 'n_features_in_', None)
                if n_expected is not None:
                    # Identify indices of duplicate NASM columns
                    dup_indices = [i for i, c in enumerate(feature_columns) if c in DUPLICATE_NASM_COLS]
                    # Get all indices except duplicate NASM columns
                    valid_indices = [i for i in range(len(feature_columns)) if i not in dup_indices]
                    if len(valid_indices) == n_expected:
                        # Select only the columns that match expected features
                        test_features = sample_features[:, valid_indices]
                    else:
                        # Fallback: slice to expected number of features
                        test_features = sample_features[:, :n_expected]
                else:
                    test_features = sample_features
            else:
                test_features = sample_features

        # Handle A6 SVM pipeline (scaler already in pipeline — avoid scaling twice)
        if model_feature_columns is None and hasattr(model, 'steps'):
            scaler_to_use = None
        else:
            scaler_to_use = scaler

        # Determine how many predictions to make
        if single_sample_mode:
            # For single sample mode: repeat each sample individually
            num_predictions = num_repeats * len(test_features)
        else:
            # For batch mode: num_repeats on all samples
            num_predictions = num_repeats

        for i in range(num_predictions):
            # Start memory tracking just before the timed region
            tracemalloc.start()
            start_time = time.perf_counter()

            # Make prediction
            if single_sample_mode:
                # Single sample prediction: cycle through rows one at a time
                single_sample = test_features[i % len(test_features)].reshape(1, -1)
                if scaler_to_use is not None:
                    features = scaler_to_use.transform(single_sample)
                else:
                    features = single_sample
            else:
                # Batch prediction: use all samples
                if scaler_to_use is not None:
                    features = scaler_to_use.transform(test_features)
                else:
                    features = test_features

            prediction = model.predict(features)

            end_time = time.perf_counter()
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()

            # Record measurements (peak = max Python allocation during the call)
            timing_samples.append(end_time - start_time)
            memory_samples.append(peak)

        return timing_samples, memory_samples, None

    except Exception as e:
        # Any failure (shape mismatch, missing attribute, ...) is reported
        # back to the caller instead of raising.
        return [], [], str(e)
478
+
479
+
480
def calculate_percentiles(values: List[float]) -> Dict[str, float]:
    """Calculate the 50th/95th/99th percentiles of a list of values.

    Uses linear interpolation between the two nearest ranks (the same
    convention as numpy.percentile's default), which fixes the bias of the
    previous truncation-based nearest-rank lookup — e.g. the median of
    [1, 2, 3, 4] is now 2.5 instead of 3.

    Args:
        values: List of numeric values (may be empty)

    Returns:
        Dictionary with 'p50', 'p95' and 'p99' entries; all 0.0 when the
        input is empty.
    """
    if not values:
        return {
            'p50': 0.0,
            'p95': 0.0,
            'p99': 0.0
        }

    ordered = sorted(values)
    last = len(ordered) - 1

    def interpolate(q: float) -> float:
        # Fractional rank into the sorted sample; blend neighbours linearly.
        rank = q * last
        lo = int(rank)
        hi = min(lo + 1, last)
        frac = rank - lo
        return ordered[lo] + (ordered[hi] - ordered[lo]) * frac

    return {
        'p50': interpolate(0.50),
        'p95': interpolate(0.95),
        'p99': interpolate(0.99)
    }
504
+
505
+
506
def benchmark_single_model(
    model_name: str,
    model_path: str,
    sample_features: np.ndarray,
    true_labels: np.ndarray,
    feature_columns: List[str],
    num_repeats: int,
    single_sample_mode: bool = False
) -> ModelMetrics:
    """Benchmark a single model.

    Loads the model, times its predictions via measure_inference_time(), and
    evaluates accuracy on the same samples. The feature-selection logic below
    duplicates the one in measure_inference_time() so accuracy is computed on
    exactly the features the timing run used.

    Args:
        model_name: Name of the model
        model_path: Path to the model file
        sample_features: Input features for benchmarking
        true_labels: Ground truth labels
        feature_columns: All available feature columns
        num_repeats: Number of repetitions
        single_sample_mode: If True, measure each sample individually (for single sample latency)

    Returns:
        ModelMetrics object with benchmark results; status is one of
        "SUCCESS", "LOAD_ERROR" or "INFERENCE_ERROR".
    """
    metrics = ModelMetrics(model_name=model_name, model_path=model_path)

    print(f"\n Benchmarking {model_name}...")

    # Load model
    model, scaler, model_feature_columns, artifact = load_model(model_path, model_name)

    if model is None:
        metrics.status = "LOAD_ERROR"
        metrics.error_message = "Failed to load model"
        return metrics

    # Get model info
    model_info = get_model_info(model)
    metrics.model_type = model_info.get('model_type', type(model).__name__)
    metrics.num_features = model_info.get('num_features', 0)

    # Get model size on disk
    # NOTE(review): bare `except:` also swallows KeyboardInterrupt/SystemExit;
    # `except OSError:` would be the precise exception here.
    try:
        model_size = os.path.getsize(os.path.join(project_root, model_path))
        metrics.model_size_bytes = model_size
    except:
        metrics.model_size_bytes = 0

    # Run inference benchmarks
    timing_samples, memory_samples, error = measure_inference_time(
        model, scaler, sample_features, model_feature_columns,
        feature_columns, num_repeats, single_sample_mode=single_sample_mode
    )

    if error:
        metrics.status = "INFERENCE_ERROR"
        metrics.error_message = error
        return metrics

    # Store raw samples
    metrics.timing_samples = timing_samples
    metrics.memory_samples = memory_samples

    # Calculate timing statistics (stdev needs at least two samples)
    if timing_samples:
        metrics.inference_time_mean = statistics.mean(timing_samples)
        metrics.inference_time_std = statistics.stdev(timing_samples) if len(timing_samples) > 1 else 0.0
        metrics.inference_time_min = min(timing_samples)
        metrics.inference_time_max = max(timing_samples)

        percentiles = calculate_percentiles(timing_samples)
        metrics.inference_time_p50 = percentiles['p50']
        metrics.inference_time_p95 = percentiles['p95']
        metrics.inference_time_p99 = percentiles['p99']

    # Calculate memory statistics
    if memory_samples:
        metrics.memory_usage_mean = statistics.mean(memory_samples)
        metrics.memory_usage_std = statistics.stdev(memory_samples) if len(memory_samples) > 1 else 0.0
        metrics.memory_usage_peak = max(memory_samples)

    # Test accuracy on the same samples
    try:
        # Filter features for prediction (mirrors measure_inference_time)
        if model_feature_columns is not None:
            available_features = [f for f in model_feature_columns if f in feature_columns]
            if len(available_features) > 0:
                # Convert column names to indices for numpy array
                feature_indices = [feature_columns.index(f) for f in available_features]
                test_features = sample_features[:, feature_indices]
            else:
                test_features = sample_features
        else:
            # model_feature_columns is None - likely A6 SVM pipeline
            # Check if we need to drop duplicate NASM columns
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                n_expected = getattr(first_step, 'n_features_in_', None)
                if n_expected is not None:
                    # Identify indices of duplicate NASM columns
                    dup_indices = [i for i, c in enumerate(feature_columns) if c in DUPLICATE_NASM_COLS]
                    # Get all indices except duplicate NASM columns
                    valid_indices = [i for i in range(len(feature_columns)) if i not in dup_indices]
                    if len(valid_indices) == n_expected:
                        # Select only the columns that match expected features
                        test_features = sample_features[:, valid_indices]
                    else:
                        # Fallback: slice to expected number of features
                        test_features = sample_features[:, :n_expected]
                else:
                    test_features = sample_features
            else:
                test_features = sample_features

        # Handle A6 SVM pipeline (scaling happens inside the pipeline)
        if model_feature_columns is None and hasattr(model, 'steps'):
            scaler_to_use = None
        else:
            scaler_to_use = scaler

        if scaler_to_use is not None:
            features = scaler_to_use.transform(test_features)
        else:
            features = test_features

        predictions = model.predict(features)

        # Calculate accuracy
        correct = np.sum(predictions == true_labels)
        metrics.predictions_correct = int(correct)
        metrics.predictions_total = len(true_labels)
        metrics.accuracy = correct / len(true_labels)

    except Exception as e:
        # Accuracy is best-effort; timing results are still reported.
        print(f" ⚠️ Accuracy calculation failed: {e}")

    metrics.status = "SUCCESS"
    return metrics
643
+
644
+
645
def run_benchmark(
    num_samples: int = DEFAULT_NUM_SAMPLES,
    num_repeats: int = DEFAULT_NUM_REPEATES,
    output_file: Optional[str] = None,
    single_sample_mode: bool = False
) -> BenchmarkResults:
    """Run complete benchmark on all models.

    Args:
        num_samples: Number of samples to benchmark
        num_repeats: Number of repetitions per sample
        output_file: Optional output file path for results
        single_sample_mode: If True, measure each sample individually (for single sample latency)

    Returns:
        BenchmarkResults object with all results (also written to JSON)
    """
    print("=" * 70)
    print("STANDARDIZED TIMING BENCHMARKING FRAMEWORK")
    print("=" * 70)
    print(f"\nConfiguration:")
    print(f" Number of samples: {num_samples}")
    print(f" Number of repeats per sample: {num_repeats}")
    # NOTE(review): this count only matches single-sample mode; in batch mode
    # measure_inference_time performs num_repeats predictions in total.
    print(f" Total predictions per model: {num_samples * num_repeats}")
    print()

    # Load data
    print("Loading data...")
    data = load_and_prepare_data()
    print()

    # Create samples (may be fewer than num_samples if the test split is small)
    sample_features, true_labels = create_samples_from_test_data(data, num_samples)
    print(f"Created {num_samples} test samples for benchmarking")
    print()

    # Define models to benchmark. The path values (a4_rf, a5_ensemnble, ...)
    # are module-level constants defined earlier in this file.
    models_to_benchmark = [
        ('A4 Random Forest', a4_rf),
        ('A5 Ensemble', a5_ensemnble),
        ('A5b Adaboost', a5b_adaboost),
        ('A5b Bagging Trees', a5b_bagging_tree),
        ('A6 SVM', a6_svm),
    ]

    # Initialize results container
    results = BenchmarkResults(
        timestamp=datetime.now().isoformat(),
        num_samples=num_samples,
        num_repeats=num_repeats
    )

    # Benchmark each model
    print("=" * 70)
    print("Running Benchmarks")
    print("=" * 70)

    for model_name, model_path in models_to_benchmark:
        metrics = benchmark_single_model(
            model_name=model_name,
            model_path=model_path,
            sample_features=sample_features,
            true_labels=true_labels,
            feature_columns=data['feature_columns'],
            num_repeats=num_repeats,
            single_sample_mode=single_sample_mode
        )
        results.models[model_name] = metrics

        # Print summary for this model
        print(f"\n {model_name} Results:")
        print(f" Status: {metrics.status}")

        if metrics.status == "SUCCESS":
            print(f" Inference Time:")
            print(f" Mean: {metrics.inference_time_mean*1000:.3f} ms")
            print(f" Std: {metrics.inference_time_std*1000:.3f} ms")
            print(f" P50: {metrics.inference_time_p50*1000:.3f} ms")
            print(f" P95: {metrics.inference_time_p95*1000:.3f} ms")
            print(f" P99: {metrics.inference_time_p99*1000:.3f} ms")
            print(f" Memory Usage:")
            print(f" Mean: {metrics.memory_usage_mean/1024:.1f} KB")
            print(f" Peak: {metrics.memory_usage_peak/1024:.1f} KB")
            print(f" Accuracy: {metrics.accuracy*100:.1f}% ({metrics.predictions_correct}/{metrics.predictions_total})")
            print(f" Model Size: {metrics.model_size_bytes/1024:.1f} KB")
            print(f" Features: {metrics.num_features}")
        else:
            print(f" Error: {metrics.error_message}")
        print()

    # Save results (default: timestamped JSON under OUTPUT_DIR)
    if output_file is None:
        output_file = os.path.join(OUTPUT_DIR, f"benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")

    json_output = results.to_json(output_file)
    print(f"Results saved to: {output_file}")

    return results
743
+
744
+
745
def run_single_sample_benchmark(
    num_samples: int = DEFAULT_NUM_SAMPLES,
    num_repeats: int = DEFAULT_NUM_REPEATES,
    output_file: Optional[str] = None
) -> BenchmarkResults:
    """Run the benchmark in per-row latency mode.

    Identical to run_benchmark() except that single_sample_mode is forced on,
    so every timed prediction covers exactly one sample — a more realistic
    view of online/serving latency than batch throughput.

    Args:
        num_samples: Number of samples to benchmark
        num_repeats: Number of repetitions per sample
        output_file: Optional output file path for results

    Returns:
        BenchmarkResults object with all results
    """
    return run_benchmark(num_samples, num_repeats, output_file, single_sample_mode=True)
769
+
770
+
771
+ # ============================================================================
772
+ # Comparison and Analysis Functions
773
+ # ============================================================================
774
+
775
def print_comparison_table(results: BenchmarkResults):
    """Print a formatted comparison table of all models, fastest first."""
    rule = "=" * 90
    print("\n" + rule)
    print("MODEL COMPARISON SUMMARY")
    print(rule)

    # Header row
    print(f"{'Model':<20} {'Time (ms)':<15} {'Std':<10} {'P95':<10} {'Acc (%)':<10} {'Mem (KB)':<12} {'Size (KB)':<12}")
    print("-" * 90)

    # Failed models sort to the bottom; successful ones by mean latency.
    def sort_key(entry):
        metrics = entry[1]
        return metrics.inference_time_mean if metrics.status == "SUCCESS" else float('inf')

    for model_name, metrics in sorted(results.models.items(), key=sort_key):
        if metrics.status != "SUCCESS":
            print(f"{model_name:<20} {'ERROR':<15} {'-':<10} {'-':<10} {'-':<10} {'-':<12} {'-':<12}")
            continue
        row = (
            f"{model_name:<20} {metrics.inference_time_mean * 1000:<15.3f} "
            f"{metrics.inference_time_std * 1000:<10.3f} "
            f"{metrics.inference_time_p95 * 1000:<10.3f} "
            f"{metrics.accuracy * 100:<10.1f} "
            f"{metrics.memory_usage_mean / 1024:<12.1f} "
            f"{metrics.model_size_bytes / 1024:<12.1f}"
        )
        print(row)

    print(rule)
805
+
806
+
807
def find_optimal_model(results: BenchmarkResults, priority: str = "speed"):
    """Pick the best successfully-benchmarked model for a given criterion.

    Args:
        results: BenchmarkResults object
        priority: One of "speed", "accuracy", "memory" or "balanced";
            any other value falls back to the "speed" behavior

    Returns:
        Tuple of (best_model_name, best_metrics), or (None, None) when no
        model completed successfully.
    """
    candidates = [
        (name, metrics) for name, metrics in results.models.items()
        if metrics.status == "SUCCESS"
    ]

    if not candidates:
        return None, None

    def weighted(entry):
        # Lower is better: mix latency, error rate and (scaled-down) memory.
        # Weights are a tunable heuristic.
        m = entry[1]
        return (0.5 * m.inference_time_mean
                + 0.3 * (1 - m.accuracy)
                + 0.2 * (m.memory_usage_mean / 1000000))

    if priority == "accuracy":
        return max(candidates, key=lambda e: e[1].accuracy)
    if priority == "memory":
        return min(candidates, key=lambda e: e[1].memory_usage_mean)
    if priority == "balanced":
        return min(candidates, key=weighted)
    # "speed" and any unrecognized priority: fastest mean inference time.
    return min(candidates, key=lambda e: e[1].inference_time_mean)
851
+
852
+
853
def print_recommendations(results: BenchmarkResults):
    """Print the winning model for each optimization criterion."""
    print("\n" + "=" * 70)
    print("MODEL RECOMMENDATIONS")
    print("=" * 70)

    # (label, find_optimal_model priority) pairs, reported in this order.
    for description, priority in (
        ("Fastest Inference", "speed"),
        ("Highest Accuracy", "accuracy"),
        ("Lowest Memory Usage", "memory"),
        ("Best Balanced Performance", "balanced"),
    ):
        model_name, metrics = find_optimal_model(results, priority)
        print(f"\n{description}:")
        if not model_name:
            print(" No valid models found")
            continue
        print(f" Model: {model_name}")
        # "balanced" reports all three figures; the others report one each.
        if priority in ("speed", "balanced"):
            print(f" Inference Time: {metrics.inference_time_mean*1000:.3f} ms")
        if priority in ("accuracy", "balanced"):
            print(f" Accuracy: {metrics.accuracy*100:.1f}%")
        if priority in ("memory", "balanced"):
            print(f" Memory Usage: {metrics.memory_usage_mean/1024:.1f} KB")
884
+
885
+
886
+ # ============================================================================
887
+ # Main Entry Point
888
+ # ============================================================================
889
+
890
def main():
    """CLI entry point: parse arguments, run the benchmark, report results."""
    import argparse

    cli = argparse.ArgumentParser(
        description='Standardized Timing Benchmarking Framework for Classification Models'
    )
    cli.add_argument(
        '--samples', '-n',
        type=int,
        default=DEFAULT_NUM_SAMPLES,
        help=f'Number of samples to benchmark (default: {DEFAULT_NUM_SAMPLES})'
    )
    cli.add_argument(
        '--repeats', '-r',
        type=int,
        default=DEFAULT_NUM_REPEATES,
        help=f'Number of repeats per sample (default: {DEFAULT_NUM_REPEATES})'
    )
    cli.add_argument(
        '--output', '-o',
        type=str,
        default=DEFAULT_OUTPUT_FILE,
        help='Output file for results (default: benchmark_results/timestamp.json)'
    )
    cli.add_argument(
        '--compare', '-c',
        action='store_true',
        help='Print comparison table after benchmarking'
    )
    cli.add_argument(
        '--recommend', '-R',
        action='store_true',
        help='Print model recommendations after benchmarking'
    )
    cli.add_argument(
        '--single-sample', '-s',
        action='store_true',
        help='Measure single sample prediction latency (default: batch mode)'
    )
    opts = cli.parse_args()

    # Both runners share the same keyword signature; pick one via the flag.
    runner = run_single_sample_benchmark if opts.single_sample else run_benchmark
    results = runner(
        num_samples=opts.samples,
        num_repeats=opts.repeats,
        output_file=opts.output
    )

    # Optional post-run reports
    if opts.compare:
        print_comparison_table(results)
    if opts.recommend:
        print_recommendations(results)

    # Return results for programmatic use
    return results
957
+
958
+
959
# Script entry point; the result object is bound at module level so it stays
# available for interactive inspection (e.g. `python -i <script>`).
if __name__ == "__main__":
    results = main()
A6/check_svm_model.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick inspection script: print the structure of the pickled A6 SVM model."""
import pickle
import os  # NOTE(review): unused in this script

# Check A6 SVM model (path is relative, so run this from the A6 directory)
a6_path = './models/champion_svm.pkl'
with open(a6_path, 'rb') as f:
    artifact = pickle.load(f)

print('A6 SVM Model Structure:')
print(f' Type: {type(artifact)}')
print(f' Class name: {type(artifact).__name__}')

# Pipeline artifact: list every step and any feature-name metadata it carries.
if hasattr(artifact, 'steps'):
    print(f' Steps: {[step[0] for step in artifact.steps]}')
    for step_name, step in artifact.steps:
        print(f' {step_name}: {type(step).__name__}')
        if hasattr(step, 'feature_names_in_'):
            print(f' feature_names_in_: {step.feature_names_in_}')
        if hasattr(step, 'get_feature_names_out'):
            try:
                fnames = step.get_feature_names_out()
                print(f' get_feature_names_out(): {fnames}')
            except Exception as e:
                print(f' get_feature_names_out() error: {e}')

# Dict artifact: show the keys and any stored feature-column list.
if isinstance(artifact, dict):
    print(f' Keys: {artifact.keys()}')
    if 'feature_columns' in artifact:
        print(f' feature_columns: {artifact["feature_columns"]}')
A6/test_classification_loading.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to load and execute all classification models with one sample.
4
+ Tests models from A4, A5, A5b, and A6.
5
+
6
+ Data loading adapted from classification_baseline.py to use the same
7
+ data processing pipeline for consistent feature extraction.
8
+
9
+ NOTE: A4 Random Forest model was trained WITH the 5 duplicate NASM columns
10
+ (No_1_NASM_Deviation through No_5_NASM_Deviation), while other models (A5, A5b, A6)
11
+ were trained WITHOUT them. This script loads data WITH the duplicate columns
12
+ to support the A4 model, and filters them out for other models as needed.
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import pickle
18
+ import warnings
19
+ import numpy as np
20
+ import pandas as pd
21
+ from pathlib import Path
22
+ from sklearn.preprocessing import StandardScaler
23
+ from sklearn.model_selection import train_test_split
24
+
25
+ # Suppress warnings for cleaner output
26
+ warnings.filterwarnings('ignore')
27
+
28
+ # Add project root to path
29
+ project_root = os.path.abspath(os.path.dirname(__file__))
30
+ sys.path.insert(0, project_root)
31
+
32
+ # Import model paths from all_classification.py
33
+ sys.path.insert(0, project_root)
34
+ from all_classification import (
35
+ a4_rf,
36
+ a5_ensemnble,
37
+ a5b_adaboost,
38
+ a5b_bagging_tree,
39
+ a6_svm
40
+ )
41
+
42
+ # Import custom classes from A5b classification_adaboost.py
43
+ # These are needed for unpickling the AdaBoost model
44
+ #sys.path.insert(0, os.path.join(project_root, '..', 'A5b'))
45
+ from adaboost_classes import (
46
+ AdaBoostEnsemble,
47
+ WeightedDecisionTree
48
+ )
49
+
50
# Data paths: the shared datasets live one directory above this script.
REPO_ROOT = os.path.abspath(os.path.join(project_root, '..'))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')

# Weaklink categories (14 classes). Used both to derive the WeakestLink
# target (argmax over these score columns) and as the label vocabulary.
WEAKLINK_CATEGORIES = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation'
]

# Duplicate NASM columns to remove (as in classification_baseline.py)
# NOTE: A4 Random Forest model was trained WITH these 5 duplicate columns,
# so they must be kept in the data for A4 to work correctly; the other
# models drop them via their saved feature_columns or pipeline width.
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]

# Columns to exclude when extracting features (identifier and targets).
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
76
+
77
# Expected classification classes (14 weaklink categories).
# This was a byte-for-byte duplicate of WEAKLINK_CATEGORIES; alias the
# single source of truth instead of repeating the literal so the two
# lists cannot silently drift apart. `list(...)` keeps it an independent
# list object, preserving the original's behavior under mutation.
EXPECTED_CLASSES = list(WEAKLINK_CATEGORIES)
85
+
86
+
87
def load_and_prepare_data():
    """Build the train/test matrices used to exercise every model.

    Mirrors the data pipeline of classification_baseline.py: load the raw
    CSVs, derive the WeakestLink target as the argmax over the weaklink
    score columns, merge on ID, split 80/20 with a fixed seed, and fit a
    StandardScaler on the training portion only.

    NOTE: the 5 duplicate NASM columns are intentionally KEPT because the
    A4 Random Forest model was trained with them; the other models
    (A5, A5b, A6) filter them out via their stored feature_columns.

    Returns a dict with feature_columns, the fitted scaler, and the raw
    and scaled train/test splits plus the merged DataFrame.
    """
    features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
    scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

    print('Movement features shape:', features_df.shape)
    print('Weak link scores shape:', scores_df.shape)
    print('NOTE: Keeping duplicate NASM columns for A4 Random Forest model compatibility')

    # Target = category with the highest weak-link score in each row.
    scores_df['WeakestLink'] = scores_df[WEAKLINK_CATEGORIES].idxmax(axis=1)
    print('Weakest Link class distribution:')
    print(scores_df['WeakestLink'].value_counts())

    merged_df = features_df.merge(
        scores_df[['ID', 'WeakestLink']].copy(), on='ID', how='inner'
    )
    print('Merged dataset shape:', merged_df.shape)

    # Keep every column except identifier/target ones, duplicates included.
    feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
    X = merged_df[feature_columns].values
    y = merged_df['WeakestLink'].values

    print(f'Feature matrix shape : {X.shape}')
    print(f'Number of features : {len(feature_columns)}')
    print(f'Number of classes : {len(np.unique(y))}')

    # Same split parameters as the baseline so results are comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on training data only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return {
        'feature_columns': feature_columns,
        'scaler': scaler,
        'X_train': X_train,
        'X_train_scaled': X_train_scaled,
        'y_train': y_train,
        'X_test': X_test,
        'X_test_scaled': X_test_scaled,
        'y_test': y_test,
        'merged_df': merged_df,
    }
150
+
151
+
152
def load_model(model_path, model_name):
    """Load a pickled model artifact and extract model/scaler/feature metadata.

    Handles two artifact layouts: dicts with 'model'/'scaler'/'feature_columns'
    keys (A4, A5, A5b) and bare sklearn Pipeline objects (A6 SVM), from which
    the scaler and feature names are recovered by inspecting the steps.

    Parameters
    ----------
    model_path : str
        Pickle path relative to this script's directory (project_root).
    model_name : str
        Human-readable name used only for log messages.

    Returns
    -------
    tuple
        (model, scaler, feature_columns, artifact); all four are None when
        the file is missing or cannot be loaded.
    """
    import re

    full_path = os.path.join(project_root, model_path)

    if not os.path.exists(full_path):
        print(f"  ⚠️ Model file not found: {full_path}")
        return None, None, None, None

    try:
        with open(full_path, 'rb') as f:
            artifact = pickle.load(f)

        # Extract model and scaler based on artifact structure
        if isinstance(artifact, dict):
            model = artifact.get('model')
            scaler = artifact.get('scaler')
            feature_columns = artifact.get('feature_columns')
        else:
            # A6 SVM is a Pipeline object
            model = artifact
            # BUGFIX: initialize scaler up front so a bare (non-pipeline)
            # estimator no longer raises NameError at the return below,
            # which the outer handler mis-reported as a load error.
            scaler = None
            if hasattr(model, 'steps') and len(model.steps) >= 1:
                # A scaler step is a fitted transformer (n_features_in_)
                # that is not the final predictor.
                for step_name, step_obj in model.steps:
                    if hasattr(step_obj, 'transform'):
                        if hasattr(step_obj, 'n_features_in_') and not hasattr(step_obj, 'predict'):
                            scaler = step_obj
                            break
                # If no scaler found, fall back to the first step.
                if scaler is None and len(model.steps) > 0:
                    first_step = model.steps[0][1]
                    if hasattr(first_step, 'transform') and hasattr(first_step, 'n_features_in_'):
                        scaler = first_step
            # For pipelines, try to recover real column names from step 0.
            feature_columns = None
            if hasattr(model, 'steps') and len(model.steps) > 0:
                first_step = model.steps[0][1]
                if hasattr(first_step, 'get_feature_names_out'):
                    try:
                        names = first_step.get_feature_names_out()
                        # Only use feature names if they are real column names,
                        # not generic placeholder names like x0, x1, ...
                        if not all(re.fullmatch(r'x\d+', n) for n in names):
                            feature_columns = names
                        # else: leave feature_columns = None; handled by caller
                    except Exception:
                        # BUGFIX: narrowed the former bare `except:` so that
                        # KeyboardInterrupt/SystemExit are no longer swallowed.
                        pass

        print(f"  βœ“ Loaded {model_name}")
        return model, scaler, feature_columns, artifact
    except Exception as e:
        print(f"  βœ— Error loading {model_name}: {e}")
        return None, None, None, None
210
+
211
+
212
def predict_with_model(model, scaler, sample_features, model_name):
    """Run `model` on a copy of `sample_features`, optionally scaling first.

    Returns a (prediction, probabilities, error) triple: `probabilities`
    is None when the model lacks predict_proba; on any failure the first
    two entries are None and `error` holds the exception message. The
    `model_name` parameter is accepted for interface symmetry with the
    other helpers and is not used here.
    """
    try:
        inputs = sample_features.copy()

        # Standalone scaler (dict-style artifacts); pipelines scale internally.
        inputs = scaler.transform(inputs) if scaler is not None else inputs

        labels = model.predict(inputs)
        probabilities = (
            model.predict_proba(inputs)
            if hasattr(model, 'predict_proba')
            else None
        )
        return labels, probabilities, None
    except Exception as exc:
        return None, None, str(exc)
232
+
233
+
234
def create_sample_from_training_data(training_data, feature_columns, scaler):
    """Take the first training row as a single-sample DataFrame.

    Returns (raw_df, scaled): `scaled` is the scaler-transformed version
    when a scaler is supplied, otherwise the raw DataFrame itself.
    """
    first_row = training_data['X_train'][:1].copy()
    frame = pd.DataFrame(first_row, columns=feature_columns)

    if scaler is None:
        return frame, frame
    return frame, scaler.transform(frame)
245
+
246
+
247
def filter_features_for_model(sample_df, model_feature_columns):
    """Select only the columns the model was trained on, in model order.

    Falls back to every available column (with a warning) when none of
    the requested feature names are present in `sample_df`.
    """
    wanted = [name for name in model_feature_columns if name in sample_df.columns]

    if not wanted:
        print(f"  ⚠️ No matching features found, using all available")
        wanted = sample_df.columns.tolist()

    return sample_df[wanted]
256
+
257
+
258
def main():
    """Load every classification model (A4, A5, A5b, A6) and run each on
    one training sample, printing per-model predictions and a summary.

    Returns nothing; all output goes to stdout and a per-model status is
    collected in `results` for the final summary table.
    """
    print("=" * 60)
    print("Testing All Classification Models with One Sample")
    print("=" * 60)
    print()

    # Load and prepare data using the same pipeline as classification_baseline.py
    # NOTE: Data is loaded WITH the 5 duplicate NASM columns for A4 compatibility
    print("Loading data...")
    data = load_and_prepare_data()
    print()

    # Create sample from training data.
    # NOTE(review): sample_features_scaled is never used below — scaling is
    # applied per-model inside predict_with_model instead; confirm and drop?
    sample_features, sample_features_scaled = create_sample_from_training_data(
        data, data['feature_columns'], data['scaler']
    )
    print(f"Sample data shape: {sample_features.shape}")
    print(f"Number of features (including duplicates): {len(data['feature_columns'])}")
    print()

    # Define models to test: (display name, pickle path) pairs.
    models_to_test = [
        ('A4 Random Forest', a4_rf),
        ('A5 Ensemble', a5_ensemnble),
        ('A5b Adaboost', a5b_adaboost),
        ('A5b Bagging Trees', a5b_bagging_tree),
        ('A6 SVM', a6_svm),
    ]

    # Collected as (name, status, prediction, probabilities, error) tuples.
    results = []

    for model_name, model_path in models_to_test:
        print(f"Testing {model_name}...")

        # Load model
        model, scaler, model_feature_columns, artifact = load_model(model_path, model_name)

        if model is None:
            print(f"  Skipping {model_name} due to load error")
            results.append((model_name, 'LOAD_ERROR', None, None, None))
            print()
            continue

        # Determine feature columns to use
        if model_feature_columns is not None:
            # Filter sample data to only include features the model expects
            test_features = filter_features_for_model(sample_features, model_feature_columns)
            print(f"  Model expects {len(model_feature_columns)} features, using {len(test_features.columns)} available")
        elif hasattr(model, 'steps'):
            # Pipeline with generic/unknown feature names (e.g. A6 SVM trained without
            # the 5 duplicate NASM columns). Drop those duplicate columns so the number
            # of features matches what the pipeline's scaler expects.
            first_step = model.steps[0][1]
            n_expected = getattr(first_step, 'n_features_in_', None)
            cols_without_dupes = [c for c in sample_features.columns if c not in DUPLICATE_NASM_COLS]
            if n_expected is not None and len(cols_without_dupes) == n_expected:
                test_features = sample_features[cols_without_dupes]
                print(f"  Pipeline expects {n_expected} features β€” dropped duplicate NASM cols, using {len(test_features.columns)} features")
            else:
                # Fallback: just take the first n_expected columns
                test_features = sample_features.iloc[:, :n_expected] if n_expected else sample_features
                print(f"  Pipeline expects {n_expected} features, sliced sample to {len(test_features.columns)} features")
        else:
            test_features = sample_features
            print(f"  Using all {len(sample_features.columns)} available features")

        # Make prediction
        # For A6 SVM pipeline, don't pass the scaler separately since it's already in the pipeline
        # For other models, pass the scaler if available
        if model_feature_columns is None and hasattr(model, 'steps'):
            # This is likely the A6 SVM pipeline - don't apply scaler separately
            scaler_to_use = None
        else:
            scaler_to_use = scaler

        prediction, prediction_proba, error = predict_with_model(
            model, scaler_to_use, test_features, model_name
        )

        if error:
            print(f"  βœ— Prediction error: {error}")
            results.append((model_name, 'PREDICTION_ERROR', None, None, error))
            print()
            continue

        # Display results
        print(f"  βœ“ Prediction: {prediction[0]}")

        if prediction_proba is not None:
            print(f"  βœ“ Prediction probabilities shape: {prediction_proba.shape}")
            # Top-3 classes by descending probability for the single sample.
            # NOTE(review): indexes EXPECTED_CLASSES by position — assumes the
            # model's class order matches that list; model.classes_ would be safer.
            top_classes_idx = np.argsort(prediction_proba[0])[-3:][::-1]
            top_classes = [EXPECTED_CLASSES[i] for i in top_classes_idx]
            top_probs = [prediction_proba[0][i] for i in top_classes_idx]
            print(f"  βœ“ Top 3 classes: {list(zip(top_classes, [f'{p:.3f}' for p in top_probs]))}")

        print(f"  βœ“ Model type: {type(model).__name__}")

        # Check if model has classes attribute
        if hasattr(model, 'classes_'):
            print(f"  βœ“ Model classes: {list(model.classes_)}")

        results.append((model_name, 'SUCCESS', prediction, prediction_proba, None))
        print()

    # Summary
    print("=" * 60)
    print("Summary")
    print("=" * 60)

    for model_name, status, prediction, proba, error in results:
        if status == 'SUCCESS':
            pred_str = prediction[0] if prediction is not None else 'N/A'
            print(f"  {model_name}: βœ“ SUCCESS - Prediction: {pred_str}")
        else:
            print(f"  {model_name}: βœ— {status} - {error}")

    print()
    print("All models tested!")
378
+
379
# Script entry point.
if __name__ == "__main__":
    main()
A6/time_specification.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hardware Specifications
2
+
3
+ | Component | Specification |
4
+ |-----------|---------------|
5
+ | **CPU** | AMD Ryzen 5 5600U with Radeon Graphics |
6
+ | **CPU Cores/Threads** | 6 cores, 12 threads (2 threads per core) |
7
+ | **CPU Frequency** | 400 MHz - 4289 MHz (max boost) |
8
+ | **Architecture** | x86_64 |
9
+ | **RAM** | 30 GiB (15 GiB available currently) |
10
+ | **Swap** | 31 GiB |
11
+ | **Integrated GPU** | AMD Radeon Vega Mobile Series (Cezanne) |
12
+ | **Storage** | 469 GB NVMe SSD |
13
+ | **Operating System** | Linux (Ubuntu-based, kernel 6.8.0-101-lowlatency) |
14
+
15
+ ---
16
+
17
+ ## Software Environment
18
+
19
+ | Component | Version/Details |
20
+ |-----------|-----------------|
21
+ | **Python** | 3.12.3 |
22
+ | **Key Packages** | numpy 2.4.2, scikit-learn 1.8.0, pandas 2.2.3 |