Bachstelze committed on
Commit
f5e4068
·
1 Parent(s): 54eac1a

test baseline with cv only

Browse files
A5b/classification_adaboost.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+ from typing import List, Tuple, Dict, Any
11
+
12
+ from sklearn.model_selection import (
13
+ train_test_split, StratifiedKFold, cross_validate
14
+ )
15
+ from sklearn.base import BaseEstimator, ClassifierMixin
16
+ from sklearn.preprocessing import StandardScaler
17
+ from sklearn.metrics import (
18
+ accuracy_score, precision_score, recall_score, f1_score,
19
+ classification_report, confusion_matrix
20
+ )
21
+ from sklearn.tree import DecisionTreeClassifier
22
+ from sklearn.ensemble import (
23
+ RandomForestClassifier,
24
+ VotingClassifier,
25
+ BaggingClassifier,
26
+ StackingClassifier,
27
+ )
28
+ import xgboost as xgb
29
+ import lightgbm as lgb
30
+
31
+ warnings.filterwarnings('ignore')
32
+ np.random.seed(42)
33
+
34
+ REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
35
+ DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
36
+ OUT_DIR = Path('models')
37
+ OUT_DIR.mkdir(exist_ok=True)
38
+
39
+ RANDOM_STATE = 42
40
+ N_SPLITS = 5
41
+ CHAMPION_F1 = 0.6110 # Score from A4
42
+
43
+
44
class WeightedDecisionTree(DecisionTreeClassifier):
    """
    Thin subclass of ``DecisionTreeClassifier`` that pins down the small set
    of hyper-parameters used by the boosting loop and forwards the per-sample
    ``sample_weight`` vector to the parent ``fit``, so each tree is grown on
    the weighted training error.
    """

    def __init__(self, max_depth: int = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, random_state: int = 42):
        # Hand every hyper-parameter straight to the sklearn base class;
        # it stores them as attributes, which keeps get_params()/clone()
        # (and therefore cross_validate) working on this subclass.
        super().__init__(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )

    def fit(self, X, y, sample_weight=None):
        """Grow the tree on (X, y), weighting each sample by ``sample_weight``."""
        return super().fit(X, y, sample_weight=sample_weight)
61
+
62
+
63
class AdaBoostEnsemble(BaseEstimator, ClassifierMixin):
    """
    From-scratch AdaBoost ensemble of decision trees where each tree is grown
    based on weighted training errors. Weights are updated based on the error
    of previous trees.

    The algorithm:
    1. Initialize equal weights for all training samples
    2. For each tree in the ensemble:
       - Train a decision tree on weighted data
       - Calculate weighted error rate
       - Compute tree weight (alpha)
       - Update sample weights (increase for misclassified samples)
       - Normalize weights
    3. Make predictions using weighted voting

    Parameters
    ----------
    n_estimators : number of boosting rounds (trees).
    max_depth, min_samples_split, min_samples_leaf : per-tree hyper-parameters.
    random_state : base seed; round ``i`` uses ``random_state + i``.
    """

    def __init__(
        self,
        n_estimators: int = 50,
        max_depth: int = 5,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        random_state: int = 42
    ):
        # sklearn convention: __init__ only stores hyper-parameters so that
        # clone()/get_params() work; fitted state is (re)created in fit().
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.trees: List[WeightedDecisionTree] = []
        self.tree_weights: List[float] = []
        self.n_classes: int = 0
        self.classes_: np.ndarray = None

    def _initialize_weights(self, n_samples: int) -> np.ndarray:
        """Uniform initial distribution: every sample weighs 1/n_samples."""
        return np.ones(n_samples) / n_samples

    def _update_weights(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray,
        alpha: float
    ) -> np.ndarray:
        """
        Update sample weights based on prediction errors.

        Misclassified samples are multiplied by exp(alpha); correctly
        classified samples are left unchanged (exp(0)), and the whole
        vector is renormalised to sum to 1, so the relative emphasis on
        mistakes grows each round.
        """
        misclassified = y_true != y_pred
        updated_weights = weights * np.exp(alpha * misclassified.astype(float))

        # Normalize weights so they remain a probability distribution.
        return updated_weights / updated_weights.sum()

    def _compute_weighted_error(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray
    ) -> float:
        """Weighted misclassification rate of one tree under `weights`."""
        misclassified = (y_true != y_pred).astype(float)
        return np.sum(weights * misclassified) / np.sum(weights)

    def _compute_alpha(self, error: float) -> float:
        """
        Compute the voting weight (alpha) of one tree from its weighted error.

        Clamped at +/-10 to avoid log(0) / division by zero for perfect or
        completely wrong classifiers.
        """
        if error <= 0:
            return 10.0  # Very high weight for perfect classifier
        if error >= 1:
            return -10.0  # Very negative weight for completely wrong classifier
        return 0.5 * np.log((1 - error) / error)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'AdaBoostEnsemble':
        """Fit the AdaBoost ensemble to (X, y); returns self."""
        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)

        # BUG FIX: reset fitted state here. Previously the lists were only
        # created in __init__, so calling fit() a second time (as the driver
        # script does) appended another n_estimators trees to the already
        # fitted ensemble instead of retraining from scratch.
        self.trees = []
        self.tree_weights = []

        # Initialize sample weights
        weights = self._initialize_weights(n_samples)

        for i in range(self.n_estimators):
            # Create and train decision tree with current weights;
            # the seed varies per round so trees can differ on tied splits.
            tree = WeightedDecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state + i
            )
            tree.fit(X, y, sample_weight=weights)

            # Make predictions on the training data to score this round.
            y_pred = tree.predict(X)

            # Calculate weighted error
            error = self._compute_weighted_error(weights, y, y_pred)

            # Compute tree weight (alpha)
            alpha = self._compute_alpha(error)

            # Update sample weights
            weights = self._update_weights(weights, y, y_pred, alpha)

            # Store tree and its weight
            self.trees.append(tree)
            self.tree_weights.append(alpha)

            print(f"Tree {i+1}/{self.n_estimators}: Error={error:.4f}, Alpha={alpha:.4f}")

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels using signed-alpha weighted voting over all trees."""
        # Get predictions from all trees at once: shape (n_trees, n_samples).
        all_predictions = np.array([tree.predict(X) for tree in self.trees])

        classes = self.classes_
        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        # Each tree adds its (possibly negative) alpha to the class it voted for.
        for tree_idx, tree in enumerate(self.trees):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]

            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += alpha * (predictions == class_label)

        # Return class with highest weighted vote
        return classes[np.argmax(weighted_votes, axis=1)]

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class "probabilities" as normalised |alpha|-weighted vote
        shares. NOTE: uses |alpha| (unlike predict(), which uses signed
        alpha), so each row sums to 1 but the argmax can differ from
        predict() when some alpha is negative.
        """
        all_predictions = np.array([tree.predict(X) for tree in self.trees])

        classes = self.classes_
        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        total_weight = sum(abs(w) for w in self.tree_weights)

        for tree_idx, tree in enumerate(self.trees):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]

            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += abs(alpha) * (predictions == class_label)

        # Normalize to get probabilities
        return weighted_votes / total_weight
226
+
227
+
228
def evaluate_cv(model, X, y, cv, name='Model'):
    """
    Run k-fold cross-validation on `model` and summarise weighted
    classification metrics into a flat result dict. The raw per-fold F1
    scores are kept under the private key '_f1_scores' for the later
    significance tests.
    """
    scoring = {
        'accuracy' : 'accuracy',
        'f1' : 'f1_weighted',
        'precision': 'precision_weighted',
        'recall' : 'recall_weighted',
    }
    fold_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

    accuracy_scores = fold_results['test_accuracy']
    f1_scores = fold_results['test_f1']

    summary = {
        'Model' : name,
        'Accuracy_mean' : accuracy_scores.mean(),
        'Accuracy_std' : accuracy_scores.std(),
        'F1_mean' : f1_scores.mean(),
        'F1_std' : f1_scores.std(),
        'Precision_mean': fold_results['test_precision'].mean(),
        'Recall_mean' : fold_results['test_recall'].mean(),
        '_f1_scores' : f1_scores,
    }
    return summary
247
+
248
+
249
# Load data
# Movement features and weak-link scores are joined later on the shared 'ID'.
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)

# Columns dropped from the feature table.
# NOTE(review): presumably these duplicate/encode the NASM deviation target
# and would leak label information — confirm against the dataset docs.
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]

movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
print('Shape after duplicate removal:', movement_features_df.shape)

# The 14 candidate weak-link score columns; the classification target is the
# column with the highest score per row.
weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]

# Target: name of the highest-scoring weak-link category per row
# (idxmax breaks ties by taking the first column).
weaklink_scores_df['WeakestLink'] = (
    weaklink_scores_df[weaklink_categories].idxmax(axis=1)
)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())

# Merge Datasets (inner join: rows present in both tables only).
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)

# Everything except the ID, the target and the score estimate is a feature.
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]

X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values

print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')

# Split data — stratified 80/20 so class proportions are preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Scaler is fit on the training split only; the test split is transformed
# with the training statistics (no test leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training samples : {X_train.shape[0]}')
print(f'Test samples : {X_test.shape[0]}')

# Shared CV splitter so every model is evaluated on identical folds.
cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)
311
+
312
# Train AdaBoost ensemble
print("\n" + "="*60)
print("TRAINING ADABOOST ENSEMBLE")
print("="*60)

adaboost_model = AdaBoostEnsemble(
    n_estimators=50,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_STATE
)

# Fit once on the training split. cross_validate() below works on clones of
# the estimator, so this fitted model remains valid for the test evaluation.
adaboost_model.fit(X_train_scaled, y_train)

# Cross-validation
adaboost_cv = evaluate_cv(
    adaboost_model, X_train_scaled, y_train, cv_strategy,
    name='AdaBoost Ensemble'
)

# Test set evaluation
# BUG FIX: the original called adaboost_model.fit() a second time here.
# Because AdaBoostEnsemble.fit() appended to self.trees instead of resetting,
# that second call silently doubled the ensemble to 100 trees before the
# test-set predictions were made. A single fit is both correct and cheaper.
y_pred_adaboost = adaboost_model.predict(X_test_scaled)

# Weighted averages account for the class imbalance seen in value_counts().
test_f1_adaboost = f1_score(y_test, y_pred_adaboost, average='weighted')
test_acc_adaboost = accuracy_score(y_test, y_pred_adaboost)
test_prec_adaboost = precision_score(y_test, y_pred_adaboost, average='weighted', zero_division=0)
test_rec_adaboost = recall_score(y_test, y_pred_adaboost, average='weighted', zero_division=0)

print("\n" + "="*60)
print("ADABOOST RESULTS")
print("="*60)
print(f'CV F1: {adaboost_cv["F1_mean"]:.4f} +/- {adaboost_cv["F1_std"]:.4f}')
print(f'Test F1: {test_f1_adaboost:.4f}')
print(f'Test Accuracy: {test_acc_adaboost:.4f}')
print(f'Test Precision: {test_prec_adaboost:.4f}')
print(f'Test Recall: {test_rec_adaboost:.4f}')
350
+
351
# Compare with baseline models
# Same hyper-parameters as the A4 champion Random Forest (see CHAMPION_F1),
# so the comparison is against the prior best configuration.
rf_champion = RandomForestClassifier(
    n_estimators=200, max_depth=15,
    min_samples_split=5, min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_STATE, n_jobs=-1
)

# CV on the same folds as the AdaBoost model for a fair comparison.
rf_cv = evaluate_cv(
    rf_champion, X_train_scaled, y_train, cv_strategy,
    name='Random Forest (Baseline)'
)

# Refit on the full training split, then score the held-out test split.
rf_champion.fit(X_train_scaled, y_train)
y_pred_rf = rf_champion.predict(X_test_scaled)
test_f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print("\n" + "="*60)
print("COMPARISON WITH BASELINE")
print("="*60)
print(f'Random Forest CV F1: {rf_cv["F1_mean"]:.4f} +/- {rf_cv["F1_std"]:.4f}')
print(f'Random Forest Test F1: {test_f1_rf:.4f}')
373
+
374
# Statistical significance test
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """
    Corrected resampled t-test (Nadeau & Bengio, 2003) for comparing two
    paired arrays of k cross-validation scores.

    The naive paired t-test is overconfident because CV folds share training
    data; the (1/k + n_test/n_train) factor corrects the variance estimate.

    Parameters
    ----------
    scores_a, scores_b : array-like of k paired fold scores.
    n_train, n_test    : fold training/test sizes used in the correction.

    Returns
    -------
    (t_stat, p_value) as plain floats (two-sided p, Student-t with k-1 df).
    """
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)
    k = len(scores_a)
    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    # BUG FIX: when the fold scores are identical (or differ by a constant),
    # s_sq == 0 and the original divided by zero, returning NaN for both
    # statistics. Handle the degenerate cases explicitly instead.
    if s_sq == 0:
        if d_bar == 0:
            return 0.0, 1.0          # no difference at all
        return float(np.sign(d_bar) * np.inf), 0.0  # constant, nonzero shift
    var_corr = (1 / k + n_test / n_train) * s_sq
    t_stat = d_bar / np.sqrt(var_corr)
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k - 1))
    return float(t_stat), float(p_value)
384
+
385
# Approximate per-fold train/test sizes for the corrected t-test.
# NOTE(review): integer division is an approximation when n_total is not a
# multiple of N_SPLITS — acceptable for the variance correction factor.
n_total = len(X_train_scaled)
n_test_fold = n_total // N_SPLITS
n_train_fold = n_total - n_test_fold

# Per-fold F1 scores collected by evaluate_cv (paired across the same folds).
result_map = {
    'AdaBoost Ensemble': adaboost_cv['_f1_scores'],
    'Random Forest': rf_cv['_f1_scores']
}

adaboost_scores = result_map['AdaBoost Ensemble']
rf_scores = result_map['Random Forest']

t, p = corrected_resampled_ttest(adaboost_scores, rf_scores, n_train_fold, n_test_fold)
print(f"\nStatistical Test (AdaBoost vs Random Forest):")
print(f" t-statistic: {t:+.3f}")
print(f" p-value: {p:.4f}")
print(f" Significant at α=0.05: {'Yes' if p < 0.05 else 'No'}")

# Save model
# Bundle everything needed to reuse the model offline: the fitted estimator,
# the fitted scaler, the exact feature column order and the headline metrics.
artifact = {
    'model' : adaboost_model,
    'model_name' : 'AdaBoost Ensemble',
    'scaler' : scaler,
    'feature_columns' : feature_columns,
    'cv_metrics': {
        'f1_mean' : float(adaboost_cv['F1_mean']),
        'f1_std' : float(adaboost_cv['F1_std']),
        'accuracy_mean': float(adaboost_cv['Accuracy_mean']),
    },
    'test_metrics': {
        'f1' : float(test_f1_adaboost),
        'accuracy' : float(test_acc_adaboost),
        'precision': float(test_prec_adaboost),
        'recall' : float(test_rec_adaboost),
    },
    'a4_champion_f1' : CHAMPION_F1,
    # Relative improvement (%) over the A4 champion's F1.
    'improvement_pct': float((test_f1_adaboost - CHAMPION_F1) / CHAMPION_F1 * 100),
}

out_path = OUT_DIR / 'adaboost_classification.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(artifact, f)

print(f'\nSaved model to: {out_path}')

# Classification report (per-class precision/recall/F1 on the test split).
print('\nCLASSIFICATION REPORT: AdaBoost Ensemble')
print(classification_report(y_test, y_pred_adaboost, zero_division=0))

# Feature importance analysis (simplified)
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Calculate feature importance as the unweighted average of the per-tree
# sklearn impurity importances.
# NOTE(review): this ignores each tree's alpha; an alpha-weighted average
# would reflect the ensemble's actual voting — confirm which is intended.
all_importances = np.zeros(len(feature_columns))
for tree in adaboost_model.trees:
    all_importances += tree.feature_importances_

avg_importances = all_importances / len(adaboost_model.trees)
importance_df = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': avg_importances
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10).to_string(index=False))

# Plot feature importance (top 15, highest at the top after axis inversion).
plt.figure(figsize=(12, 8))
top_features = importance_df.head(15)
plt.barh(range(len(top_features)), top_features['Importance'].values)
plt.yticks(range(len(top_features)), top_features['Feature'].values)
plt.xlabel('Average Feature Importance')
plt.ylabel('Features')
plt.title('Top 15 Feature Importance - AdaBoost Ensemble')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(OUT_DIR / 'adaboost_feature_importance.png', dpi=150)
plt.close()

print(f"\nSaved feature importance plot to: {OUT_DIR / 'adaboost_feature_importance.png'}")
A5b/classification_bagging_trees.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+
11
+ from sklearn.model_selection import StratifiedKFold, cross_validate
12
+ from sklearn.preprocessing import StandardScaler
13
+ from sklearn.metrics import (
14
+ accuracy_score, precision_score, recall_score, f1_score,
15
+ classification_report, confusion_matrix
16
+ )
17
+ from sklearn.tree import DecisionTreeClassifier
18
+ from sklearn.ensemble import BaggingClassifier
19
+ import xgboost as xgb
20
+ import lightgbm as lgb
21
+
22
# Global setup: silence library warnings and fix the NumPy seed for
# reproducibility (model seeds are set separately via RANDOM_STATE).
warnings.filterwarnings('ignore')
np.random.seed(42)

# Paths: data lives one directory above the script; models/ holds artifacts.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
OUT_DIR = Path('models')
OUT_DIR.mkdir(exist_ok=True)

RANDOM_STATE = 42
N_SPLITS = 5
CHAMPION_F1 = 0.6110 # Score from A4

# Same loading/preprocessing pipeline as classification_adaboost.py.
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)

# Dropped feature columns (presumably target-leaking NASM deviation columns
# — TODO confirm against the dataset docs).
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]

movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
print('Shape after duplicate removal:', movement_features_df.shape)

# Candidate weak-link categories; the target is the per-row argmax column.
weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]

weaklink_scores_df['WeakestLink'] = (
    weaklink_scores_df[weaklink_categories].idxmax(axis=1)
)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())

# Merge Datasets
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)

EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]

X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values

print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')

# Encode string labels to integers for XGBoost/LightGBM compatibility
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# NOTE(review): the scaler is fit on the FULL dataset before cross-validation
# (unlike the adaboost script, which scales inside a train/test split), so
# each CV fold sees scaling statistics from its own test samples — a mild
# leak. Tree models are scale-invariant, so the impact here is minimal.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Shared CV splitter so all models below are scored on identical folds.
cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)
91
+
92
def evaluate_cv(model, X, y, cv, name='Model', use_encoded_labels=False):
    """Cross-validate `model` and summarise weighted classification metrics.

    NOTE(review): when use_encoded_labels=True the *y* argument is IGNORED
    and the module-level ``y_encoded`` is used instead. That only works
    because every caller passes the full (X_scaled, y) pair; verify before
    reusing this helper with a subset of the data.
    """
    scoring = {
        'accuracy' : 'accuracy',
        'f1' : 'f1_weighted',
        'precision': 'precision_weighted',
        'recall' : 'recall_weighted',
    }
    # XGBoost/LightGBM require integer class labels; sklearn models accept
    # the original strings.
    y_to_use = y_encoded if use_encoded_labels else y
    cv_res = cross_validate(model, X, y_to_use, cv=cv, scoring=scoring)
    return {
        'Model' : name,
        'Accuracy_mean' : cv_res['test_accuracy'].mean(),
        'Accuracy_std' : cv_res['test_accuracy'].std(),
        'F1_mean' : cv_res['test_f1'].mean(),
        'F1_std' : cv_res['test_f1'].std(),
        'Precision_mean': cv_res['test_precision'].mean(),
        'Recall_mean' : cv_res['test_recall'].mean(),
        # Raw per-fold F1 scores, kept for the paired significance tests.
        '_f1_scores' : cv_res['test_f1'],
    }
111
+
112
# Baseline: Single Decision Tree — the reference point every ensemble below
# should beat. Tree hyper-parameters are held constant across all variants
# so only the ensembling strategy differs.
single_tree = DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_STATE
)
single_tree_cv = evaluate_cv(
    single_tree, X_scaled, y, cv_strategy,
    name='Single Decision Tree'
)
print('SINGLE DECISION TREE')
print(f'CV F1: {single_tree_cv["F1_mean"]:.4f} +/- {single_tree_cv["F1_std"]:.4f}')

# Bagging with Decision Trees (default: uses all features)
bagging_default = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0, # Bootstrap sample size (100% of training data)
    max_features=1.0, # Use all features
    bootstrap=True,
    bootstrap_features=False, # Don't subsample features
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_default_cv = evaluate_cv(
    bagging_default, X_scaled, y, cv_strategy,
    name='Bagging (All Features)'
)
print(f'Bagging (All Features) CV F1: {bagging_default_cv["F1_mean"]:.4f} +/- {bagging_default_cv["F1_std"]:.4f}')

# Bagging with Decision Trees + Feature Subsetting (Random Subspace Method)
# This creates trees using random subsets of predictors
bagging_subspace = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0,
    max_features=0.7, # Use 70% of features for each tree
    bootstrap=True,
    bootstrap_features=True, # Subsample features for each tree
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_subspace_cv = evaluate_cv(
    bagging_subspace, X_scaled, y, cv_strategy,
    name='Bagging (70% Features)'
)
print(f'Bagging (70% Features) CV F1: {bagging_subspace_cv["F1_mean"]:.4f} +/- {bagging_subspace_cv["F1_std"]:.4f}')

# Bagging with smaller feature subset (50%)
bagging_50features = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0,
    max_features=0.5, # Use 50% of features for each tree
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_50features_cv = evaluate_cv(
    bagging_50features, X_scaled, y, cv_strategy,
    name='Bagging (50% Features)'
)
print(f'Bagging (50% Features) CV F1: {bagging_50features_cv["F1_mean"]:.4f} +/- {bagging_50features_cv["F1_std"]:.4f}')

# Bagging with even smaller feature subset (30%)
bagging_30features = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0,
    max_features=0.3, # Use 30% of features for each tree
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_30features_cv = evaluate_cv(
    bagging_30features, X_scaled, y, cv_strategy,
    name='Bagging (30% Features)'
)
print(f'Bagging (30% Features) CV F1: {bagging_30features_cv["F1_mean"]:.4f} +/- {bagging_30features_cv["F1_std"]:.4f}')

# Compare with Random Forest (for reference)
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt', # sqrt(n_features) - standard random forest
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1
)
rf_cv = evaluate_cv(
    rf_model, X_scaled, y, cv_strategy,
    name='Random Forest (sqrt features)'
)
print(f'Random Forest CV F1: {rf_cv["F1_mean"]:.4f} +/- {rf_cv["F1_std"]:.4f}')

# Compare with XGBoost and LightGBM (for reference)
# NOTE(review): `class_weight` is not an XGBClassifier constructor parameter
# (that is the sklearn/LightGBM spelling); it is likely ignored here — check
# and consider `sample_weight` or `scale_pos_weight` instead.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    class_weight='balanced',
    n_jobs=-1,
    verbosity=0
)
xgb_cv = evaluate_cv(
    xgb_model, X_scaled, y, cv_strategy,
    name='XGBoost',
    use_encoded_labels=True
)
print(f'XGBoost CV F1: {xgb_cv["F1_mean"]:.4f} +/- {xgb_cv["F1_std"]:.4f}')

lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    class_weight='balanced',
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbosity=-1
)
lgb_cv = evaluate_cv(
    lgb_model, X_scaled, y, cv_strategy,
    name='LightGBM',
    use_encoded_labels=True
)
print(f'LightGBM CV F1: {lgb_cv["F1_mean"]:.4f} +/- {lgb_cv["F1_std"]:.4f}')
273
+
274
# Collect all results (each entry is the summary dict from evaluate_cv).
all_results = [
    single_tree_cv,
    bagging_default_cv,
    bagging_subspace_cv,
    bagging_50features_cv,
    bagging_30features_cv,
    rf_cv,
    xgb_cv,
    lgb_cv,
]

# Leaderboard sorted by mean CV F1; the private '_f1_scores' arrays are
# stripped because they are per-fold vectors, not summary columns.
results_df = (
    pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
                  for r in all_results])
    .sort_values('F1_mean', ascending=False)
    .reset_index(drop=True)
)

print('\n5-FOLD CROSS-VALIDATION SUMMARY')
print(results_df[['Model', 'F1_mean', 'F1_std', 'Accuracy_mean',
                  'Precision_mean', 'Recall_mean']].to_string(index=False))
296
+
297
# Statistical Significance Test (t-test)
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """
    Corrected resampled t-test (Nadeau & Bengio, 2003) for two paired
    arrays of k cross-validation scores.

    The (1/k + n_test/n_train) factor corrects the variance for the
    overlap between CV training sets. Returns (t_stat, p_value) as floats
    (two-sided p, Student-t with k-1 degrees of freedom).
    """
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)
    k = len(scores_a)
    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    # BUG FIX (kept consistent with classification_adaboost.py): zero
    # variance of the differences previously caused a division by zero and
    # NaN statistics; handle the degenerate cases explicitly.
    if s_sq == 0:
        if d_bar == 0:
            return 0.0, 1.0          # identical score vectors
        return float(np.sign(d_bar) * np.inf), 0.0  # constant nonzero shift
    var_corr = (1 / k + n_test / n_train) * s_sq
    t_stat = d_bar / np.sqrt(var_corr)
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k - 1))
    return float(t_stat), float(p_value)
307
+
308
# Approximate per-fold train/test sizes for the variance correction
# (integer division is an approximation when n_total % N_SPLITS != 0).
n_total = len(X_scaled)
n_test_fold = n_total // N_SPLITS
n_train_fold = n_total - n_test_fold

# Map model name -> per-fold F1 vector; all models share the same folds,
# so the scores are properly paired for the t-test.
result_map = {r['Model']: r['_f1_scores'] for r in all_results}
best_model_name = results_df.iloc[0]['Model']
best_scores = result_map[best_model_name]

print('\nSTATISTICAL SIGNIFICANCE TESTS vs Best Model')
for r in all_results:
    if r['Model'] == best_model_name:
        continue
    t, p = corrected_resampled_ttest(
        r['_f1_scores'], best_scores, n_train_fold, n_test_fold
    )
    print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')

# Save the best model
# Name -> (unfitted) estimator object; keys must match the `name=` strings
# passed to evaluate_cv above.
model_objects = {
    'Single Decision Tree': single_tree,
    'Bagging (All Features)': bagging_default,
    'Bagging (70% Features)': bagging_subspace,
    'Bagging (50% Features)': bagging_50features,
    'Bagging (30% Features)': bagging_30features,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
}

# NOTE(review): best_name duplicates best_model_name computed above.
best_name = results_df.iloc[0]['Model']
best_model = model_objects[best_name]

print(f'\nBEST MODEL: {best_name}')
print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')

# Train final model on all data
# NOTE(review): the final fit always uses y_encoded (integer labels), even
# for sklearn models whose CV above used the string labels y — so the saved
# model predicts integers and consumers must invert with label_encoder.
best_model.fit(X_scaled, y_encoded)

# Save model artifact — includes scaler, label encoder and feature order so
# the model can be applied to new data offline.
artifact = {
    'model' : best_model,
    'model_name' : best_name,
    'scaler' : scaler,
    'label_encoder' : label_encoder,
    'feature_columns' : feature_columns,
    'cv_metrics': {
        'f1_mean' : float(results_df.iloc[0]['F1_mean']),
        'f1_std' : float(results_df.iloc[0]['F1_std']),
        'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
    },
    'a4_champion_f1' : CHAMPION_F1,
}

out_path = OUT_DIR / 'bagging_trees_champion.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(artifact, f)

print(f'\nSaved: {out_path}')

# Print feature importances for the best ensemble model
# (BaggingClassifier has no feature_importances_, hence the hasattr guard).
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    indices = np.argsort(importances)[::-1]

    print(f'\nTop 10 Most Important Features ({best_name}):')
    for i in range(min(10, len(feature_columns))):
        print(f' {i+1}. {feature_columns[indices[i]]}: {importances[indices[i]]:.4f}')
A5b/classification_baseline.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+
11
+ from sklearn.model_selection import (
12
+ train_test_split, StratifiedKFold, cross_validate
13
+ )
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.metrics import (
16
+ accuracy_score, precision_score, recall_score, f1_score,
17
+ classification_report, confusion_matrix
18
+ )
19
+ from sklearn.linear_model import LogisticRegression
20
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
21
+ from sklearn.neighbors import KNeighborsClassifier
22
+ from sklearn.naive_bayes import GaussianNB
23
+ from sklearn.ensemble import (
24
+ RandomForestClassifier,
25
+ VotingClassifier,
26
+ BaggingClassifier,
27
+ StackingClassifier,
28
+ )
29
+ import xgboost as xgb
30
+ import lightgbm as lgb
31
+ warnings.filterwarnings('ignore')
32
+ np.random.seed(42)
33
+
34
+ REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
35
+ DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
36
+ OUT_DIR = Path('models')
37
+ OUT_DIR.mkdir(exist_ok=True)
38
+
39
+ RANDOM_STATE = 42
40
+ N_SPLITS = 5
41
+ CHAMPION_F1 = 0.6110 # Score from A4
42
+
43
+ movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
44
+ weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
45
+
46
+ print('Movement features shape:', movement_features_df.shape)
47
+ print('Weak link scores shape:', weaklink_scores_df.shape)
48
+
49
+ DUPLICATE_NASM_COLS = [
50
+ 'No_1_NASM_Deviation',
51
+ 'No_2_NASM_Deviation',
52
+ 'No_3_NASM_Deviation',
53
+ 'No_4_NASM_Deviation',
54
+ 'No_5_NASM_Deviation',
55
+ ]
56
+
57
+ movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
58
+ print('Shape after duplicate removal:', movement_features_df.shape)
59
+
60
+ weaklink_categories = [
61
+ 'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
62
+ 'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
63
+ 'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
64
+ 'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
65
+ 'RightKneeMovesOutward', 'RightShoulderElevation',
66
+ ]
67
+
68
+ weaklink_scores_df['WeakestLink'] = (
69
+ weaklink_scores_df[weaklink_categories].idxmax(axis=1)
70
+ )
71
+ print('Weakest Link class distribution:')
72
+ print(weaklink_scores_df['WeakestLink'].value_counts())
73
+
74
+ # Merge Datasets
75
+ target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
76
+ merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
77
+ print('Merged dataset shape:', merged_df.shape)
78
+
79
+ EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
80
+ feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
81
+
82
+ X = merged_df[feature_columns].values
83
+ y = merged_df['WeakestLink'].values
84
+
85
+ print(f'Feature matrix shape : {X.shape}')
86
+ print(f'Number of features : {len(feature_columns)}')
87
+ print(f'Number of classes : {len(np.unique(y))}')
88
+
89
+ # is the training split needed for cross validation?
90
+ X_train, X_test, y_train, y_test = train_test_split(
91
+ X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
92
+ )
93
+
94
+ scaler = StandardScaler()
95
+ X_train_scaled = scaler.fit_transform(X_train)
96
+ X_test_scaled = scaler.transform(X_test)
97
+
98
+ print(f'Training samples : {X_train.shape[0]}')
99
+ print(f'Test samples : {X_test.shape[0]}')
100
+
101
+ cv_strategy = StratifiedKFold(
102
+ n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
103
+ )
104
+
105
+ def evaluate_cv(model, X, y, cv, name='Model'):
106
+ scoring = {
107
+ 'accuracy' : 'accuracy',
108
+ 'f1' : 'f1_weighted',
109
+ 'precision': 'precision_weighted',
110
+ 'recall' : 'recall_weighted',
111
+ }
112
+ cv_res = cross_validate(model, X, y, cv=cv, scoring=scoring)
113
+ return {
114
+ 'Model' : name,
115
+ 'Accuracy_mean' : cv_res['test_accuracy'].mean(),
116
+ 'Accuracy_std' : cv_res['test_accuracy'].std(),
117
+ 'F1_mean' : cv_res['test_f1'].mean(),
118
+ 'F1_std' : cv_res['test_f1'].std(),
119
+ 'Precision_mean': cv_res['test_precision'].mean(),
120
+ 'Recall_mean' : cv_res['test_recall'].mean(),
121
+ '_f1_scores' : cv_res['test_f1'],
122
+ }
123
+
124
+ rf_champion = RandomForestClassifier(
125
+ n_estimators=200, max_depth=15,
126
+ min_samples_split=5, min_samples_leaf=2,
127
+ class_weight='balanced',
128
+ random_state=RANDOM_STATE, n_jobs=-1
129
+ )
130
+ champ_cv = evaluate_cv(
131
+ rf_champion, X_train_scaled, y_train, cv_strategy,
132
+ name='A4 Champion – Random Forest'
133
+ )
134
+ rf_champion.fit(X_train_scaled, y_train)
135
+ champ_test_f1 = f1_score(y_test, rf_champion.predict(X_test_scaled), average='weighted')
136
+
137
+ print('A4 CHAMPION (Random Forest)')
138
+ print(f'CV F1: {champ_cv["F1_mean"]:.4f} +/- {champ_cv["F1_std"]:.4f}')
139
+ print(f'Test F1: {champ_test_f1:.4f}')
140
+
141
+ soft_voting = VotingClassifier(
142
+ estimators=[
143
+ ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',
144
+ random_state=RANDOM_STATE, n_jobs=-1)),
145
+ ('lr', LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),
146
+ ('xgb', xgb.XGBClassifier( n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,
147
+ colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),
148
+ ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,
149
+ random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),
150
+ ('knn', KNeighborsClassifier(n_neighbors=7)),
151
+ ('lda', LinearDiscriminantAnalysis()),
152
+ ],
153
+ voting='soft',
154
+ n_jobs=-1,
155
+ )
156
+
157
+ sv_cv = evaluate_cv(soft_voting, X_train_scaled, y_train, cv_strategy, name='Soft Voting')
158
+ print(f'Soft Voting CV F1: {sv_cv["F1_mean"]:.4f} +/- {sv_cv["F1_std"]:.4f}')
159
+
160
+ all_results = [champ_cv, sv_cv]
161
+ results_df = (
162
+ pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
163
+ for r in all_results])
164
+ .sort_values('F1_mean', ascending=False)
165
+ .reset_index(drop=True)
166
+ )
167
+
168
+ print('5-FOLD CROSS-VALIDATION SUMMARY')
169
+ print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
170
+ 'Precision_mean','Recall_mean']].to_string(index=False))
171
+
172
+ # Statistical Significance Test (t-test)
173
+ def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
174
+ k = len(scores_a)
175
+ diff = scores_a - scores_b
176
+ d_bar = diff.mean()
177
+ s_sq = diff.var(ddof=1)
178
+ var_corr = (1/k + n_test/n_train) * s_sq
179
+ t_stat = d_bar / np.sqrt(var_corr)
180
+ p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))
181
+ return float(t_stat), float(p_value)
182
+
183
+ n_total = len(X_train_scaled)
184
+ n_test_fold = n_total // N_SPLITS
185
+ n_train_fold = n_total - n_test_fold
186
+
187
+ result_map = {r['Model']: r['_f1_scores'] for r in all_results}
188
+ champ_scores = result_map['A4 Champion – Random Forest']
189
+
190
+ print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
191
+ for r in all_results:
192
+ if 'Champion' in r['Model']:
193
+ continue
194
+ t, p = corrected_resampled_ttest(
195
+ r['_f1_scores'], champ_scores, n_train_fold, n_test_fold
196
+ )
197
+ print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
198
+
199
+ # unecessary eval on the test set?
200
+ model_objects = {
201
+ 'Soft Voting' : soft_voting,
202
+ 'A4 Champion – Random Forest': rf_champion,
203
+ }
204
+
205
+ best_name = results_df.iloc[0]['Model']
206
+ best_model = model_objects[best_name]
207
+
208
+ print(f'CHAMPION ENSEMBLE: {best_name}')
209
+ print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
210
+
211
+ best_model.fit(X_train_scaled, y_train)
212
+ y_pred_best = best_model.predict(X_test_scaled)
213
+
214
+ test_f1 = f1_score(y_test, y_pred_best, average='weighted')
215
+ test_acc = accuracy_score(y_test, y_pred_best)
216
+ test_prec = precision_score(y_test, y_pred_best, average='weighted', zero_division=0)
217
+ test_rec = recall_score(y_test, y_pred_best, average='weighted', zero_division=0)
218
+ improvement = (test_f1 - CHAMPION_F1) / CHAMPION_F1 * 100
219
+
220
+ print('\n TEST SET RESULTS')
221
+ print(f'F1-Score (weighted) : {test_f1:.4f}')
222
+ print(f'Accuracy : {test_acc:.4f}')
223
+ print(f'Precision : {test_prec:.4f}')
224
+ print(f'Recall : {test_rec:.4f}')
225
+ print(f'\n A4 original champion F1 : {CHAMPION_F1:.4f}')
226
+
227
+ test_rows = []
228
+ for name, model in model_objects.items():
229
+ model.fit(X_train_scaled, y_train)
230
+ preds = model.predict(X_test_scaled)
231
+ test_rows.append({
232
+ 'Model' : name,
233
+ 'Test_F1' : f1_score(y_test, preds, average='weighted'),
234
+ 'Test_Acc' : accuracy_score(y_test, preds),
235
+ 'Test_Prec' : precision_score(y_test, preds, average='weighted', zero_division=0),
236
+ 'Test_Recall': recall_score(y_test, preds, average='weighted', zero_division=0),
237
+ })
238
+
239
+ test_results_df = pd.DataFrame(test_rows).sort_values('Test_F1', ascending=False)
240
+ print('TEST SET COMPARISON – ALL MODELS')
241
+ print(test_results_df.to_string(index=False))
242
+
243
+ print(f'CLASSIFICATION REPORT: {best_name}')
244
+ print(classification_report(y_test, y_pred_best, zero_division=0))
245
+
246
+ # save model
247
+ artifact = {
248
+ 'model' : best_model,
249
+ 'model_name' : best_name,
250
+ 'scaler' : scaler,
251
+ 'feature_columns' : feature_columns,
252
+ 'cv_metrics': {
253
+ 'f1_mean' : float(results_df.iloc[0]['F1_mean']),
254
+ 'f1_std' : float(results_df.iloc[0]['F1_std']),
255
+ 'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
256
+ },
257
+ 'test_metrics': {
258
+ 'f1' : float(test_f1),
259
+ 'accuracy' : float(test_acc),
260
+ 'precision': float(test_prec),
261
+ 'recall' : float(test_rec),
262
+ },
263
+ 'a4_champion_f1' : CHAMPION_F1,
264
+ 'improvement_pct': float(improvement),
265
+ }
266
+
267
+ out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
268
+ with open(out_path, 'wb') as f:
269
+ pickle.dump(artifact, f)
270
+
271
+ print(f'Saved: {out_path}')
A5b/cv_baseline.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+
11
+ from sklearn.model_selection import (
12
+ StratifiedKFold, cross_validate
13
+ )
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.metrics import (
16
+ accuracy_score, precision_score, recall_score, f1_score,
17
+ classification_report, confusion_matrix
18
+ )
19
+ from sklearn.linear_model import LogisticRegression
20
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
21
+ from sklearn.neighbors import KNeighborsClassifier
22
+ from sklearn.naive_bayes import GaussianNB
23
+ from sklearn.ensemble import (
24
+ RandomForestClassifier,
25
+ VotingClassifier,
26
+ BaggingClassifier,
27
+ StackingClassifier,
28
+ )
29
+ import xgboost as xgb
30
+ import lightgbm as lgb
31
+ warnings.filterwarnings('ignore')
32
+ np.random.seed(42)
33
+
34
+ REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
35
+ DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
36
+ OUT_DIR = Path('models')
37
+ OUT_DIR.mkdir(exist_ok=True)
38
+
39
+ RANDOM_STATE = 42
40
+ N_SPLITS = 5
41
+ CHAMPION_F1 = 0.6110 # Score from A4
42
+
43
+ movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
44
+ weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
45
+
46
+ print('Movement features shape:', movement_features_df.shape)
47
+ print('Weak link scores shape:', weaklink_scores_df.shape)
48
+
49
+ DUPLICATE_NASM_COLS = [
50
+ 'No_1_NASM_Deviation',
51
+ 'No_2_NASM_Deviation',
52
+ 'No_3_NASM_Deviation',
53
+ 'No_4_NASM_Deviation',
54
+ 'No_5_NASM_Deviation',
55
+ ]
56
+
57
+ movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
58
+ print('Shape after duplicate removal:', movement_features_df.shape)
59
+
60
+ weaklink_categories = [
61
+ 'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
62
+ 'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
63
+ 'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
64
+ 'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
65
+ 'RightKneeMovesOutward', 'RightShoulderElevation',
66
+ ]
67
+
68
+ weaklink_scores_df['WeakestLink'] = (
69
+ weaklink_scores_df[weaklink_categories].idxmax(axis=1)
70
+ )
71
+ print('Weakest Link class distribution:')
72
+ print(weaklink_scores_df['WeakestLink'].value_counts())
73
+
74
+ # Merge Datasets
75
+ target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
76
+ merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
77
+ print('Merged dataset shape:', merged_df.shape)
78
+
79
+ EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
80
+ feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
81
+
82
+ X = merged_df[feature_columns].values
83
+ y = merged_df['WeakestLink'].values
84
+
85
+ print(f'Feature matrix shape : {X.shape}')
86
+ print(f'Number of features : {len(feature_columns)}')
87
+ print(f'Number of classes : {len(np.unique(y))}')
88
+
89
+ scaler = StandardScaler()
90
+ X_scaled = scaler.fit_transform(X)
91
+
92
+ print(f'Total samples : {X.shape[0]}')
93
+
94
+ cv_strategy = StratifiedKFold(
95
+ n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
96
+ )
97
+
98
+ def evaluate_cv(model, X, y, cv, name='Model'):
99
+ scoring = {
100
+ 'accuracy' : 'accuracy',
101
+ 'f1' : 'f1_weighted',
102
+ 'precision': 'precision_weighted',
103
+ 'recall' : 'recall_weighted',
104
+ }
105
+ cv_res = cross_validate(model, X, y, cv=cv, scoring=scoring)
106
+ return {
107
+ 'Model' : name,
108
+ 'Accuracy_mean' : cv_res['test_accuracy'].mean(),
109
+ 'Accuracy_std' : cv_res['test_accuracy'].std(),
110
+ 'F1_mean' : cv_res['test_f1'].mean(),
111
+ 'F1_std' : cv_res['test_f1'].std(),
112
+ 'Precision_mean': cv_res['test_precision'].mean(),
113
+ 'Recall_mean' : cv_res['test_recall'].mean(),
114
+ '_f1_scores' : cv_res['test_f1'],
115
+ }
116
+
117
+ rf_champion = RandomForestClassifier(
118
+ n_estimators=200, max_depth=15,
119
+ min_samples_split=5, min_samples_leaf=2,
120
+ class_weight='balanced',
121
+ random_state=RANDOM_STATE, n_jobs=-1
122
+ )
123
+ champ_cv = evaluate_cv(
124
+ rf_champion, X_scaled, y, cv_strategy,
125
+ name='A4 Champion – Random Forest'
126
+ )
127
+
128
+ soft_voting = VotingClassifier(
129
+ estimators=[
130
+ ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',
131
+ random_state=RANDOM_STATE, n_jobs=-1)),
132
+ ('lr', LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),
133
+ ('xgb', xgb.XGBClassifier( n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,
134
+ colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),
135
+ ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,
136
+ random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),
137
+ ('knn', KNeighborsClassifier(n_neighbors=7)),
138
+ ('lda', LinearDiscriminantAnalysis()),
139
+ ],
140
+ voting='soft',
141
+ n_jobs=-1,
142
+ )
143
+
144
+ sv_cv = evaluate_cv(soft_voting, X_scaled, y, cv_strategy, name='Soft Voting')
145
+
146
+ all_results = [champ_cv, sv_cv]
147
+ results_df = (
148
+ pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
149
+ for r in all_results])
150
+ .sort_values('F1_mean', ascending=False)
151
+ .reset_index(drop=True)
152
+ )
153
+
154
+ print('5-FOLD CROSS-VALIDATION SUMMARY')
155
+ print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
156
+ 'Precision_mean','Recall_mean']].to_string(index=False))
157
+
158
+ # Statistical Significance Test (t-test)
159
+ def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
160
+ k = len(scores_a)
161
+ diff = scores_a - scores_b
162
+ d_bar = diff.mean()
163
+ s_sq = diff.var(ddof=1)
164
+ var_corr = (1/k + n_test/n_train) * s_sq
165
+ t_stat = d_bar / np.sqrt(var_corr)
166
+ p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))
167
+ return float(t_stat), float(p_value)
168
+
169
+ n_total = len(X_scaled)
170
+ n_test_fold = n_total // N_SPLITS
171
+ n_train_fold = n_total - n_test_fold
172
+
173
+ result_map = {r['Model']: r['_f1_scores'] for r in all_results}
174
+ champ_scores = result_map['A4 Champion – Random Forest']
175
+
176
+ print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
177
+ for r in all_results:
178
+ if 'Champion' in r['Model']:
179
+ continue
180
+ t, p = corrected_resampled_ttest(
181
+ r['_f1_scores'], champ_scores, n_train_fold, n_test_fold
182
+ )
183
+ print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
184
+
185
+ # Save model using cross-validation (fit on all data)
186
+ model_objects = {
187
+ 'Soft Voting' : soft_voting,
188
+ 'A4 Champion – Random Forest': rf_champion,
189
+ }
190
+
191
+ best_name = results_df.iloc[0]['Model']
192
+ best_model = model_objects[best_name]
193
+
194
+ print(f'CHAMPION ENSEMBLE: {best_name}')
195
+ print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
196
+
197
+ # Fit best model on all data for final deployment
198
+ best_model.fit(X_scaled, y)
199
+
200
+ # Save model
201
+ artifact = {
202
+ 'model' : best_model,
203
+ 'model_name' : best_name,
204
+ 'scaler' : scaler,
205
+ 'feature_columns' : feature_columns,
206
+ 'cv_metrics': {
207
+ 'f1_mean' : float(results_df.iloc[0]['F1_mean']),
208
+ 'f1_std' : float(results_df.iloc[0]['F1_std']),
209
+ 'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
210
+ 'precision_mean': float(results_df.iloc[0]['Precision_mean']),
211
+ 'recall_mean' : float(results_df.iloc[0]['Recall_mean']),
212
+ },
213
+ 'a4_champion_f1' : CHAMPION_F1,
214
+ }
215
+
216
+ out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
217
+ with open(out_path, 'wb') as f:
218
+ pickle.dump(artifact, f)
219
+
220
+ print(f'Saved: {out_path}')
A5b/models/adaboost_classification.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d34c717b5f5dc02f4481f3207afcacb94ceb3ec69069589ca6abe435c8001470
3
+ size 725059
A5b/models/adaboost_feature_importance.png ADDED

Git LFS Details

  • SHA256: aecc76dbe9ce90a4813a3b7040d1e57ee324ec1d71f10303b129c41cbc3ad744
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
A5b/models/bagging_trees_champion.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2475e28a11e89c0c4544064525f6d41d7890e19c5549575ac597d64e076616e
3
+ size 6506122
A5b/models/ensemble_classification_champion.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93cecc19fe3e22c357af6ee6778990d7e3b518a36bbb6418a78ecb6795ef4cce
3
+ size 30798315