Spaces:
Sleeping
Sleeping
Bachstelze committed on
Commit ·
f5e4068
1
Parent(s): 54eac1a
test baseline with cv only
Browse files- A5b/classification_adaboost.py +466 -0
- A5b/classification_bagging_trees.py +374 -0
- A5b/classification_baseline.py +271 -0
- A5b/cv_baseline.py +220 -0
- A5b/models/adaboost_classification.pkl +3 -0
- A5b/models/adaboost_feature_importance.png +3 -0
- A5b/models/bagging_trees_champion.pkl +3 -0
- A5b/models/ensemble_classification_champion.pkl +3 -0
A5b/classification_adaboost.py
ADDED
|
@@ -0,0 +1,466 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
import warnings
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
import seaborn as sns
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from scipy import stats
|
| 10 |
+
from typing import List, Tuple, Dict, Any
|
| 11 |
+
|
| 12 |
+
from sklearn.model_selection import (
|
| 13 |
+
train_test_split, StratifiedKFold, cross_validate
|
| 14 |
+
)
|
| 15 |
+
from sklearn.base import BaseEstimator, ClassifierMixin
|
| 16 |
+
from sklearn.preprocessing import StandardScaler
|
| 17 |
+
from sklearn.metrics import (
|
| 18 |
+
accuracy_score, precision_score, recall_score, f1_score,
|
| 19 |
+
classification_report, confusion_matrix
|
| 20 |
+
)
|
| 21 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 22 |
+
from sklearn.ensemble import (
|
| 23 |
+
RandomForestClassifier,
|
| 24 |
+
VotingClassifier,
|
| 25 |
+
BaggingClassifier,
|
| 26 |
+
StackingClassifier,
|
| 27 |
+
)
|
| 28 |
+
import xgboost as xgb
|
| 29 |
+
import lightgbm as lgb
|
| 30 |
+
|
| 31 |
+
# Suppress library warnings and pin NumPy's global seed so reruns match.
warnings.filterwarnings('ignore')
np.random.seed(42)

# Input data is looked up in "<parent of the current working directory>/
# Datasets_all"; artifacts go to "./models", created here if it is missing.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
OUT_DIR = Path('models')
OUT_DIR.mkdir(exist_ok=True)

# Shared experiment settings.
RANDOM_STATE = 42
N_SPLITS = 5
CHAMPION_F1 = 0.6110  # Score from A4
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class WeightedDecisionTree(DecisionTreeClassifier):
    """Decision tree base learner used by the boosting ensemble.

    A thin subclass of ``DecisionTreeClassifier`` that exposes only the four
    hyper-parameters the ensemble sets. Sample weighting itself is handled by
    the parent class; ``fit`` simply forwards the weights.
    """

    def __init__(self, max_depth: int = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, random_state: int = 42):
        super().__init__(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit the tree, optionally weighting each training sample.

        ``check_input`` is forwarded unchanged so the override keeps the same
        call signature as ``DecisionTreeClassifier.fit``; callers that pass it
        no longer fail with a ``TypeError``.
        """
        return super().fit(
            X, y, sample_weight=sample_weight, check_input=check_input
        )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class AdaBoostEnsemble(ClassifierMixin, BaseEstimator):
    """Multi-class AdaBoost (SAMME) over depth-limited decision trees.

    The algorithm:
    1. Initialize equal weights for all training samples.
    2. For each boosting round:
        - train a decision tree on the weighted data,
        - compute its weighted error rate ``e``,
        - compute its vote weight
          ``alpha = log((1 - e) / e) + log(K - 1)`` for ``K`` classes,
        - multiply the weight of every misclassified sample by
          ``exp(alpha)`` and renormalize (which lowers the relative weight
          of correctly classified samples).
    3. Predict by alpha-weighted voting.

    The ``log(K - 1)`` term keeps ``alpha`` positive for any learner that
    beats random guessing (``e < 1 - 1/K``); without it a useful multi-class
    learner with ``e > 0.5`` would receive a negative vote.

    The mixin is listed before ``BaseEstimator`` as recommended by the
    scikit-learn developer guide.
    """

    def __init__(
        self,
        n_estimators: int = 50,
        max_depth: int = 5,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        random_state: int = 42
    ):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        # Fitted state; rebuilt from scratch on every call to fit().
        self.trees: List[WeightedDecisionTree] = []
        self.tree_weights: List[float] = []
        self.n_classes: int = 0
        self.classes_: np.ndarray = None

    def _initialize_weights(self, n_samples: int) -> np.ndarray:
        """Return uniform sample weights that sum to one."""
        return np.ones(n_samples) / n_samples

    def _update_weights(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray,
        alpha: float
    ) -> np.ndarray:
        """Re-weight samples after one boosting round.

        Misclassified samples are multiplied by ``exp(alpha)``; correctly
        classified samples are left untouched and lose relative weight only
        through the final normalization.
        """
        misclassified = y_true != y_pred
        updated_weights = weights * np.exp(alpha * misclassified.astype(float))
        return updated_weights / updated_weights.sum()

    def _compute_weighted_error(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray
    ) -> float:
        """Return the weight fraction of misclassified samples."""
        misclassified = (y_true != y_pred).astype(float)
        return np.sum(weights * misclassified) / np.sum(weights)

    def _compute_alpha(self, error: float) -> float:
        """Return the SAMME vote weight for a learner with the given error.

        The extremes are capped at +/-10 to avoid ``log(0)`` and division by
        zero. ``n_classes`` is floored at 2 so the method is usable before
        ``fit`` has recorded the class count.
        """
        if error <= 0:
            return 10.0  # Very high weight for perfect classifier
        if error >= 1:
            return -10.0  # Very negative weight for completely wrong classifier
        n_classes = max(self.n_classes, 2)
        return float(np.log((1 - error) / error) + np.log(n_classes - 1))

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'AdaBoostEnsemble':
        """Fit the ensemble, discarding any previously fitted trees.

        Boosting stops early when a learner is perfect (further rounds would
        see unchanged weights) or no better than chance (``alpha <= 0``).
        A no-better-than-chance learner is dropped, unless it is the first
        one, in which case it is kept with weight 1.0 so the ensemble is
        never empty.
        """
        y = np.asarray(y)
        n_samples = X.shape[0]
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)

        # Reset fitted state so that repeated fit() calls do not accumulate
        # learners from earlier calls.
        self.trees = []
        self.tree_weights = []

        weights = self._initialize_weights(n_samples)

        for i in range(self.n_estimators):
            tree = WeightedDecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state + i
            )
            tree.fit(X, y, sample_weight=weights)
            y_pred = tree.predict(X)

            error = self._compute_weighted_error(weights, y, y_pred)
            alpha = self._compute_alpha(error)

            if alpha <= 0:
                # Learner is no better than random guessing.
                if not self.trees:
                    self.trees.append(tree)
                    self.tree_weights.append(1.0)
                break

            self.trees.append(tree)
            self.tree_weights.append(alpha)
            print(f"Tree {i+1}/{self.n_estimators}: Error={error:.4f}, Alpha={alpha:.4f}")

            if error <= 0:
                break

            weights = self._update_weights(weights, y, y_pred, alpha)

        return self

    def _weighted_votes(self, X: np.ndarray) -> np.ndarray:
        """Return an (n_samples, n_classes) matrix of alpha-weighted votes."""
        if not self.trees:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "AdaBoostEnsemble must be fitted before it can predict."
            )
        classes = self.classes_
        votes = np.zeros((X.shape[0], len(classes)))
        for tree, alpha in zip(self.trees, self.tree_weights):
            predictions = tree.predict(X)
            # One-hot encode the tree's predictions and scale by its weight.
            votes += alpha * (predictions[:, None] == classes[None, :])
        return votes

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the class with the highest weighted vote."""
        votes = self._weighted_votes(X)
        return self.classes_[np.argmax(votes, axis=1)]

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return each class's share of the total vote weight.

        Uses the same votes as ``predict``, so the most probable class is
        always the predicted class. Every tree casts exactly one vote per
        sample, hence each row sums to one.
        """
        votes = self._weighted_votes(X)
        return votes / sum(self.tree_weights)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def evaluate_cv(model, X, y, cv, name='Model'):
    """Cross-validate *model* and summarize its weighted metrics.

    Runs ``cross_validate`` with accuracy and support-weighted F1, precision
    and recall, then returns a dict holding the display name, the mean (and
    for accuracy/F1 the standard deviation) over folds, and the raw per-fold
    F1 scores under ``'_f1_scores'``.
    """
    scorers = dict(
        accuracy='accuracy',
        f1='f1_weighted',
        precision='precision_weighted',
        recall='recall_weighted',
    )
    folds = cross_validate(model, X, y, cv=cv, scoring=scorers)
    acc_per_fold = folds['test_accuracy']
    f1_per_fold = folds['test_f1']

    summary = {'Model': name}
    summary['Accuracy_mean'] = acc_per_fold.mean()
    summary['Accuracy_std'] = acc_per_fold.std()
    summary['F1_mean'] = f1_per_fold.mean()
    summary['F1_std'] = f1_per_fold.std()
    summary['Precision_mean'] = folds['test_precision'].mean()
    summary['Recall_mean'] = folds['test_recall'].mean()
    summary['_f1_scores'] = f1_per_fold
    return summary
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# Load data
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)

# Columns dropped from the feature table before modelling.
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]

movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
print('Shape after duplicate removal:', movement_features_df.shape)

weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]

# Target label: the category column holding the row-wise maximum score.
weaklink_scores_df['WeakestLink'] = (
    weaklink_scores_df[weaklink_categories].idxmax(axis=1)
)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())

# Merge Datasets
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)

EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]

X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values

print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training samples : {X_train.shape[0]}')
print(f'Test samples : {X_test.shape[0]}')

cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)

# Train AdaBoost ensemble
print("\n" + "="*60)
print("TRAINING ADABOOST ENSEMBLE")
print("="*60)

adaboost_model = AdaBoostEnsemble(
    n_estimators=50,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_STATE
)

# Cross-validation. cross_validate fits clones of the estimator, so the model
# does not need to be fitted beforehand.
adaboost_cv = evaluate_cv(
    adaboost_model, X_train_scaled, y_train, cv_strategy,
    name='AdaBoost Ensemble'
)

# Test set evaluation. This is the single fit of adaboost_model itself;
# fitting it both before and after CV made the second call append another
# 50 trees to the 50 already stored on the instance.
adaboost_model.fit(X_train_scaled, y_train)
y_pred_adaboost = adaboost_model.predict(X_test_scaled)

test_f1_adaboost = f1_score(y_test, y_pred_adaboost, average='weighted')
test_acc_adaboost = accuracy_score(y_test, y_pred_adaboost)
test_prec_adaboost = precision_score(y_test, y_pred_adaboost, average='weighted', zero_division=0)
test_rec_adaboost = recall_score(y_test, y_pred_adaboost, average='weighted', zero_division=0)

print("\n" + "="*60)
print("ADABOOST RESULTS")
print("="*60)
print(f'CV F1: {adaboost_cv["F1_mean"]:.4f} +/- {adaboost_cv["F1_std"]:.4f}')
print(f'Test F1: {test_f1_adaboost:.4f}')
print(f'Test Accuracy: {test_acc_adaboost:.4f}')
print(f'Test Precision: {test_prec_adaboost:.4f}')
print(f'Test Recall: {test_rec_adaboost:.4f}')

# Compare with baseline models
rf_champion = RandomForestClassifier(
    n_estimators=200, max_depth=15,
    min_samples_split=5, min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_STATE, n_jobs=-1
)

rf_cv = evaluate_cv(
    rf_champion, X_train_scaled, y_train, cv_strategy,
    name='Random Forest (Baseline)'
)

rf_champion.fit(X_train_scaled, y_train)
y_pred_rf = rf_champion.predict(X_test_scaled)
test_f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print("\n" + "="*60)
print("COMPARISON WITH BASELINE")
print("="*60)
print(f'Random Forest CV F1: {rf_cv["F1_mean"]:.4f} +/- {rf_cv["F1_std"]:.4f}')
print(f'Random Forest Test F1: {test_f1_rf:.4f}')
|
| 373 |
+
|
| 374 |
+
# Statistical significance test
|
| 375 |
+
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """Compare two models' paired per-fold scores with a corrected t-test.

    The variance of the mean score difference is inflated by
    ``1/k + n_test/n_train`` (the Nadeau-Bengio correction) to account for
    the overlap between training sets of different folds.

    Args:
        scores_a: Per-fold scores of the first model (any array-like).
        scores_b: Per-fold scores of the second model, paired with scores_a.
        n_train: Number of training samples per fold.
        n_test: Number of test samples per fold.

    Returns:
        ``(t_statistic, two_sided_p_value)`` as plain floats. When every
        paired difference is identical the variance is zero: the result is
        ``(0.0, 1.0)`` if the scores are equal, otherwise ``(+/-inf, 0.0)``.

    Raises:
        ValueError: If fewer than two paired scores are supplied, because
            the sample variance is undefined.
    """
    # Accept lists as well as arrays.
    diff = np.asarray(scores_a, dtype=float) - np.asarray(scores_b, dtype=float)
    k = diff.size
    if k < 2:
        raise ValueError("corrected_resampled_ttest needs at least two paired scores")

    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    var_corr = (1 / k + n_test / n_train) * s_sq

    if var_corr == 0:
        # Avoid 0/0 (NaN) when both models score identically on every fold.
        if d_bar == 0:
            return 0.0, 1.0
        return float(np.copysign(np.inf, d_bar)), 0.0

    t_stat = d_bar / np.sqrt(var_corr)
    # The survival function is more accurate than 1 - cdf in the far tail.
    p_value = 2 * stats.t.sf(abs(t_stat), df=k - 1)
    return float(t_stat), float(p_value)
|
| 384 |
+
|
| 385 |
+
# Approximate per-fold train/test sizes passed to the corrected t-test.
n_total = len(X_train_scaled)
n_test_fold = n_total // N_SPLITS
n_train_fold = n_total - n_test_fold

# Per-fold F1 scores of the two models being compared.
result_map = {
    'AdaBoost Ensemble': adaboost_cv['_f1_scores'],
    'Random Forest': rf_cv['_f1_scores']
}
adaboost_scores = result_map['AdaBoost Ensemble']
rf_scores = result_map['Random Forest']

t, p = corrected_resampled_ttest(adaboost_scores, rf_scores, n_train_fold, n_test_fold)
significance = 'Yes' if p < 0.05 else 'No'
print("\nStatistical Test (AdaBoost vs Random Forest):")
print(f" t-statistic: {t:+.3f}")
print(f" p-value: {p:.4f}")
print(f" Significant at α=0.05: {significance}")

# Save model: bundle the fitted estimator with its scaler, feature list and
# headline metrics into one pickle.
cv_metrics = {
    'f1_mean': float(adaboost_cv['F1_mean']),
    'f1_std': float(adaboost_cv['F1_std']),
    'accuracy_mean': float(adaboost_cv['Accuracy_mean']),
}
test_metrics = {
    'f1': float(test_f1_adaboost),
    'accuracy': float(test_acc_adaboost),
    'precision': float(test_prec_adaboost),
    'recall': float(test_rec_adaboost),
}
artifact = {
    'model': adaboost_model,
    'model_name': 'AdaBoost Ensemble',
    'scaler': scaler,
    'feature_columns': feature_columns,
    'cv_metrics': cv_metrics,
    'test_metrics': test_metrics,
    'a4_champion_f1': CHAMPION_F1,
    # Relative change of the test F1 versus the stored A4 score, in percent.
    'improvement_pct': float((test_f1_adaboost - CHAMPION_F1) / CHAMPION_F1 * 100),
}

out_path = OUT_DIR / 'adaboost_classification.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(artifact, f)

print(f'\nSaved model to: {out_path}')

# Classification report
print('\nCLASSIFICATION REPORT: AdaBoost Ensemble')
print(classification_report(y_test, y_pred_adaboost, zero_division=0))

# Feature importance analysis (simplified)
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Unweighted mean of the per-tree importances (tree vote weights are ignored).
all_importances = sum(
    (tree.feature_importances_ for tree in adaboost_model.trees),
    np.zeros(len(feature_columns)),
)
avg_importances = all_importances / len(adaboost_model.trees)

importance_df = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': avg_importances
})
importance_df = importance_df.sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10).to_string(index=False))

# Plot feature importance: horizontal bars, most important feature on top.
plot_path = OUT_DIR / 'adaboost_feature_importance.png'
top_features = importance_df.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['Importance'].values)
plt.yticks(bar_positions, top_features['Feature'].values)
plt.xlabel('Average Feature Importance')
plt.ylabel('Features')
plt.title('Top 15 Feature Importance - AdaBoost Ensemble')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(plot_path, dpi=150)
plt.close()

print(f"\nSaved feature importance plot to: {plot_path}")
|
A5b/classification_bagging_trees.py
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
import warnings
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
import seaborn as sns
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from scipy import stats
|
| 10 |
+
|
| 11 |
+
from sklearn.model_selection import StratifiedKFold, cross_validate
|
| 12 |
+
from sklearn.preprocessing import StandardScaler
|
| 13 |
+
from sklearn.metrics import (
|
| 14 |
+
accuracy_score, precision_score, recall_score, f1_score,
|
| 15 |
+
classification_report, confusion_matrix
|
| 16 |
+
)
|
| 17 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 18 |
+
from sklearn.ensemble import BaggingClassifier
|
| 19 |
+
import xgboost as xgb
|
| 20 |
+
import lightgbm as lgb
|
| 21 |
+
|
| 22 |
+
# Suppress library warnings and pin NumPy's global seed so reruns match.
warnings.filterwarnings('ignore')
np.random.seed(42)

# Input data is looked up in "<parent of the current working directory>/
# Datasets_all"; artifacts go to "./models", created here if it is missing.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
OUT_DIR = Path('models')
OUT_DIR.mkdir(exist_ok=True)

RANDOM_STATE = 42
N_SPLITS = 5
CHAMPION_F1 = 0.6110  # Score from A4

movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)

# Columns dropped from the feature table before modelling.
DUPLICATE_NASM_COLS = [f'No_{i}_NASM_Deviation' for i in range(1, 6)]

movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
print('Shape after duplicate removal:', movement_features_df.shape)

weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]

# Target label: the category column holding the row-wise maximum score.
category_scores = weaklink_scores_df[weaklink_categories]
weaklink_scores_df['WeakestLink'] = category_scores.idxmax(axis=1)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())

# Merge Datasets
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)

EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]

X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values

print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')

# Encode string labels to integers for XGBoost/LightGBM compatibility
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# NOTE(review): the scaler is fitted on the full dataset before
# cross-validation, so each fold's held-out rows contribute to the scaling
# statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)
|
| 91 |
+
|
| 92 |
+
def evaluate_cv(model, X, y, cv, name='Model', use_encoded_labels=False):
    """Cross-validate *model* and summarize its weighted metrics.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation splitter passed to ``cross_validate``.
        name: Display name stored under ``'Model'`` in the result.
        use_encoded_labels: When true, the labels in ``y`` are mapped to
            integer codes (position in the sorted unique labels) before
            fitting, for estimators that require numeric targets.

    Returns:
        A dict with the display name, the mean (and for accuracy/F1 the
        standard deviation) of each metric over folds, and the raw per-fold
        F1 scores under ``'_f1_scores'``.
    """
    scoring = {
        'accuracy': 'accuracy',
        'f1': 'f1_weighted',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
    }
    if use_encoded_labels:
        # Encode the labels actually passed in rather than reading a
        # module-level global; sorted-unique positions are the same codes a
        # LabelEncoder fitted on y would assign.
        _, y_to_use = np.unique(np.asarray(y), return_inverse=True)
    else:
        y_to_use = y
    cv_res = cross_validate(model, X, y_to_use, cv=cv, scoring=scoring)
    return {
        'Model': name,
        'Accuracy_mean': cv_res['test_accuracy'].mean(),
        'Accuracy_std': cv_res['test_accuracy'].std(),
        'F1_mean': cv_res['test_f1'].mean(),
        'F1_std': cv_res['test_f1'].std(),
        'Precision_mean': cv_res['test_precision'].mean(),
        'Recall_mean': cv_res['test_recall'].mean(),
        '_f1_scores': cv_res['test_f1'],
    }
|
| 111 |
+
|
| 112 |
+
# Baseline: Single Decision Tree
|
| 113 |
+
single_tree = DecisionTreeClassifier(
|
| 114 |
+
max_depth=15,
|
| 115 |
+
min_samples_split=5,
|
| 116 |
+
min_samples_leaf=2,
|
| 117 |
+
class_weight='balanced',
|
| 118 |
+
random_state=RANDOM_STATE
|
| 119 |
+
)
|
| 120 |
+
single_tree_cv = evaluate_cv(
|
| 121 |
+
single_tree, X_scaled, y, cv_strategy,
|
| 122 |
+
name='Single Decision Tree'
|
| 123 |
+
)
|
| 124 |
+
print('SINGLE DECISION TREE')
|
| 125 |
+
print(f'CV F1: {single_tree_cv["F1_mean"]:.4f} +/- {single_tree_cv["F1_std"]:.4f}')
|
| 126 |
+
|
| 127 |
+
# Bagging with Decision Trees (default: uses all features)
|
| 128 |
+
bagging_default = BaggingClassifier(
|
| 129 |
+
estimator=DecisionTreeClassifier(
|
| 130 |
+
max_depth=15,
|
| 131 |
+
min_samples_split=5,
|
| 132 |
+
min_samples_leaf=2,
|
| 133 |
+
class_weight='balanced',
|
| 134 |
+
random_state=RANDOM_STATE
|
| 135 |
+
),
|
| 136 |
+
n_estimators=200,
|
| 137 |
+
max_samples=1.0, # Bootstrap sample size (100% of training data)
|
| 138 |
+
max_features=1.0, # Use all features
|
| 139 |
+
bootstrap=True,
|
| 140 |
+
bootstrap_features=False, # Don't subsample features
|
| 141 |
+
n_jobs=-1,
|
| 142 |
+
random_state=RANDOM_STATE
|
| 143 |
+
)
|
| 144 |
+
bagging_default_cv = evaluate_cv(
|
| 145 |
+
bagging_default, X_scaled, y, cv_strategy,
|
| 146 |
+
name='Bagging (All Features)'
|
| 147 |
+
)
|
| 148 |
+
print(f'Bagging (All Features) CV F1: {bagging_default_cv["F1_mean"]:.4f} +/- {bagging_default_cv["F1_std"]:.4f}')
|
| 149 |
+
|
| 150 |
+
# Bagging with Decision Trees + Feature Subsetting (Random Subspace Method)
|
| 151 |
+
# This creates trees using random subsets of predictors
|
| 152 |
+
bagging_subspace = BaggingClassifier(
|
| 153 |
+
estimator=DecisionTreeClassifier(
|
| 154 |
+
max_depth=15,
|
| 155 |
+
min_samples_split=5,
|
| 156 |
+
min_samples_leaf=2,
|
| 157 |
+
class_weight='balanced',
|
| 158 |
+
random_state=RANDOM_STATE
|
| 159 |
+
),
|
| 160 |
+
n_estimators=200,
|
| 161 |
+
max_samples=1.0,
|
| 162 |
+
max_features=0.7, # Use 70% of features for each tree
|
| 163 |
+
bootstrap=True,
|
| 164 |
+
bootstrap_features=True, # Subsample features for each tree
|
| 165 |
+
n_jobs=-1,
|
| 166 |
+
random_state=RANDOM_STATE
|
| 167 |
+
)
|
| 168 |
+
bagging_subspace_cv = evaluate_cv(
|
| 169 |
+
bagging_subspace, X_scaled, y, cv_strategy,
|
| 170 |
+
name='Bagging (70% Features)'
|
| 171 |
+
)
|
| 172 |
+
print(f'Bagging (70% Features) CV F1: {bagging_subspace_cv["F1_mean"]:.4f} +/- {bagging_subspace_cv["F1_std"]:.4f}')
|
| 173 |
+
|
| 174 |
+
# Bagging with smaller feature subset (50%)
|
| 175 |
+
bagging_50features = BaggingClassifier(
|
| 176 |
+
estimator=DecisionTreeClassifier(
|
| 177 |
+
max_depth=15,
|
| 178 |
+
min_samples_split=5,
|
| 179 |
+
min_samples_leaf=2,
|
| 180 |
+
class_weight='balanced',
|
| 181 |
+
random_state=RANDOM_STATE
|
| 182 |
+
),
|
| 183 |
+
n_estimators=200,
|
| 184 |
+
max_samples=1.0,
|
| 185 |
+
max_features=0.5, # Use 50% of features for each tree
|
| 186 |
+
bootstrap=True,
|
| 187 |
+
bootstrap_features=True,
|
| 188 |
+
n_jobs=-1,
|
| 189 |
+
random_state=RANDOM_STATE
|
| 190 |
+
)
|
| 191 |
+
bagging_50features_cv = evaluate_cv(
|
| 192 |
+
bagging_50features, X_scaled, y, cv_strategy,
|
| 193 |
+
name='Bagging (50% Features)'
|
| 194 |
+
)
|
| 195 |
+
print(f'Bagging (50% Features) CV F1: {bagging_50features_cv["F1_mean"]:.4f} +/- {bagging_50features_cv["F1_std"]:.4f}')
|
| 196 |
+
|
| 197 |
+
# Bagging with even smaller feature subset (30%)
|
| 198 |
+
bagging_30features = BaggingClassifier(
|
| 199 |
+
estimator=DecisionTreeClassifier(
|
| 200 |
+
max_depth=15,
|
| 201 |
+
min_samples_split=5,
|
| 202 |
+
min_samples_leaf=2,
|
| 203 |
+
class_weight='balanced',
|
| 204 |
+
random_state=RANDOM_STATE
|
| 205 |
+
),
|
| 206 |
+
n_estimators=200,
|
| 207 |
+
max_samples=1.0,
|
| 208 |
+
max_features=0.3, # Use 30% of features for each tree
|
| 209 |
+
bootstrap=True,
|
| 210 |
+
bootstrap_features=True,
|
| 211 |
+
n_jobs=-1,
|
| 212 |
+
random_state=RANDOM_STATE
|
| 213 |
+
)
|
| 214 |
+
bagging_30features_cv = evaluate_cv(
|
| 215 |
+
bagging_30features, X_scaled, y, cv_strategy,
|
| 216 |
+
name='Bagging (30% Features)'
|
| 217 |
+
)
|
| 218 |
+
print(f'Bagging (30% Features) CV F1: {bagging_30features_cv["F1_mean"]:.4f} +/- {bagging_30features_cv["F1_std"]:.4f}')
|
| 219 |
+
|
| 220 |
+
# Compare with Random Forest (for reference)
|
| 221 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 222 |
+
rf_model = RandomForestClassifier(
|
| 223 |
+
n_estimators=200,
|
| 224 |
+
max_depth=15,
|
| 225 |
+
min_samples_split=5,
|
| 226 |
+
min_samples_leaf=2,
|
| 227 |
+
max_features='sqrt', # sqrt(n_features) - standard random forest
|
| 228 |
+
class_weight='balanced',
|
| 229 |
+
random_state=RANDOM_STATE,
|
| 230 |
+
n_jobs=-1
|
| 231 |
+
)
|
| 232 |
+
rf_cv = evaluate_cv(
|
| 233 |
+
rf_model, X_scaled, y, cv_strategy,
|
| 234 |
+
name='Random Forest (sqrt features)'
|
| 235 |
+
)
|
| 236 |
+
print(f'Random Forest CV F1: {rf_cv["F1_mean"]:.4f} +/- {rf_cv["F1_std"]:.4f}')
|
| 237 |
+
|
| 238 |
+
# Compare with XGBoost and LightGBM (for reference)
|
| 239 |
+
# XGBoost reference model.
# NOTE: XGBClassifier has no `class_weight` parameter. The keyword that used
# to be passed here was ignored by the booster (and the resulting warning was
# hidden by verbosity=0), so it has been dropped. If class re-weighting is
# wanted for this model, pass per-sample weights to fit() instead.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbosity=0
)
|
| 250 |
+
xgb_cv = evaluate_cv(
|
| 251 |
+
xgb_model, X_scaled, y, cv_strategy,
|
| 252 |
+
name='XGBoost',
|
| 253 |
+
use_encoded_labels=True
|
| 254 |
+
)
|
| 255 |
+
print(f'XGBoost CV F1: {xgb_cv["F1_mean"]:.4f} +/- {xgb_cv["F1_std"]:.4f}')
|
| 256 |
+
|
| 257 |
+
lgb_model = lgb.LGBMClassifier(
|
| 258 |
+
n_estimators=200,
|
| 259 |
+
learning_rate=0.1,
|
| 260 |
+
class_weight='balanced',
|
| 261 |
+
subsample=0.8,
|
| 262 |
+
colsample_bytree=0.8,
|
| 263 |
+
random_state=RANDOM_STATE,
|
| 264 |
+
n_jobs=-1,
|
| 265 |
+
verbosity=-1
|
| 266 |
+
)
|
| 267 |
+
lgb_cv = evaluate_cv(
|
| 268 |
+
lgb_model, X_scaled, y, cv_strategy,
|
| 269 |
+
name='LightGBM',
|
| 270 |
+
use_encoded_labels=True
|
| 271 |
+
)
|
| 272 |
+
print(f'LightGBM CV F1: {lgb_cv["F1_mean"]:.4f} +/- {lgb_cv["F1_std"]:.4f}')
|
| 273 |
+
|
| 274 |
+
# Collect all results
|
| 275 |
+
all_results = [
|
| 276 |
+
single_tree_cv,
|
| 277 |
+
bagging_default_cv,
|
| 278 |
+
bagging_subspace_cv,
|
| 279 |
+
bagging_50features_cv,
|
| 280 |
+
bagging_30features_cv,
|
| 281 |
+
rf_cv,
|
| 282 |
+
xgb_cv,
|
| 283 |
+
lgb_cv,
|
| 284 |
+
]
|
| 285 |
+
|
| 286 |
+
results_df = (
|
| 287 |
+
pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
|
| 288 |
+
for r in all_results])
|
| 289 |
+
.sort_values('F1_mean', ascending=False)
|
| 290 |
+
.reset_index(drop=True)
|
| 291 |
+
)
|
| 292 |
+
|
| 293 |
+
print('\n5-FOLD CROSS-VALIDATION SUMMARY')
|
| 294 |
+
print(results_df[['Model', 'F1_mean', 'F1_std', 'Accuracy_mean',
|
| 295 |
+
'Precision_mean', 'Recall_mean']].to_string(index=False))
|
| 296 |
+
|
| 297 |
+
# Statistical Significance Test (t-test)
|
| 298 |
+
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """Paired t-test on per-fold scores with a corrected variance term.

    The sample variance of the fold-wise score differences is scaled by
    (1/k + n_test/n_train) before the t statistic is formed; the two-sided
    p-value is taken from a Student t distribution with k - 1 degrees of
    freedom, where k is the number of folds.

    Args:
        scores_a: Per-fold scores of the first model (array-like, length k).
        scores_b: Per-fold scores of the second model on the same folds.
        n_train: Number of training samples per fold.
        n_test: Number of test samples per fold.

    Returns:
        Tuple (t_stat, p_value) of plain Python floats. Returns (nan, nan)
        when fewer than two folds are given, and (0.0, 1.0) when both score
        vectors are identical.
    """
    # Accept lists/tuples as well as numpy arrays.
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)

    k = len(scores_a)
    if k < 2:
        # A variance with ddof=1 is undefined for fewer than two folds.
        return float('nan'), float('nan')

    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    var_corr = (1 / k + n_test / n_train) * s_sq

    if var_corr == 0:
        # All fold differences are equal. Identical scores mean "no
        # difference at all" rather than the 0/0 = NaN the plain formula
        # would produce; a constant non-zero gap is infinitely significant.
        if d_bar == 0:
            return 0.0, 1.0
        return float(np.copysign(np.inf, d_bar)), 0.0

    t_stat = d_bar / np.sqrt(var_corr)
    # sf() is the survival function (1 - cdf) and keeps precision in the tails.
    p_value = 2 * stats.t.sf(abs(t_stat), df=k - 1)
    return float(t_stat), float(p_value)
|
| 307 |
+
|
| 308 |
+
n_total = len(X_scaled)
|
| 309 |
+
n_test_fold = n_total // N_SPLITS
|
| 310 |
+
n_train_fold = n_total - n_test_fold
|
| 311 |
+
|
| 312 |
+
result_map = {r['Model']: r['_f1_scores'] for r in all_results}
|
| 313 |
+
best_model_name = results_df.iloc[0]['Model']
|
| 314 |
+
best_scores = result_map[best_model_name]
|
| 315 |
+
|
| 316 |
+
print('\nSTATISTICAL SIGNIFICANCE TESTS vs Best Model')
|
| 317 |
+
for r in all_results:
|
| 318 |
+
if r['Model'] == best_model_name:
|
| 319 |
+
continue
|
| 320 |
+
t, p = corrected_resampled_ttest(
|
| 321 |
+
r['_f1_scores'], best_scores, n_train_fold, n_test_fold
|
| 322 |
+
)
|
| 323 |
+
print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
|
| 324 |
+
|
| 325 |
+
# Save the best model
|
| 326 |
+
# Map every CV result label to its estimator. The keys must be exactly the
# `name` values passed to evaluate_cv, because the winning model is looked up
# by the 'Model' label stored in results_df. The random forest was evaluated
# as 'Random Forest (sqrt features)', so that is the key used here.
model_objects = {
    'Single Decision Tree': single_tree,
    'Bagging (All Features)': bagging_default,
    'Bagging (70% Features)': bagging_subspace,
    'Bagging (50% Features)': bagging_50features,
    'Bagging (30% Features)': bagging_30features,
    'Random Forest (sqrt features)': rf_model,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
}
|
| 336 |
+
|
| 337 |
+
best_name = results_df.iloc[0]['Model']
|
| 338 |
+
best_model = model_objects[best_name]
|
| 339 |
+
|
| 340 |
+
print(f'\nBEST MODEL: {best_name}')
|
| 341 |
+
print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
|
| 342 |
+
|
| 343 |
+
# Train final model on all data
|
| 344 |
+
best_model.fit(X_scaled, y_encoded)
|
| 345 |
+
|
| 346 |
+
# Save model artifact
|
| 347 |
+
artifact = {
|
| 348 |
+
'model' : best_model,
|
| 349 |
+
'model_name' : best_name,
|
| 350 |
+
'scaler' : scaler,
|
| 351 |
+
'label_encoder' : label_encoder,
|
| 352 |
+
'feature_columns' : feature_columns,
|
| 353 |
+
'cv_metrics': {
|
| 354 |
+
'f1_mean' : float(results_df.iloc[0]['F1_mean']),
|
| 355 |
+
'f1_std' : float(results_df.iloc[0]['F1_std']),
|
| 356 |
+
'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
|
| 357 |
+
},
|
| 358 |
+
'a4_champion_f1' : CHAMPION_F1,
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
out_path = OUT_DIR / 'bagging_trees_champion.pkl'
|
| 362 |
+
with open(out_path, 'wb') as f:
|
| 363 |
+
pickle.dump(artifact, f)
|
| 364 |
+
|
| 365 |
+
print(f'\nSaved: {out_path}')
|
| 366 |
+
|
| 367 |
+
# Print feature importances for the best ensemble model
|
| 368 |
+
if hasattr(best_model, 'feature_importances_'):
|
| 369 |
+
importances = best_model.feature_importances_
|
| 370 |
+
indices = np.argsort(importances)[::-1]
|
| 371 |
+
|
| 372 |
+
print(f'\nTop 10 Most Important Features ({best_name}):')
|
| 373 |
+
for i in range(min(10, len(feature_columns))):
|
| 374 |
+
print(f' {i+1}. {feature_columns[indices[i]]}: {importances[indices[i]]:.4f}')
|
A5b/classification_baseline.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
import warnings
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
import seaborn as sns
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from scipy import stats
|
| 10 |
+
|
| 11 |
+
from sklearn.model_selection import (
|
| 12 |
+
train_test_split, StratifiedKFold, cross_validate
|
| 13 |
+
)
|
| 14 |
+
from sklearn.preprocessing import StandardScaler
|
| 15 |
+
from sklearn.metrics import (
|
| 16 |
+
accuracy_score, precision_score, recall_score, f1_score,
|
| 17 |
+
classification_report, confusion_matrix
|
| 18 |
+
)
|
| 19 |
+
from sklearn.linear_model import LogisticRegression
|
| 20 |
+
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
| 21 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 22 |
+
from sklearn.naive_bayes import GaussianNB
|
| 23 |
+
from sklearn.ensemble import (
|
| 24 |
+
RandomForestClassifier,
|
| 25 |
+
VotingClassifier,
|
| 26 |
+
BaggingClassifier,
|
| 27 |
+
StackingClassifier,
|
| 28 |
+
)
|
| 29 |
+
import xgboost as xgb
|
| 30 |
+
import lightgbm as lgb
|
| 31 |
+
warnings.filterwarnings('ignore')
|
| 32 |
+
np.random.seed(42)
|
| 33 |
+
|
| 34 |
+
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
|
| 35 |
+
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
|
| 36 |
+
OUT_DIR = Path('models')
|
| 37 |
+
OUT_DIR.mkdir(exist_ok=True)
|
| 38 |
+
|
| 39 |
+
RANDOM_STATE = 42
|
| 40 |
+
N_SPLITS = 5
|
| 41 |
+
CHAMPION_F1 = 0.6110 # Score from A4
|
| 42 |
+
|
| 43 |
+
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
|
| 44 |
+
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
|
| 45 |
+
|
| 46 |
+
print('Movement features shape:', movement_features_df.shape)
|
| 47 |
+
print('Weak link scores shape:', weaklink_scores_df.shape)
|
| 48 |
+
|
| 49 |
+
DUPLICATE_NASM_COLS = [
|
| 50 |
+
'No_1_NASM_Deviation',
|
| 51 |
+
'No_2_NASM_Deviation',
|
| 52 |
+
'No_3_NASM_Deviation',
|
| 53 |
+
'No_4_NASM_Deviation',
|
| 54 |
+
'No_5_NASM_Deviation',
|
| 55 |
+
]
|
| 56 |
+
|
| 57 |
+
movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
|
| 58 |
+
print('Shape after duplicate removal:', movement_features_df.shape)
|
| 59 |
+
|
| 60 |
+
weaklink_categories = [
|
| 61 |
+
'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
|
| 62 |
+
'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
|
| 63 |
+
'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
|
| 64 |
+
'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
|
| 65 |
+
'RightKneeMovesOutward', 'RightShoulderElevation',
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
weaklink_scores_df['WeakestLink'] = (
|
| 69 |
+
weaklink_scores_df[weaklink_categories].idxmax(axis=1)
|
| 70 |
+
)
|
| 71 |
+
print('Weakest Link class distribution:')
|
| 72 |
+
print(weaklink_scores_df['WeakestLink'].value_counts())
|
| 73 |
+
|
| 74 |
+
# Merge Datasets
|
| 75 |
+
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
|
| 76 |
+
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
|
| 77 |
+
print('Merged dataset shape:', merged_df.shape)
|
| 78 |
+
|
| 79 |
+
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
|
| 80 |
+
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
|
| 81 |
+
|
| 82 |
+
X = merged_df[feature_columns].values
|
| 83 |
+
y = merged_df['WeakestLink'].values
|
| 84 |
+
|
| 85 |
+
print(f'Feature matrix shape : {X.shape}')
|
| 86 |
+
print(f'Number of features : {len(feature_columns)}')
|
| 87 |
+
print(f'Number of classes : {len(np.unique(y))}')
|
| 88 |
+
|
| 89 |
+
# is the training split needed for cross validation?
|
| 90 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
| 91 |
+
X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
scaler = StandardScaler()
|
| 95 |
+
X_train_scaled = scaler.fit_transform(X_train)
|
| 96 |
+
X_test_scaled = scaler.transform(X_test)
|
| 97 |
+
|
| 98 |
+
print(f'Training samples : {X_train.shape[0]}')
|
| 99 |
+
print(f'Test samples : {X_test.shape[0]}')
|
| 100 |
+
|
| 101 |
+
cv_strategy = StratifiedKFold(
|
| 102 |
+
n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
def evaluate_cv(model, X, y, cv, name='Model'):
    """Cross-validate `model` and summarise its classification metrics.

    Calls cross_validate with accuracy plus weighted F1, precision and
    recall scorers, then condenses the per-fold results into one flat dict.

    Args:
        model: Estimator handed to cross_validate.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation splitter.
        name: Label stored under the 'Model' key.

    Returns:
        Dict with the model label, mean/std of accuracy and F1, mean
        precision and recall, and the raw per-fold F1 array ('_f1_scores').
    """
    metric_scorers = {
        'accuracy': 'accuracy',
        'f1': 'f1_weighted',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
    }
    fold_scores = cross_validate(model, X, y, cv=cv, scoring=metric_scorers)

    accuracy_per_fold = fold_scores['test_accuracy']
    f1_per_fold = fold_scores['test_f1']

    # Key order is kept as-is: it determines the column order of the
    # DataFrame later built from these summaries.
    summary = {'Model': name}
    summary['Accuracy_mean'] = accuracy_per_fold.mean()
    summary['Accuracy_std'] = accuracy_per_fold.std()
    summary['F1_mean'] = f1_per_fold.mean()
    summary['F1_std'] = f1_per_fold.std()
    summary['Precision_mean'] = fold_scores['test_precision'].mean()
    summary['Recall_mean'] = fold_scores['test_recall'].mean()
    summary['_f1_scores'] = f1_per_fold
    return summary
|
| 123 |
+
|
| 124 |
+
rf_champion = RandomForestClassifier(
|
| 125 |
+
n_estimators=200, max_depth=15,
|
| 126 |
+
min_samples_split=5, min_samples_leaf=2,
|
| 127 |
+
class_weight='balanced',
|
| 128 |
+
random_state=RANDOM_STATE, n_jobs=-1
|
| 129 |
+
)
|
| 130 |
+
champ_cv = evaluate_cv(
|
| 131 |
+
rf_champion, X_train_scaled, y_train, cv_strategy,
|
| 132 |
+
name='A4 Champion – Random Forest'
|
| 133 |
+
)
|
| 134 |
+
rf_champion.fit(X_train_scaled, y_train)
|
| 135 |
+
champ_test_f1 = f1_score(y_test, rf_champion.predict(X_test_scaled), average='weighted')
|
| 136 |
+
|
| 137 |
+
print('A4 CHAMPION (Random Forest)')
|
| 138 |
+
print(f'CV F1: {champ_cv["F1_mean"]:.4f} +/- {champ_cv["F1_std"]:.4f}')
|
| 139 |
+
print(f'Test F1: {champ_test_f1:.4f}')
|
| 140 |
+
|
| 141 |
+
# Soft-voting ensemble over six heterogeneous base learners.
soft_voting = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(
            n_estimators=200, max_depth=15, min_samples_split=5,
            min_samples_leaf=2, class_weight='balanced_subsample',
            random_state=RANDOM_STATE, n_jobs=-1)),
        ('lr', LogisticRegression(
            max_iter=1000, class_weight='balanced',
            random_state=RANDOM_STATE)),
        # NOTE: XGBClassifier has no `class_weight` parameter; the keyword
        # that used to be passed here was ignored by the booster, so it has
        # been removed. Use sample weights in fit() if re-weighting is needed.
        ('xgb', xgb.XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,
            colsample_bytree=0.8, random_state=RANDOM_STATE, n_jobs=-1)),
        ('lgb', lgb.LGBMClassifier(
            n_estimators=200, learning_rate=0.1, class_weight='balanced',
            subsample=0.8, colsample_bytree=0.8,
            random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)),
        ('knn', KNeighborsClassifier(n_neighbors=7)),
        ('lda', LinearDiscriminantAnalysis()),
    ],
    voting='soft',
    n_jobs=-1,
)
|
| 156 |
+
|
| 157 |
+
sv_cv = evaluate_cv(soft_voting, X_train_scaled, y_train, cv_strategy, name='Soft Voting')
|
| 158 |
+
print(f'Soft Voting CV F1: {sv_cv["F1_mean"]:.4f} +/- {sv_cv["F1_std"]:.4f}')
|
| 159 |
+
|
| 160 |
+
all_results = [champ_cv, sv_cv]
|
| 161 |
+
results_df = (
|
| 162 |
+
pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
|
| 163 |
+
for r in all_results])
|
| 164 |
+
.sort_values('F1_mean', ascending=False)
|
| 165 |
+
.reset_index(drop=True)
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
print('5-FOLD CROSS-VALIDATION SUMMARY')
|
| 169 |
+
print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
|
| 170 |
+
'Precision_mean','Recall_mean']].to_string(index=False))
|
| 171 |
+
|
| 172 |
+
# Statistical Significance Test (t-test)
|
| 173 |
+
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """Paired t-test on per-fold scores with a corrected variance term.

    The sample variance of the fold-wise score differences is scaled by
    (1/k + n_test/n_train) before the t statistic is formed; the two-sided
    p-value is taken from a Student t distribution with k - 1 degrees of
    freedom, where k is the number of folds.

    Args:
        scores_a: Per-fold scores of the first model (array-like, length k).
        scores_b: Per-fold scores of the second model on the same folds.
        n_train: Number of training samples per fold.
        n_test: Number of test samples per fold.

    Returns:
        Tuple (t_stat, p_value) of plain Python floats. Returns (nan, nan)
        when fewer than two folds are given, and (0.0, 1.0) when both score
        vectors are identical.
    """
    # Accept lists/tuples as well as numpy arrays.
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)

    k = len(scores_a)
    if k < 2:
        # A variance with ddof=1 is undefined for fewer than two folds.
        return float('nan'), float('nan')

    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    var_corr = (1 / k + n_test / n_train) * s_sq

    if var_corr == 0:
        # All fold differences are equal. Identical scores mean "no
        # difference at all" rather than the 0/0 = NaN the plain formula
        # would produce; a constant non-zero gap is infinitely significant.
        if d_bar == 0:
            return 0.0, 1.0
        return float(np.copysign(np.inf, d_bar)), 0.0

    t_stat = d_bar / np.sqrt(var_corr)
    # sf() is the survival function (1 - cdf) and keeps precision in the tails.
    p_value = 2 * stats.t.sf(abs(t_stat), df=k - 1)
    return float(t_stat), float(p_value)
|
| 182 |
+
|
| 183 |
+
n_total = len(X_train_scaled)
|
| 184 |
+
n_test_fold = n_total // N_SPLITS
|
| 185 |
+
n_train_fold = n_total - n_test_fold
|
| 186 |
+
|
| 187 |
+
result_map = {r['Model']: r['_f1_scores'] for r in all_results}
|
| 188 |
+
champ_scores = result_map['A4 Champion – Random Forest']
|
| 189 |
+
|
| 190 |
+
print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
|
| 191 |
+
for r in all_results:
|
| 192 |
+
if 'Champion' in r['Model']:
|
| 193 |
+
continue
|
| 194 |
+
t, p = corrected_resampled_ttest(
|
| 195 |
+
r['_f1_scores'], champ_scores, n_train_fold, n_test_fold
|
| 196 |
+
)
|
| 197 |
+
print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
|
| 198 |
+
|
| 199 |
+
# unnecessary eval on the test set?
|
| 200 |
+
model_objects = {
|
| 201 |
+
'Soft Voting' : soft_voting,
|
| 202 |
+
'A4 Champion – Random Forest': rf_champion,
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
best_name = results_df.iloc[0]['Model']
|
| 206 |
+
best_model = model_objects[best_name]
|
| 207 |
+
|
| 208 |
+
print(f'CHAMPION ENSEMBLE: {best_name}')
|
| 209 |
+
print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
|
| 210 |
+
|
| 211 |
+
best_model.fit(X_train_scaled, y_train)
|
| 212 |
+
y_pred_best = best_model.predict(X_test_scaled)
|
| 213 |
+
|
| 214 |
+
test_f1 = f1_score(y_test, y_pred_best, average='weighted')
|
| 215 |
+
test_acc = accuracy_score(y_test, y_pred_best)
|
| 216 |
+
test_prec = precision_score(y_test, y_pred_best, average='weighted', zero_division=0)
|
| 217 |
+
test_rec = recall_score(y_test, y_pred_best, average='weighted', zero_division=0)
|
| 218 |
+
improvement = (test_f1 - CHAMPION_F1) / CHAMPION_F1 * 100
|
| 219 |
+
|
| 220 |
+
print('\n TEST SET RESULTS')
|
| 221 |
+
print(f'F1-Score (weighted) : {test_f1:.4f}')
|
| 222 |
+
print(f'Accuracy : {test_acc:.4f}')
|
| 223 |
+
print(f'Precision : {test_prec:.4f}')
|
| 224 |
+
print(f'Recall : {test_rec:.4f}')
|
| 225 |
+
print(f'\n A4 original champion F1 : {CHAMPION_F1:.4f}')
|
| 226 |
+
|
| 227 |
+
test_rows = []
|
| 228 |
+
for name, model in model_objects.items():
|
| 229 |
+
model.fit(X_train_scaled, y_train)
|
| 230 |
+
preds = model.predict(X_test_scaled)
|
| 231 |
+
test_rows.append({
|
| 232 |
+
'Model' : name,
|
| 233 |
+
'Test_F1' : f1_score(y_test, preds, average='weighted'),
|
| 234 |
+
'Test_Acc' : accuracy_score(y_test, preds),
|
| 235 |
+
'Test_Prec' : precision_score(y_test, preds, average='weighted', zero_division=0),
|
| 236 |
+
'Test_Recall': recall_score(y_test, preds, average='weighted', zero_division=0),
|
| 237 |
+
})
|
| 238 |
+
|
| 239 |
+
test_results_df = pd.DataFrame(test_rows).sort_values('Test_F1', ascending=False)
|
| 240 |
+
print('TEST SET COMPARISON – ALL MODELS')
|
| 241 |
+
print(test_results_df.to_string(index=False))
|
| 242 |
+
|
| 243 |
+
print(f'CLASSIFICATION REPORT: {best_name}')
|
| 244 |
+
print(classification_report(y_test, y_pred_best, zero_division=0))
|
| 245 |
+
|
| 246 |
+
# save model
|
| 247 |
+
artifact = {
|
| 248 |
+
'model' : best_model,
|
| 249 |
+
'model_name' : best_name,
|
| 250 |
+
'scaler' : scaler,
|
| 251 |
+
'feature_columns' : feature_columns,
|
| 252 |
+
'cv_metrics': {
|
| 253 |
+
'f1_mean' : float(results_df.iloc[0]['F1_mean']),
|
| 254 |
+
'f1_std' : float(results_df.iloc[0]['F1_std']),
|
| 255 |
+
'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
|
| 256 |
+
},
|
| 257 |
+
'test_metrics': {
|
| 258 |
+
'f1' : float(test_f1),
|
| 259 |
+
'accuracy' : float(test_acc),
|
| 260 |
+
'precision': float(test_prec),
|
| 261 |
+
'recall' : float(test_rec),
|
| 262 |
+
},
|
| 263 |
+
'a4_champion_f1' : CHAMPION_F1,
|
| 264 |
+
'improvement_pct': float(improvement),
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
|
| 268 |
+
with open(out_path, 'wb') as f:
|
| 269 |
+
pickle.dump(artifact, f)
|
| 270 |
+
|
| 271 |
+
print(f'Saved: {out_path}')
|
A5b/cv_baseline.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
import warnings
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
import seaborn as sns
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from scipy import stats
|
| 10 |
+
|
| 11 |
+
from sklearn.model_selection import (
|
| 12 |
+
StratifiedKFold, cross_validate
|
| 13 |
+
)
|
| 14 |
+
from sklearn.preprocessing import StandardScaler
|
| 15 |
+
from sklearn.metrics import (
|
| 16 |
+
accuracy_score, precision_score, recall_score, f1_score,
|
| 17 |
+
classification_report, confusion_matrix
|
| 18 |
+
)
|
| 19 |
+
from sklearn.linear_model import LogisticRegression
|
| 20 |
+
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
| 21 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 22 |
+
from sklearn.naive_bayes import GaussianNB
|
| 23 |
+
from sklearn.ensemble import (
|
| 24 |
+
RandomForestClassifier,
|
| 25 |
+
VotingClassifier,
|
| 26 |
+
BaggingClassifier,
|
| 27 |
+
StackingClassifier,
|
| 28 |
+
)
|
| 29 |
+
import xgboost as xgb
|
| 30 |
+
import lightgbm as lgb
|
| 31 |
+
warnings.filterwarnings('ignore')
|
| 32 |
+
np.random.seed(42)
|
| 33 |
+
|
| 34 |
+
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
|
| 35 |
+
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
|
| 36 |
+
OUT_DIR = Path('models')
|
| 37 |
+
OUT_DIR.mkdir(exist_ok=True)
|
| 38 |
+
|
| 39 |
+
RANDOM_STATE = 42
|
| 40 |
+
N_SPLITS = 5
|
| 41 |
+
CHAMPION_F1 = 0.6110 # Score from A4
|
| 42 |
+
|
| 43 |
+
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
|
| 44 |
+
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
|
| 45 |
+
|
| 46 |
+
print('Movement features shape:', movement_features_df.shape)
|
| 47 |
+
print('Weak link scores shape:', weaklink_scores_df.shape)
|
| 48 |
+
|
| 49 |
+
DUPLICATE_NASM_COLS = [
|
| 50 |
+
'No_1_NASM_Deviation',
|
| 51 |
+
'No_2_NASM_Deviation',
|
| 52 |
+
'No_3_NASM_Deviation',
|
| 53 |
+
'No_4_NASM_Deviation',
|
| 54 |
+
'No_5_NASM_Deviation',
|
| 55 |
+
]
|
| 56 |
+
|
| 57 |
+
movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
|
| 58 |
+
print('Shape after duplicate removal:', movement_features_df.shape)
|
| 59 |
+
|
| 60 |
+
weaklink_categories = [
|
| 61 |
+
'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
|
| 62 |
+
'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
|
| 63 |
+
'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
|
| 64 |
+
'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
|
| 65 |
+
'RightKneeMovesOutward', 'RightShoulderElevation',
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
weaklink_scores_df['WeakestLink'] = (
|
| 69 |
+
weaklink_scores_df[weaklink_categories].idxmax(axis=1)
|
| 70 |
+
)
|
| 71 |
+
print('Weakest Link class distribution:')
|
| 72 |
+
print(weaklink_scores_df['WeakestLink'].value_counts())
|
| 73 |
+
|
| 74 |
+
# Merge Datasets
|
| 75 |
+
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
|
| 76 |
+
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
|
| 77 |
+
print('Merged dataset shape:', merged_df.shape)
|
| 78 |
+
|
| 79 |
+
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
|
| 80 |
+
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
|
| 81 |
+
|
| 82 |
+
X = merged_df[feature_columns].values
|
| 83 |
+
y = merged_df['WeakestLink'].values
|
| 84 |
+
|
| 85 |
+
print(f'Feature matrix shape : {X.shape}')
|
| 86 |
+
print(f'Number of features : {len(feature_columns)}')
|
| 87 |
+
print(f'Number of classes : {len(np.unique(y))}')
|
| 88 |
+
|
| 89 |
+
scaler = StandardScaler()
|
| 90 |
+
X_scaled = scaler.fit_transform(X)
|
| 91 |
+
|
| 92 |
+
print(f'Total samples : {X.shape[0]}')
|
| 93 |
+
|
| 94 |
+
cv_strategy = StratifiedKFold(
|
| 95 |
+
n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
def evaluate_cv(model, X, y, cv, name='Model'):
    """Cross-validate `model` and summarise its classification metrics.

    Calls cross_validate with accuracy plus weighted F1, precision and
    recall scorers, then condenses the per-fold results into one flat dict.

    Args:
        model: Estimator handed to cross_validate.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation splitter.
        name: Label stored under the 'Model' key.

    Returns:
        Dict with the model label, mean/std of accuracy and F1, mean
        precision and recall, and the raw per-fold F1 array ('_f1_scores').
    """
    metric_scorers = {
        'accuracy': 'accuracy',
        'f1': 'f1_weighted',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
    }
    fold_scores = cross_validate(model, X, y, cv=cv, scoring=metric_scorers)

    accuracy_per_fold = fold_scores['test_accuracy']
    f1_per_fold = fold_scores['test_f1']

    # Key order is kept as-is: it determines the column order of the
    # DataFrame later built from these summaries.
    summary = {'Model': name}
    summary['Accuracy_mean'] = accuracy_per_fold.mean()
    summary['Accuracy_std'] = accuracy_per_fold.std()
    summary['F1_mean'] = f1_per_fold.mean()
    summary['F1_std'] = f1_per_fold.std()
    summary['Precision_mean'] = fold_scores['test_precision'].mean()
    summary['Recall_mean'] = fold_scores['test_recall'].mean()
    summary['_f1_scores'] = f1_per_fold
    return summary
|
| 116 |
+
|
| 117 |
+
rf_champion = RandomForestClassifier(
|
| 118 |
+
n_estimators=200, max_depth=15,
|
| 119 |
+
min_samples_split=5, min_samples_leaf=2,
|
| 120 |
+
class_weight='balanced',
|
| 121 |
+
random_state=RANDOM_STATE, n_jobs=-1
|
| 122 |
+
)
|
| 123 |
+
champ_cv = evaluate_cv(
|
| 124 |
+
rf_champion, X_scaled, y, cv_strategy,
|
| 125 |
+
name='A4 Champion – Random Forest'
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
# Soft-voting ensemble over six heterogeneous base learners.
soft_voting = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(
            n_estimators=200, max_depth=15, min_samples_split=5,
            min_samples_leaf=2, class_weight='balanced_subsample',
            random_state=RANDOM_STATE, n_jobs=-1)),
        ('lr', LogisticRegression(
            max_iter=1000, class_weight='balanced',
            random_state=RANDOM_STATE)),
        # NOTE: XGBClassifier has no `class_weight` parameter; the keyword
        # that used to be passed here was ignored by the booster, so it has
        # been removed. Use sample weights in fit() if re-weighting is needed.
        ('xgb', xgb.XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,
            colsample_bytree=0.8, random_state=RANDOM_STATE, n_jobs=-1)),
        ('lgb', lgb.LGBMClassifier(
            n_estimators=200, learning_rate=0.1, class_weight='balanced',
            subsample=0.8, colsample_bytree=0.8,
            random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)),
        ('knn', KNeighborsClassifier(n_neighbors=7)),
        ('lda', LinearDiscriminantAnalysis()),
    ],
    voting='soft',
    n_jobs=-1,
)
|
| 143 |
+
|
| 144 |
+
sv_cv = evaluate_cv(soft_voting, X_scaled, y, cv_strategy, name='Soft Voting')
|
| 145 |
+
|
| 146 |
+
all_results = [champ_cv, sv_cv]
|
| 147 |
+
results_df = (
|
| 148 |
+
pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
|
| 149 |
+
for r in all_results])
|
| 150 |
+
.sort_values('F1_mean', ascending=False)
|
| 151 |
+
.reset_index(drop=True)
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
print('5-FOLD CROSS-VALIDATION SUMMARY')
|
| 155 |
+
print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
|
| 156 |
+
'Precision_mean','Recall_mean']].to_string(index=False))
|
| 157 |
+
|
| 158 |
+
# Statistical Significance Test (t-test)
|
| 159 |
+
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """Paired t-test on per-fold scores with a corrected variance term.

    The sample variance of the fold-wise score differences is scaled by
    (1/k + n_test/n_train) before the t statistic is formed; the two-sided
    p-value is taken from a Student t distribution with k - 1 degrees of
    freedom, where k is the number of folds.

    Args:
        scores_a: Per-fold scores of the first model (array-like, length k).
        scores_b: Per-fold scores of the second model on the same folds.
        n_train: Number of training samples per fold.
        n_test: Number of test samples per fold.

    Returns:
        Tuple (t_stat, p_value) of plain Python floats. Returns (nan, nan)
        when fewer than two folds are given, and (0.0, 1.0) when both score
        vectors are identical.
    """
    # Accept lists/tuples as well as numpy arrays.
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)

    k = len(scores_a)
    if k < 2:
        # A variance with ddof=1 is undefined for fewer than two folds.
        return float('nan'), float('nan')

    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    var_corr = (1 / k + n_test / n_train) * s_sq

    if var_corr == 0:
        # All fold differences are equal. Identical scores mean "no
        # difference at all" rather than the 0/0 = NaN the plain formula
        # would produce; a constant non-zero gap is infinitely significant.
        if d_bar == 0:
            return 0.0, 1.0
        return float(np.copysign(np.inf, d_bar)), 0.0

    t_stat = d_bar / np.sqrt(var_corr)
    # sf() is the survival function (1 - cdf) and keeps precision in the tails.
    p_value = 2 * stats.t.sf(abs(t_stat), df=k - 1)
    return float(t_stat), float(p_value)
|
| 168 |
+
|
| 169 |
+
n_total = len(X_scaled)
|
| 170 |
+
n_test_fold = n_total // N_SPLITS
|
| 171 |
+
n_train_fold = n_total - n_test_fold
|
| 172 |
+
|
| 173 |
+
result_map = {r['Model']: r['_f1_scores'] for r in all_results}
|
| 174 |
+
champ_scores = result_map['A4 Champion – Random Forest']
|
| 175 |
+
|
| 176 |
+
print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
|
| 177 |
+
for r in all_results:
|
| 178 |
+
if 'Champion' in r['Model']:
|
| 179 |
+
continue
|
| 180 |
+
t, p = corrected_resampled_ttest(
|
| 181 |
+
r['_f1_scores'], champ_scores, n_train_fold, n_test_fold
|
| 182 |
+
)
|
| 183 |
+
print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
|
| 184 |
+
|
| 185 |
+
# Save model using cross-validation (fit on all data)
|
| 186 |
+
model_objects = {
|
| 187 |
+
'Soft Voting' : soft_voting,
|
| 188 |
+
'A4 Champion – Random Forest': rf_champion,
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
best_name = results_df.iloc[0]['Model']
|
| 192 |
+
best_model = model_objects[best_name]
|
| 193 |
+
|
| 194 |
+
print(f'CHAMPION ENSEMBLE: {best_name}')
|
| 195 |
+
print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
|
| 196 |
+
|
| 197 |
+
# Fit best model on all data for final deployment
|
| 198 |
+
best_model.fit(X_scaled, y)
|
| 199 |
+
|
| 200 |
+
# Save model
|
| 201 |
+
artifact = {
|
| 202 |
+
'model' : best_model,
|
| 203 |
+
'model_name' : best_name,
|
| 204 |
+
'scaler' : scaler,
|
| 205 |
+
'feature_columns' : feature_columns,
|
| 206 |
+
'cv_metrics': {
|
| 207 |
+
'f1_mean' : float(results_df.iloc[0]['F1_mean']),
|
| 208 |
+
'f1_std' : float(results_df.iloc[0]['F1_std']),
|
| 209 |
+
'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
|
| 210 |
+
'precision_mean': float(results_df.iloc[0]['Precision_mean']),
|
| 211 |
+
'recall_mean' : float(results_df.iloc[0]['Recall_mean']),
|
| 212 |
+
},
|
| 213 |
+
'a4_champion_f1' : CHAMPION_F1,
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
|
| 217 |
+
with open(out_path, 'wb') as f:
|
| 218 |
+
pickle.dump(artifact, f)
|
| 219 |
+
|
| 220 |
+
print(f'Saved: {out_path}')
|
A5b/models/adaboost_classification.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d34c717b5f5dc02f4481f3207afcacb94ceb3ec69069589ca6abe435c8001470
|
| 3 |
+
size 725059
|
A5b/models/adaboost_feature_importance.png
ADDED
|
Git LFS Details
|
A5b/models/bagging_trees_champion.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2475e28a11e89c0c4544064525f6d41d7890e19c5549575ac597d64e076616e
|
| 3 |
+
size 6506122
|
A5b/models/ensemble_classification_champion.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93cecc19fe3e22c357af6ee6778990d7e3b518a36bbb6418a78ecb6795ef4cce
|
| 3 |
+
size 30798315
|