Bachstelze committed on
Commit
f5e4068
·
1 Parent(s): 54eac1a

test baseline with cv only

Browse files
A5b/classification_adaboost.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+ from typing import List, Tuple, Dict, Any
11
+
12
+ from sklearn.model_selection import (
13
+ train_test_split, StratifiedKFold, cross_validate
14
+ )
15
+ from sklearn.base import BaseEstimator, ClassifierMixin
16
+ from sklearn.preprocessing import StandardScaler
17
+ from sklearn.metrics import (
18
+ accuracy_score, precision_score, recall_score, f1_score,
19
+ classification_report, confusion_matrix
20
+ )
21
+ from sklearn.tree import DecisionTreeClassifier
22
+ from sklearn.ensemble import (
23
+ RandomForestClassifier,
24
+ VotingClassifier,
25
+ BaggingClassifier,
26
+ StackingClassifier,
27
+ )
28
+ import xgboost as xgb
29
+ import lightgbm as lgb
30
+
31
+ warnings.filterwarnings('ignore')
32
+ np.random.seed(42)
33
+
34
+ REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
35
+ DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
36
+ OUT_DIR = Path('models')
37
+ OUT_DIR.mkdir(exist_ok=True)
38
+
39
+ RANDOM_STATE = 42
40
+ N_SPLITS = 5
41
+ CHAMPION_F1 = 0.6110 # Score from A4
42
+
43
+
44
class WeightedDecisionTree(DecisionTreeClassifier):
    """
    Thin subclass of ``DecisionTreeClassifier`` that pins down the small set
    of hyper-parameters used by the boosting loop and forwards the per-sample
    ``sample_weight`` vector to the parent ``fit``, so each tree is grown on
    the weighted training error.
    """

    def __init__(self, max_depth: int = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, random_state: int = 42):
        # Hand every hyper-parameter straight to the sklearn base class;
        # it stores them as attributes, which keeps get_params()/clone()
        # (and therefore cross_validate) working on this subclass.
        super().__init__(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )

    def fit(self, X, y, sample_weight=None):
        """Grow the tree on (X, y), weighting each sample by ``sample_weight``."""
        return super().fit(X, y, sample_weight=sample_weight)
61
+
62
+
63
class AdaBoostEnsemble(BaseEstimator, ClassifierMixin):
    """
    From-scratch AdaBoost ensemble of decision trees where each tree is grown
    based on weighted training errors. Weights are updated based on the error
    of previous trees.

    The algorithm:
    1. Initialize equal weights for all training samples
    2. For each tree in the ensemble:
       - Train a decision tree on weighted data
       - Calculate weighted error rate
       - Compute tree weight (alpha)
       - Update sample weights (increase for misclassified samples)
       - Normalize weights
    3. Make predictions using weighted voting

    Parameters
    ----------
    n_estimators : number of boosting rounds (trees).
    max_depth, min_samples_split, min_samples_leaf : per-tree hyper-parameters.
    random_state : base seed; round ``i`` uses ``random_state + i``.
    """

    def __init__(
        self,
        n_estimators: int = 50,
        max_depth: int = 5,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        random_state: int = 42
    ):
        # sklearn convention: __init__ only stores hyper-parameters so that
        # clone()/get_params() work; fitted state is (re)created in fit().
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.trees: List[WeightedDecisionTree] = []
        self.tree_weights: List[float] = []
        self.n_classes: int = 0
        self.classes_: np.ndarray = None

    def _initialize_weights(self, n_samples: int) -> np.ndarray:
        """Uniform initial distribution: every sample weighs 1/n_samples."""
        return np.ones(n_samples) / n_samples

    def _update_weights(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray,
        alpha: float
    ) -> np.ndarray:
        """
        Update sample weights based on prediction errors.

        Misclassified samples are multiplied by exp(alpha); correctly
        classified samples are left unchanged (exp(0)), and the whole
        vector is renormalised to sum to 1, so the relative emphasis on
        mistakes grows each round.
        """
        misclassified = y_true != y_pred
        updated_weights = weights * np.exp(alpha * misclassified.astype(float))

        # Normalize weights so they remain a probability distribution.
        return updated_weights / updated_weights.sum()

    def _compute_weighted_error(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray
    ) -> float:
        """Weighted misclassification rate of one tree under `weights`."""
        misclassified = (y_true != y_pred).astype(float)
        return np.sum(weights * misclassified) / np.sum(weights)

    def _compute_alpha(self, error: float) -> float:
        """
        Compute the voting weight (alpha) of one tree from its weighted error.

        Clamped at +/-10 to avoid log(0) / division by zero for perfect or
        completely wrong classifiers.
        """
        if error <= 0:
            return 10.0  # Very high weight for perfect classifier
        if error >= 1:
            return -10.0  # Very negative weight for completely wrong classifier
        return 0.5 * np.log((1 - error) / error)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'AdaBoostEnsemble':
        """Fit the AdaBoost ensemble to (X, y); returns self."""
        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)

        # BUG FIX: reset fitted state here. Previously the lists were only
        # created in __init__, so calling fit() a second time (as the driver
        # script does) appended another n_estimators trees to the already
        # fitted ensemble instead of retraining from scratch.
        self.trees = []
        self.tree_weights = []

        # Initialize sample weights
        weights = self._initialize_weights(n_samples)

        for i in range(self.n_estimators):
            # Create and train decision tree with current weights;
            # the seed varies per round so trees can differ on tied splits.
            tree = WeightedDecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state + i
            )
            tree.fit(X, y, sample_weight=weights)

            # Make predictions on the training data to score this round.
            y_pred = tree.predict(X)

            # Calculate weighted error
            error = self._compute_weighted_error(weights, y, y_pred)

            # Compute tree weight (alpha)
            alpha = self._compute_alpha(error)

            # Update sample weights
            weights = self._update_weights(weights, y, y_pred, alpha)

            # Store tree and its weight
            self.trees.append(tree)
            self.tree_weights.append(alpha)

            print(f"Tree {i+1}/{self.n_estimators}: Error={error:.4f}, Alpha={alpha:.4f}")

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels using signed-alpha weighted voting over all trees."""
        # Get predictions from all trees at once: shape (n_trees, n_samples).
        all_predictions = np.array([tree.predict(X) for tree in self.trees])

        classes = self.classes_
        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        # Each tree adds its (possibly negative) alpha to the class it voted for.
        for tree_idx, tree in enumerate(self.trees):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]

            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += alpha * (predictions == class_label)

        # Return class with highest weighted vote
        return classes[np.argmax(weighted_votes, axis=1)]

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class "probabilities" as normalised |alpha|-weighted vote
        shares. NOTE: uses |alpha| (unlike predict(), which uses signed
        alpha), so each row sums to 1 but the argmax can differ from
        predict() when some alpha is negative.
        """
        all_predictions = np.array([tree.predict(X) for tree in self.trees])

        classes = self.classes_
        n_samples = X.shape[0]
        weighted_votes = np.zeros((n_samples, len(classes)))

        total_weight = sum(abs(w) for w in self.tree_weights)

        for tree_idx, tree in enumerate(self.trees):
            alpha = self.tree_weights[tree_idx]
            predictions = all_predictions[tree_idx]

            for class_idx, class_label in enumerate(classes):
                weighted_votes[:, class_idx] += abs(alpha) * (predictions == class_label)

        # Normalize to get probabilities
        return weighted_votes / total_weight
226
+
227
+
228
def evaluate_cv(model, X, y, cv, name='Model'):
    """
    Run k-fold cross-validation on `model` and summarise weighted
    classification metrics into a flat result dict. The raw per-fold F1
    scores are kept under the private key '_f1_scores' for the later
    significance tests.
    """
    scoring = {
        'accuracy' : 'accuracy',
        'f1' : 'f1_weighted',
        'precision': 'precision_weighted',
        'recall' : 'recall_weighted',
    }
    fold_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

    accuracy_scores = fold_results['test_accuracy']
    f1_scores = fold_results['test_f1']

    summary = {
        'Model' : name,
        'Accuracy_mean' : accuracy_scores.mean(),
        'Accuracy_std' : accuracy_scores.std(),
        'F1_mean' : f1_scores.mean(),
        'F1_std' : f1_scores.std(),
        'Precision_mean': fold_results['test_precision'].mean(),
        'Recall_mean' : fold_results['test_recall'].mean(),
        '_f1_scores' : f1_scores,
    }
    return summary
247
+
248
+
249
# Load data
# Movement features and weak-link scores are joined later on the shared 'ID'.
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)

# Columns dropped from the feature table.
# NOTE(review): presumably these duplicate/encode the NASM deviation target
# and would leak label information — confirm against the dataset docs.
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]

movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
print('Shape after duplicate removal:', movement_features_df.shape)

# The 14 candidate weak-link score columns; the classification target is the
# column with the highest score per row.
weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]

# Target: name of the highest-scoring weak-link category per row
# (idxmax breaks ties by taking the first column).
weaklink_scores_df['WeakestLink'] = (
    weaklink_scores_df[weaklink_categories].idxmax(axis=1)
)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())

# Merge Datasets (inner join: rows present in both tables only).
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)

# Everything except the ID, the target and the score estimate is a feature.
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]

X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values

print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')

# Split data — stratified 80/20 so class proportions are preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Scaler is fit on the training split only; the test split is transformed
# with the training statistics (no test leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training samples : {X_train.shape[0]}')
print(f'Test samples : {X_test.shape[0]}')

# Shared CV splitter so every model is evaluated on identical folds.
cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)
311
+
312
# Train AdaBoost ensemble
print("\n" + "="*60)
print("TRAINING ADABOOST ENSEMBLE")
print("="*60)

adaboost_model = AdaBoostEnsemble(
    n_estimators=50,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_STATE
)

# Fit once on the training split. cross_validate() below works on clones of
# the estimator, so this fitted model remains valid for the test evaluation.
adaboost_model.fit(X_train_scaled, y_train)

# Cross-validation
adaboost_cv = evaluate_cv(
    adaboost_model, X_train_scaled, y_train, cv_strategy,
    name='AdaBoost Ensemble'
)

# Test set evaluation
# BUG FIX: the original called adaboost_model.fit() a second time here.
# Because AdaBoostEnsemble.fit() appended to self.trees instead of resetting,
# that second call silently doubled the ensemble to 100 trees before the
# test-set predictions were made. A single fit is both correct and cheaper.
y_pred_adaboost = adaboost_model.predict(X_test_scaled)

# Weighted averages account for the class imbalance seen in value_counts().
test_f1_adaboost = f1_score(y_test, y_pred_adaboost, average='weighted')
test_acc_adaboost = accuracy_score(y_test, y_pred_adaboost)
test_prec_adaboost = precision_score(y_test, y_pred_adaboost, average='weighted', zero_division=0)
test_rec_adaboost = recall_score(y_test, y_pred_adaboost, average='weighted', zero_division=0)

print("\n" + "="*60)
print("ADABOOST RESULTS")
print("="*60)
print(f'CV F1: {adaboost_cv["F1_mean"]:.4f} +/- {adaboost_cv["F1_std"]:.4f}')
print(f'Test F1: {test_f1_adaboost:.4f}')
print(f'Test Accuracy: {test_acc_adaboost:.4f}')
print(f'Test Precision: {test_prec_adaboost:.4f}')
print(f'Test Recall: {test_rec_adaboost:.4f}')
350
+
351
# Compare with baseline models
# Same hyper-parameters as the A4 champion Random Forest (see CHAMPION_F1),
# so the comparison is against the prior best configuration.
rf_champion = RandomForestClassifier(
    n_estimators=200, max_depth=15,
    min_samples_split=5, min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_STATE, n_jobs=-1
)

# CV on the same folds as the AdaBoost model for a fair comparison.
rf_cv = evaluate_cv(
    rf_champion, X_train_scaled, y_train, cv_strategy,
    name='Random Forest (Baseline)'
)

# Refit on the full training split, then score the held-out test split.
rf_champion.fit(X_train_scaled, y_train)
y_pred_rf = rf_champion.predict(X_test_scaled)
test_f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print("\n" + "="*60)
print("COMPARISON WITH BASELINE")
print("="*60)
print(f'Random Forest CV F1: {rf_cv["F1_mean"]:.4f} +/- {rf_cv["F1_std"]:.4f}')
print(f'Random Forest Test F1: {test_f1_rf:.4f}')
373
+
374
# Statistical significance test
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """
    Corrected resampled t-test (Nadeau & Bengio, 2003) for comparing two
    paired arrays of k cross-validation scores.

    The naive paired t-test is overconfident because CV folds share training
    data; the (1/k + n_test/n_train) factor corrects the variance estimate.

    Parameters
    ----------
    scores_a, scores_b : array-like of k paired fold scores.
    n_train, n_test    : fold training/test sizes used in the correction.

    Returns
    -------
    (t_stat, p_value) as plain floats (two-sided p, Student-t with k-1 df).
    """
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)
    k = len(scores_a)
    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    # BUG FIX: when the fold scores are identical (or differ by a constant),
    # s_sq == 0 and the original divided by zero, returning NaN for both
    # statistics. Handle the degenerate cases explicitly instead.
    if s_sq == 0:
        if d_bar == 0:
            return 0.0, 1.0          # no difference at all
        return float(np.sign(d_bar) * np.inf), 0.0  # constant, nonzero shift
    var_corr = (1 / k + n_test / n_train) * s_sq
    t_stat = d_bar / np.sqrt(var_corr)
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k - 1))
    return float(t_stat), float(p_value)
384
+
385
# Approximate per-fold train/test sizes for the corrected t-test.
# NOTE(review): integer division is an approximation when n_total is not a
# multiple of N_SPLITS — acceptable for the variance correction factor.
n_total = len(X_train_scaled)
n_test_fold = n_total // N_SPLITS
n_train_fold = n_total - n_test_fold

# Per-fold F1 scores collected by evaluate_cv (paired across the same folds).
result_map = {
    'AdaBoost Ensemble': adaboost_cv['_f1_scores'],
    'Random Forest': rf_cv['_f1_scores']
}

adaboost_scores = result_map['AdaBoost Ensemble']
rf_scores = result_map['Random Forest']

t, p = corrected_resampled_ttest(adaboost_scores, rf_scores, n_train_fold, n_test_fold)
print(f"\nStatistical Test (AdaBoost vs Random Forest):")
print(f" t-statistic: {t:+.3f}")
print(f" p-value: {p:.4f}")
print(f" Significant at α=0.05: {'Yes' if p < 0.05 else 'No'}")

# Save model
# Bundle everything needed to reuse the model offline: the fitted estimator,
# the fitted scaler, the exact feature column order and the headline metrics.
artifact = {
    'model' : adaboost_model,
    'model_name' : 'AdaBoost Ensemble',
    'scaler' : scaler,
    'feature_columns' : feature_columns,
    'cv_metrics': {
        'f1_mean' : float(adaboost_cv['F1_mean']),
        'f1_std' : float(adaboost_cv['F1_std']),
        'accuracy_mean': float(adaboost_cv['Accuracy_mean']),
    },
    'test_metrics': {
        'f1' : float(test_f1_adaboost),
        'accuracy' : float(test_acc_adaboost),
        'precision': float(test_prec_adaboost),
        'recall' : float(test_rec_adaboost),
    },
    'a4_champion_f1' : CHAMPION_F1,
    # Relative improvement (%) over the A4 champion's F1.
    'improvement_pct': float((test_f1_adaboost - CHAMPION_F1) / CHAMPION_F1 * 100),
}

out_path = OUT_DIR / 'adaboost_classification.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(artifact, f)

print(f'\nSaved model to: {out_path}')

# Classification report (per-class precision/recall/F1 on the test split).
print('\nCLASSIFICATION REPORT: AdaBoost Ensemble')
print(classification_report(y_test, y_pred_adaboost, zero_division=0))

# Feature importance analysis (simplified)
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Calculate feature importance as the unweighted average of the per-tree
# sklearn impurity importances.
# NOTE(review): this ignores each tree's alpha; an alpha-weighted average
# would reflect the ensemble's actual voting — confirm which is intended.
all_importances = np.zeros(len(feature_columns))
for tree in adaboost_model.trees:
    all_importances += tree.feature_importances_

avg_importances = all_importances / len(adaboost_model.trees)
importance_df = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': avg_importances
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10).to_string(index=False))

# Plot feature importance (top 15, highest at the top after axis inversion).
plt.figure(figsize=(12, 8))
top_features = importance_df.head(15)
plt.barh(range(len(top_features)), top_features['Importance'].values)
plt.yticks(range(len(top_features)), top_features['Feature'].values)
plt.xlabel('Average Feature Importance')
plt.ylabel('Features')
plt.title('Top 15 Feature Importance - AdaBoost Ensemble')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(OUT_DIR / 'adaboost_feature_importance.png', dpi=150)
plt.close()

print(f"\nSaved feature importance plot to: {OUT_DIR / 'adaboost_feature_importance.png'}")
A5b/classification_bagging_trees.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+
11
+ from sklearn.model_selection import StratifiedKFold, cross_validate
12
+ from sklearn.preprocessing import StandardScaler
13
+ from sklearn.metrics import (
14
+ accuracy_score, precision_score, recall_score, f1_score,
15
+ classification_report, confusion_matrix
16
+ )
17
+ from sklearn.tree import DecisionTreeClassifier
18
+ from sklearn.ensemble import BaggingClassifier
19
+ import xgboost as xgb
20
+ import lightgbm as lgb
21
+
22
# Global setup: silence library warnings and fix the NumPy seed for
# reproducibility (model seeds are set separately via RANDOM_STATE).
warnings.filterwarnings('ignore')
np.random.seed(42)

# Paths: data lives one directory above the script; models/ holds artifacts.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
OUT_DIR = Path('models')
OUT_DIR.mkdir(exist_ok=True)

RANDOM_STATE = 42
N_SPLITS = 5
CHAMPION_F1 = 0.6110 # Score from A4

# Same loading/preprocessing pipeline as classification_adaboost.py.
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))

print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)

# Dropped feature columns (presumably target-leaking NASM deviation columns
# — TODO confirm against the dataset docs).
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]

movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
print('Shape after duplicate removal:', movement_features_df.shape)

# Candidate weak-link categories; the target is the per-row argmax column.
weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]

weaklink_scores_df['WeakestLink'] = (
    weaklink_scores_df[weaklink_categories].idxmax(axis=1)
)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())

# Merge Datasets
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)

EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]

X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values

print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')

# Encode string labels to integers for XGBoost/LightGBM compatibility
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# NOTE(review): the scaler is fit on the FULL dataset before cross-validation
# (unlike the adaboost script, which scales inside a train/test split), so
# each CV fold sees scaling statistics from its own test samples — a mild
# leak. Tree models are scale-invariant, so the impact here is minimal.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Shared CV splitter so all models below are scored on identical folds.
cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)
91
+
92
def evaluate_cv(model, X, y, cv, name='Model', use_encoded_labels=False):
    """Cross-validate `model` and summarise weighted classification metrics.

    NOTE(review): when use_encoded_labels=True the *y* argument is IGNORED
    and the module-level ``y_encoded`` is used instead. That only works
    because every caller passes the full (X_scaled, y) pair; verify before
    reusing this helper with a subset of the data.
    """
    scoring = {
        'accuracy' : 'accuracy',
        'f1' : 'f1_weighted',
        'precision': 'precision_weighted',
        'recall' : 'recall_weighted',
    }
    # XGBoost/LightGBM require integer class labels; sklearn models accept
    # the original strings.
    y_to_use = y_encoded if use_encoded_labels else y
    cv_res = cross_validate(model, X, y_to_use, cv=cv, scoring=scoring)
    return {
        'Model' : name,
        'Accuracy_mean' : cv_res['test_accuracy'].mean(),
        'Accuracy_std' : cv_res['test_accuracy'].std(),
        'F1_mean' : cv_res['test_f1'].mean(),
        'F1_std' : cv_res['test_f1'].std(),
        'Precision_mean': cv_res['test_precision'].mean(),
        'Recall_mean' : cv_res['test_recall'].mean(),
        # Raw per-fold F1 scores, kept for the paired significance tests.
        '_f1_scores' : cv_res['test_f1'],
    }
111
+
112
# Baseline: Single Decision Tree — the reference point every ensemble below
# should beat. Tree hyper-parameters are held constant across all variants
# so only the ensembling strategy differs.
single_tree = DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_STATE
)
single_tree_cv = evaluate_cv(
    single_tree, X_scaled, y, cv_strategy,
    name='Single Decision Tree'
)
print('SINGLE DECISION TREE')
print(f'CV F1: {single_tree_cv["F1_mean"]:.4f} +/- {single_tree_cv["F1_std"]:.4f}')

# Bagging with Decision Trees (default: uses all features)
bagging_default = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0, # Bootstrap sample size (100% of training data)
    max_features=1.0, # Use all features
    bootstrap=True,
    bootstrap_features=False, # Don't subsample features
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_default_cv = evaluate_cv(
    bagging_default, X_scaled, y, cv_strategy,
    name='Bagging (All Features)'
)
print(f'Bagging (All Features) CV F1: {bagging_default_cv["F1_mean"]:.4f} +/- {bagging_default_cv["F1_std"]:.4f}')

# Bagging with Decision Trees + Feature Subsetting (Random Subspace Method)
# This creates trees using random subsets of predictors
bagging_subspace = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0,
    max_features=0.7, # Use 70% of features for each tree
    bootstrap=True,
    bootstrap_features=True, # Subsample features for each tree
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_subspace_cv = evaluate_cv(
    bagging_subspace, X_scaled, y, cv_strategy,
    name='Bagging (70% Features)'
)
print(f'Bagging (70% Features) CV F1: {bagging_subspace_cv["F1_mean"]:.4f} +/- {bagging_subspace_cv["F1_std"]:.4f}')

# Bagging with smaller feature subset (50%)
bagging_50features = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0,
    max_features=0.5, # Use 50% of features for each tree
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_50features_cv = evaluate_cv(
    bagging_50features, X_scaled, y, cv_strategy,
    name='Bagging (50% Features)'
)
print(f'Bagging (50% Features) CV F1: {bagging_50features_cv["F1_mean"]:.4f} +/- {bagging_50features_cv["F1_std"]:.4f}')

# Bagging with even smaller feature subset (30%)
bagging_30features = BaggingClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    n_estimators=200,
    max_samples=1.0,
    max_features=0.3, # Use 30% of features for each tree
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=RANDOM_STATE
)
bagging_30features_cv = evaluate_cv(
    bagging_30features, X_scaled, y, cv_strategy,
    name='Bagging (30% Features)'
)
print(f'Bagging (30% Features) CV F1: {bagging_30features_cv["F1_mean"]:.4f} +/- {bagging_30features_cv["F1_std"]:.4f}')

# Compare with Random Forest (for reference)
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt', # sqrt(n_features) - standard random forest
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1
)
rf_cv = evaluate_cv(
    rf_model, X_scaled, y, cv_strategy,
    name='Random Forest (sqrt features)'
)
print(f'Random Forest CV F1: {rf_cv["F1_mean"]:.4f} +/- {rf_cv["F1_std"]:.4f}')

# Compare with XGBoost and LightGBM (for reference)
# NOTE(review): `class_weight` is not an XGBClassifier constructor parameter
# (that is the sklearn/LightGBM spelling); it is likely ignored here — check
# and consider `sample_weight` or `scale_pos_weight` instead.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    class_weight='balanced',
    n_jobs=-1,
    verbosity=0
)
xgb_cv = evaluate_cv(
    xgb_model, X_scaled, y, cv_strategy,
    name='XGBoost',
    use_encoded_labels=True
)
print(f'XGBoost CV F1: {xgb_cv["F1_mean"]:.4f} +/- {xgb_cv["F1_std"]:.4f}')

lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    class_weight='balanced',
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbosity=-1
)
lgb_cv = evaluate_cv(
    lgb_model, X_scaled, y, cv_strategy,
    name='LightGBM',
    use_encoded_labels=True
)
print(f'LightGBM CV F1: {lgb_cv["F1_mean"]:.4f} +/- {lgb_cv["F1_std"]:.4f}')
273
+
274
# Collect all results (each entry is the summary dict from evaluate_cv).
all_results = [
    single_tree_cv,
    bagging_default_cv,
    bagging_subspace_cv,
    bagging_50features_cv,
    bagging_30features_cv,
    rf_cv,
    xgb_cv,
    lgb_cv,
]

# Leaderboard sorted by mean CV F1; the private '_f1_scores' arrays are
# stripped because they are per-fold vectors, not summary columns.
results_df = (
    pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
                  for r in all_results])
    .sort_values('F1_mean', ascending=False)
    .reset_index(drop=True)
)

print('\n5-FOLD CROSS-VALIDATION SUMMARY')
print(results_df[['Model', 'F1_mean', 'F1_std', 'Accuracy_mean',
                  'Precision_mean', 'Recall_mean']].to_string(index=False))
296
+
297
# Statistical Significance Test (t-test)
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """
    Corrected resampled t-test (Nadeau & Bengio, 2003) for two paired
    arrays of k cross-validation scores.

    The (1/k + n_test/n_train) factor corrects the variance for the
    overlap between CV training sets. Returns (t_stat, p_value) as floats
    (two-sided p, Student-t with k-1 degrees of freedom).
    """
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)
    k = len(scores_a)
    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    # BUG FIX (kept consistent with classification_adaboost.py): zero
    # variance of the differences previously caused a division by zero and
    # NaN statistics; handle the degenerate cases explicitly.
    if s_sq == 0:
        if d_bar == 0:
            return 0.0, 1.0          # identical score vectors
        return float(np.sign(d_bar) * np.inf), 0.0  # constant nonzero shift
    var_corr = (1 / k + n_test / n_train) * s_sq
    t_stat = d_bar / np.sqrt(var_corr)
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k - 1))
    return float(t_stat), float(p_value)
307
+
308
# Approximate per-fold train/test sizes for the variance correction
# (integer division is an approximation when n_total % N_SPLITS != 0).
n_total = len(X_scaled)
n_test_fold = n_total // N_SPLITS
n_train_fold = n_total - n_test_fold

# Map model name -> per-fold F1 vector; all models share the same folds,
# so the scores are properly paired for the t-test.
result_map = {r['Model']: r['_f1_scores'] for r in all_results}
best_model_name = results_df.iloc[0]['Model']
best_scores = result_map[best_model_name]

print('\nSTATISTICAL SIGNIFICANCE TESTS vs Best Model')
for r in all_results:
    if r['Model'] == best_model_name:
        continue
    t, p = corrected_resampled_ttest(
        r['_f1_scores'], best_scores, n_train_fold, n_test_fold
    )
    print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')

# Save the best model
# Name -> (unfitted) estimator object; keys must match the `name=` strings
# passed to evaluate_cv above.
model_objects = {
    'Single Decision Tree': single_tree,
    'Bagging (All Features)': bagging_default,
    'Bagging (70% Features)': bagging_subspace,
    'Bagging (50% Features)': bagging_50features,
    'Bagging (30% Features)': bagging_30features,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
}

# NOTE(review): best_name duplicates best_model_name computed above.
best_name = results_df.iloc[0]['Model']
best_model = model_objects[best_name]

print(f'\nBEST MODEL: {best_name}')
print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')

# Train final model on all data
# NOTE(review): the final fit always uses y_encoded (integer labels), even
# for sklearn models whose CV above used the string labels y — so the saved
# model predicts integers and consumers must invert with label_encoder.
best_model.fit(X_scaled, y_encoded)

# Save model artifact — includes scaler, label encoder and feature order so
# the model can be applied to new data offline.
artifact = {
    'model' : best_model,
    'model_name' : best_name,
    'scaler' : scaler,
    'label_encoder' : label_encoder,
    'feature_columns' : feature_columns,
    'cv_metrics': {
        'f1_mean' : float(results_df.iloc[0]['F1_mean']),
        'f1_std' : float(results_df.iloc[0]['F1_std']),
        'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
    },
    'a4_champion_f1' : CHAMPION_F1,
}

out_path = OUT_DIR / 'bagging_trees_champion.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(artifact, f)

print(f'\nSaved: {out_path}')

# Print feature importances for the best ensemble model
# (BaggingClassifier has no feature_importances_, hence the hasattr guard).
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    indices = np.argsort(importances)[::-1]

    print(f'\nTop 10 Most Important Features ({best_name}):')
    for i in range(min(10, len(feature_columns))):
        print(f' {i+1}. {feature_columns[indices[i]]}: {importances[indices[i]]:.4f}')
A5b/classification_baseline.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+
11
+ from sklearn.model_selection import (
12
+ train_test_split, StratifiedKFold, cross_validate
13
+ )
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.metrics import (
16
+ accuracy_score, precision_score, recall_score, f1_score,
17
+ classification_report, confusion_matrix
18
+ )
19
+ from sklearn.linear_model import LogisticRegression
20
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
21
+ from sklearn.neighbors import KNeighborsClassifier
22
+ from sklearn.naive_bayes import GaussianNB
23
+ from sklearn.ensemble import (
24
+ RandomForestClassifier,
25
+ VotingClassifier,
26
+ BaggingClassifier,
27
+ StackingClassifier,
28
+ )
29
+ import xgboost as xgb
30
+ import lightgbm as lgb
31
+ warnings.filterwarnings('ignore')
32
+ np.random.seed(42)
33
+
34
+ REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
35
+ DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
36
+ OUT_DIR = Path('models')
37
+ OUT_DIR.mkdir(exist_ok=True)
38
+
39
+ RANDOM_STATE = 42
40
+ N_SPLITS = 5
41
+ CHAMPION_F1 = 0.6110 # Score from A4
42
+
43
+ movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
44
+ weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
45
+
46
+ print('Movement features shape:', movement_features_df.shape)
47
+ print('Weak link scores shape:', weaklink_scores_df.shape)
48
+
49
+ DUPLICATE_NASM_COLS = [
50
+ 'No_1_NASM_Deviation',
51
+ 'No_2_NASM_Deviation',
52
+ 'No_3_NASM_Deviation',
53
+ 'No_4_NASM_Deviation',
54
+ 'No_5_NASM_Deviation',
55
+ ]
56
+
57
+ movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
58
+ print('Shape after duplicate removal:', movement_features_df.shape)
59
+
60
+ weaklink_categories = [
61
+ 'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
62
+ 'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
63
+ 'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
64
+ 'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
65
+ 'RightKneeMovesOutward', 'RightShoulderElevation',
66
+ ]
67
+
68
+ weaklink_scores_df['WeakestLink'] = (
69
+ weaklink_scores_df[weaklink_categories].idxmax(axis=1)
70
+ )
71
+ print('Weakest Link class distribution:')
72
+ print(weaklink_scores_df['WeakestLink'].value_counts())
73
+
74
+ # Merge Datasets
75
+ target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
76
+ merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
77
+ print('Merged dataset shape:', merged_df.shape)
78
+
79
+ EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
80
+ feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
81
+
82
+ X = merged_df[feature_columns].values
83
+ y = merged_df['WeakestLink'].values
84
+
85
+ print(f'Feature matrix shape : {X.shape}')
86
+ print(f'Number of features : {len(feature_columns)}')
87
+ print(f'Number of classes : {len(np.unique(y))}')
88
+
89
+ # is the training split needed for cross validation?
90
+ X_train, X_test, y_train, y_test = train_test_split(
91
+ X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
92
+ )
93
+
94
+ scaler = StandardScaler()
95
+ X_train_scaled = scaler.fit_transform(X_train)
96
+ X_test_scaled = scaler.transform(X_test)
97
+
98
+ print(f'Training samples : {X_train.shape[0]}')
99
+ print(f'Test samples : {X_test.shape[0]}')
100
+
101
+ cv_strategy = StratifiedKFold(
102
+ n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
103
+ )
104
+
105
+ def evaluate_cv(model, X, y, cv, name='Model'):
106
+ scoring = {
107
+ 'accuracy' : 'accuracy',
108
+ 'f1' : 'f1_weighted',
109
+ 'precision': 'precision_weighted',
110
+ 'recall' : 'recall_weighted',
111
+ }
112
+ cv_res = cross_validate(model, X, y, cv=cv, scoring=scoring)
113
+ return {
114
+ 'Model' : name,
115
+ 'Accuracy_mean' : cv_res['test_accuracy'].mean(),
116
+ 'Accuracy_std' : cv_res['test_accuracy'].std(),
117
+ 'F1_mean' : cv_res['test_f1'].mean(),
118
+ 'F1_std' : cv_res['test_f1'].std(),
119
+ 'Precision_mean': cv_res['test_precision'].mean(),
120
+ 'Recall_mean' : cv_res['test_recall'].mean(),
121
+ '_f1_scores' : cv_res['test_f1'],
122
+ }
123
+
124
+ rf_champion = RandomForestClassifier(
125
+ n_estimators=200, max_depth=15,
126
+ min_samples_split=5, min_samples_leaf=2,
127
+ class_weight='balanced',
128
+ random_state=RANDOM_STATE, n_jobs=-1
129
+ )
130
+ champ_cv = evaluate_cv(
131
+ rf_champion, X_train_scaled, y_train, cv_strategy,
132
+ name='A4 Champion – Random Forest'
133
+ )
134
+ rf_champion.fit(X_train_scaled, y_train)
135
+ champ_test_f1 = f1_score(y_test, rf_champion.predict(X_test_scaled), average='weighted')
136
+
137
+ print('A4 CHAMPION (Random Forest)')
138
+ print(f'CV F1: {champ_cv["F1_mean"]:.4f} +/- {champ_cv["F1_std"]:.4f}')
139
+ print(f'Test F1: {champ_test_f1:.4f}')
140
+
141
+ soft_voting = VotingClassifier(
142
+ estimators=[
143
+ ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',
144
+ random_state=RANDOM_STATE, n_jobs=-1)),
145
+ ('lr', LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),
146
+ ('xgb', xgb.XGBClassifier( n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,
147
+ colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),
148
+ ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,
149
+ random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),
150
+ ('knn', KNeighborsClassifier(n_neighbors=7)),
151
+ ('lda', LinearDiscriminantAnalysis()),
152
+ ],
153
+ voting='soft',
154
+ n_jobs=-1,
155
+ )
156
+
157
+ sv_cv = evaluate_cv(soft_voting, X_train_scaled, y_train, cv_strategy, name='Soft Voting')
158
+ print(f'Soft Voting CV F1: {sv_cv["F1_mean"]:.4f} +/- {sv_cv["F1_std"]:.4f}')
159
+
160
+ all_results = [champ_cv, sv_cv]
161
+ results_df = (
162
+ pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
163
+ for r in all_results])
164
+ .sort_values('F1_mean', ascending=False)
165
+ .reset_index(drop=True)
166
+ )
167
+
168
+ print('5-FOLD CROSS-VALIDATION SUMMARY')
169
+ print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
170
+ 'Precision_mean','Recall_mean']].to_string(index=False))
171
+
172
+ # Statistical Significance Test (t-test)
173
+ def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
174
+ k = len(scores_a)
175
+ diff = scores_a - scores_b
176
+ d_bar = diff.mean()
177
+ s_sq = diff.var(ddof=1)
178
+ var_corr = (1/k + n_test/n_train) * s_sq
179
+ t_stat = d_bar / np.sqrt(var_corr)
180
+ p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))
181
+ return float(t_stat), float(p_value)
182
+
183
+ n_total = len(X_train_scaled)
184
+ n_test_fold = n_total // N_SPLITS
185
+ n_train_fold = n_total - n_test_fold
186
+
187
+ result_map = {r['Model']: r['_f1_scores'] for r in all_results}
188
+ champ_scores = result_map['A4 Champion – Random Forest']
189
+
190
+ print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
191
+ for r in all_results:
192
+ if 'Champion' in r['Model']:
193
+ continue
194
+ t, p = corrected_resampled_ttest(
195
+ r['_f1_scores'], champ_scores, n_train_fold, n_test_fold
196
+ )
197
+ print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
198
+
199
+ # unecessary eval on the test set?
200
+ model_objects = {
201
+ 'Soft Voting' : soft_voting,
202
+ 'A4 Champion – Random Forest': rf_champion,
203
+ }
204
+
205
+ best_name = results_df.iloc[0]['Model']
206
+ best_model = model_objects[best_name]
207
+
208
+ print(f'CHAMPION ENSEMBLE: {best_name}')
209
+ print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
210
+
211
+ best_model.fit(X_train_scaled, y_train)
212
+ y_pred_best = best_model.predict(X_test_scaled)
213
+
214
+ test_f1 = f1_score(y_test, y_pred_best, average='weighted')
215
+ test_acc = accuracy_score(y_test, y_pred_best)
216
+ test_prec = precision_score(y_test, y_pred_best, average='weighted', zero_division=0)
217
+ test_rec = recall_score(y_test, y_pred_best, average='weighted', zero_division=0)
218
+ improvement = (test_f1 - CHAMPION_F1) / CHAMPION_F1 * 100
219
+
220
+ print('\n TEST SET RESULTS')
221
+ print(f'F1-Score (weighted) : {test_f1:.4f}')
222
+ print(f'Accuracy : {test_acc:.4f}')
223
+ print(f'Precision : {test_prec:.4f}')
224
+ print(f'Recall : {test_rec:.4f}')
225
+ print(f'\n A4 original champion F1 : {CHAMPION_F1:.4f}')
226
+
227
+ test_rows = []
228
+ for name, model in model_objects.items():
229
+ model.fit(X_train_scaled, y_train)
230
+ preds = model.predict(X_test_scaled)
231
+ test_rows.append({
232
+ 'Model' : name,
233
+ 'Test_F1' : f1_score(y_test, preds, average='weighted'),
234
+ 'Test_Acc' : accuracy_score(y_test, preds),
235
+ 'Test_Prec' : precision_score(y_test, preds, average='weighted', zero_division=0),
236
+ 'Test_Recall': recall_score(y_test, preds, average='weighted', zero_division=0),
237
+ })
238
+
239
+ test_results_df = pd.DataFrame(test_rows).sort_values('Test_F1', ascending=False)
240
+ print('TEST SET COMPARISON – ALL MODELS')
241
+ print(test_results_df.to_string(index=False))
242
+
243
+ print(f'CLASSIFICATION REPORT: {best_name}')
244
+ print(classification_report(y_test, y_pred_best, zero_division=0))
245
+
246
+ # save model
247
+ artifact = {
248
+ 'model' : best_model,
249
+ 'model_name' : best_name,
250
+ 'scaler' : scaler,
251
+ 'feature_columns' : feature_columns,
252
+ 'cv_metrics': {
253
+ 'f1_mean' : float(results_df.iloc[0]['F1_mean']),
254
+ 'f1_std' : float(results_df.iloc[0]['F1_std']),
255
+ 'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
256
+ },
257
+ 'test_metrics': {
258
+ 'f1' : float(test_f1),
259
+ 'accuracy' : float(test_acc),
260
+ 'precision': float(test_prec),
261
+ 'recall' : float(test_rec),
262
+ },
263
+ 'a4_champion_f1' : CHAMPION_F1,
264
+ 'improvement_pct': float(improvement),
265
+ }
266
+
267
+ out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
268
+ with open(out_path, 'wb') as f:
269
+ pickle.dump(artifact, f)
270
+
271
+ print(f'Saved: {out_path}')
A5b/cv_baseline.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from pathlib import Path
9
+ from scipy import stats
10
+
11
+ from sklearn.model_selection import (
12
+ StratifiedKFold, cross_validate
13
+ )
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.metrics import (
16
+ accuracy_score, precision_score, recall_score, f1_score,
17
+ classification_report, confusion_matrix
18
+ )
19
+ from sklearn.linear_model import LogisticRegression
20
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
21
+ from sklearn.neighbors import KNeighborsClassifier
22
+ from sklearn.naive_bayes import GaussianNB
23
+ from sklearn.ensemble import (
24
+ RandomForestClassifier,
25
+ VotingClassifier,
26
+ BaggingClassifier,
27
+ StackingClassifier,
28
+ )
29
+ import xgboost as xgb
30
+ import lightgbm as lgb
31
+ warnings.filterwarnings('ignore')
32
+ np.random.seed(42)
33
+
34
+ REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
35
+ DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
36
+ OUT_DIR = Path('models')
37
+ OUT_DIR.mkdir(exist_ok=True)
38
+
39
+ RANDOM_STATE = 42
40
+ N_SPLITS = 5
41
+ CHAMPION_F1 = 0.6110 # Score from A4
42
+
43
+ movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
44
+ weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
45
+
46
+ print('Movement features shape:', movement_features_df.shape)
47
+ print('Weak link scores shape:', weaklink_scores_df.shape)
48
+
49
+ DUPLICATE_NASM_COLS = [
50
+ 'No_1_NASM_Deviation',
51
+ 'No_2_NASM_Deviation',
52
+ 'No_3_NASM_Deviation',
53
+ 'No_4_NASM_Deviation',
54
+ 'No_5_NASM_Deviation',
55
+ ]
56
+
57
+ movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
58
+ print('Shape after duplicate removal:', movement_features_df.shape)
59
+
60
+ weaklink_categories = [
61
+ 'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
62
+ 'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
63
+ 'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
64
+ 'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
65
+ 'RightKneeMovesOutward', 'RightShoulderElevation',
66
+ ]
67
+
68
+ weaklink_scores_df['WeakestLink'] = (
69
+ weaklink_scores_df[weaklink_categories].idxmax(axis=1)
70
+ )
71
+ print('Weakest Link class distribution:')
72
+ print(weaklink_scores_df['WeakestLink'].value_counts())
73
+
74
+ # Merge Datasets
75
+ target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
76
+ merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
77
+ print('Merged dataset shape:', merged_df.shape)
78
+
79
+ EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
80
+ feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
81
+
82
+ X = merged_df[feature_columns].values
83
+ y = merged_df['WeakestLink'].values
84
+
85
+ print(f'Feature matrix shape : {X.shape}')
86
+ print(f'Number of features : {len(feature_columns)}')
87
+ print(f'Number of classes : {len(np.unique(y))}')
88
+
89
+ scaler = StandardScaler()
90
+ X_scaled = scaler.fit_transform(X)
91
+
92
+ print(f'Total samples : {X.shape[0]}')
93
+
94
+ cv_strategy = StratifiedKFold(
95
+ n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
96
+ )
97
+
98
+ def evaluate_cv(model, X, y, cv, name='Model'):
99
+ scoring = {
100
+ 'accuracy' : 'accuracy',
101
+ 'f1' : 'f1_weighted',
102
+ 'precision': 'precision_weighted',
103
+ 'recall' : 'recall_weighted',
104
+ }
105
+ cv_res = cross_validate(model, X, y, cv=cv, scoring=scoring)
106
+ return {
107
+ 'Model' : name,
108
+ 'Accuracy_mean' : cv_res['test_accuracy'].mean(),
109
+ 'Accuracy_std' : cv_res['test_accuracy'].std(),
110
+ 'F1_mean' : cv_res['test_f1'].mean(),
111
+ 'F1_std' : cv_res['test_f1'].std(),
112
+ 'Precision_mean': cv_res['test_precision'].mean(),
113
+ 'Recall_mean' : cv_res['test_recall'].mean(),
114
+ '_f1_scores' : cv_res['test_f1'],
115
+ }
116
+
117
+ rf_champion = RandomForestClassifier(
118
+ n_estimators=200, max_depth=15,
119
+ min_samples_split=5, min_samples_leaf=2,
120
+ class_weight='balanced',
121
+ random_state=RANDOM_STATE, n_jobs=-1
122
+ )
123
+ champ_cv = evaluate_cv(
124
+ rf_champion, X_scaled, y, cv_strategy,
125
+ name='A4 Champion – Random Forest'
126
+ )
127
+
128
+ soft_voting = VotingClassifier(
129
+ estimators=[
130
+ ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',
131
+ random_state=RANDOM_STATE, n_jobs=-1)),
132
+ ('lr', LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),
133
+ ('xgb', xgb.XGBClassifier( n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,
134
+ colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),
135
+ ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,
136
+ random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),
137
+ ('knn', KNeighborsClassifier(n_neighbors=7)),
138
+ ('lda', LinearDiscriminantAnalysis()),
139
+ ],
140
+ voting='soft',
141
+ n_jobs=-1,
142
+ )
143
+
144
+ sv_cv = evaluate_cv(soft_voting, X_scaled, y, cv_strategy, name='Soft Voting')
145
+
146
+ all_results = [champ_cv, sv_cv]
147
+ results_df = (
148
+ pd.DataFrame([{k: v for k, v in r.items() if k != '_f1_scores'}
149
+ for r in all_results])
150
+ .sort_values('F1_mean', ascending=False)
151
+ .reset_index(drop=True)
152
+ )
153
+
154
+ print('5-FOLD CROSS-VALIDATION SUMMARY')
155
+ print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
156
+ 'Precision_mean','Recall_mean']].to_string(index=False))
157
+
158
+ # Statistical Significance Test (t-test)
159
+ def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
160
+ k = len(scores_a)
161
+ diff = scores_a - scores_b
162
+ d_bar = diff.mean()
163
+ s_sq = diff.var(ddof=1)
164
+ var_corr = (1/k + n_test/n_train) * s_sq
165
+ t_stat = d_bar / np.sqrt(var_corr)
166
+ p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))
167
+ return float(t_stat), float(p_value)
168
+
169
+ n_total = len(X_scaled)
170
+ n_test_fold = n_total // N_SPLITS
171
+ n_train_fold = n_total - n_test_fold
172
+
173
+ result_map = {r['Model']: r['_f1_scores'] for r in all_results}
174
+ champ_scores = result_map['A4 Champion – Random Forest']
175
+
176
+ print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
177
+ for r in all_results:
178
+ if 'Champion' in r['Model']:
179
+ continue
180
+ t, p = corrected_resampled_ttest(
181
+ r['_f1_scores'], champ_scores, n_train_fold, n_test_fold
182
+ )
183
+ print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
184
+
185
+ # Save model using cross-validation (fit on all data)
186
+ model_objects = {
187
+ 'Soft Voting' : soft_voting,
188
+ 'A4 Champion – Random Forest': rf_champion,
189
+ }
190
+
191
+ best_name = results_df.iloc[0]['Model']
192
+ best_model = model_objects[best_name]
193
+
194
+ print(f'CHAMPION ENSEMBLE: {best_name}')
195
+ print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
196
+
197
+ # Fit best model on all data for final deployment
198
+ best_model.fit(X_scaled, y)
199
+
200
+ # Save model
201
+ artifact = {
202
+ 'model' : best_model,
203
+ 'model_name' : best_name,
204
+ 'scaler' : scaler,
205
+ 'feature_columns' : feature_columns,
206
+ 'cv_metrics': {
207
+ 'f1_mean' : float(results_df.iloc[0]['F1_mean']),
208
+ 'f1_std' : float(results_df.iloc[0]['F1_std']),
209
+ 'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
210
+ 'precision_mean': float(results_df.iloc[0]['Precision_mean']),
211
+ 'recall_mean' : float(results_df.iloc[0]['Recall_mean']),
212
+ },
213
+ 'a4_champion_f1' : CHAMPION_F1,
214
+ }
215
+
216
+ out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
217
+ with open(out_path, 'wb') as f:
218
+ pickle.dump(artifact, f)
219
+
220
+ print(f'Saved: {out_path}')
A5b/models/adaboost_classification.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d34c717b5f5dc02f4481f3207afcacb94ceb3ec69069589ca6abe435c8001470
3
+ size 725059
A5b/models/adaboost_feature_importance.png ADDED

Git LFS Details

  • SHA256: aecc76dbe9ce90a4813a3b7040d1e57ee324ec1d71f10303b129c41cbc3ad744
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
A5b/models/bagging_trees_champion.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2475e28a11e89c0c4544064525f6d41d7890e19c5549575ac597d64e076616e
3
+ size 6506122
A5b/models/ensemble_classification_champion.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93cecc19fe3e22c357af6ee6778990d7e3b518a36bbb6418a78ecb6795ef4cce
3
+ size 30798315