# Weakest-link ensemble classification script (cleaned notebook export).
import os
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from scipy import stats
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    BaggingClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# --- Global configuration -------------------------------------------------
warnings.filterwarnings('ignore')

RANDOM_STATE = 42
N_SPLITS = 5
CHAMPION_F1 = 0.6110  # Score from A4
np.random.seed(RANDOM_STATE)

# Datasets live one level above the working directory; models are saved
# into a local 'models/' folder (created on demand).
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
OUT_DIR = Path('models')
OUT_DIR.mkdir(exist_ok=True)

# --- Load the raw datasets ------------------------------------------------
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)
| DUPLICATE_NASM_COLS = [ | |
| 'No_1_NASM_Deviation', | |
| 'No_2_NASM_Deviation', | |
| 'No_3_NASM_Deviation', | |
| 'No_4_NASM_Deviation', | |
| 'No_5_NASM_Deviation', | |
| ] | |
| movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS) | |
| print('Shape after duplicate removal:', movement_features_df.shape) | |
# The 14 weak-link categories scored in the weak-link dataset.
weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]

# Target label = the highest-scoring category per row. Note pandas' idxmax
# breaks ties by taking the first (leftmost) column.
category_scores = weaklink_scores_df[weaklink_categories]
weaklink_scores_df['WeakestLink'] = category_scores.idxmax(axis=1)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())
# --- Merge features with the target label ---------------------------------
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)

# Everything except identifiers / target / A4 score column is a feature;
# the comprehension preserves the original column order.
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [col for col in merged_df.columns if col not in EXCLUDE_COLS]
X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values
print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')
# Hold out a stratified 20% test set. Cross-validation below runs on the
# training portion only, so the test set remains untouched for the final
# unbiased evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Fit scaling statistics on the training split only (no test-set leakage).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f'Training samples : {X_train.shape[0]}')
print(f'Test samples : {X_test.shape[0]}')

cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)
def evaluate_cv(model, X, y, cv, name='Model', scoring=None):
    """Cross-validate *model* and summarise weighted classification metrics.

    Parameters
    ----------
    model : estimator implementing fit/predict.
    X, y : training features and labels.
    cv : cross-validation splitter (e.g. StratifiedKFold).
    name : label stored under the result's 'Model' key.
    scoring : optional dict mapping the keys 'accuracy', 'f1', 'precision'
        and 'recall' to sklearn scorer names. Defaults to the weighted
        variants used throughout this script, so existing callers are
        unaffected.

    Returns
    -------
    dict
        Mean/std summaries per metric, plus the raw per-fold F1 scores
        under the private key '_f1_scores' (consumed later by the
        corrected resampled t-test).
    """
    if scoring is None:
        scoring = {
            'accuracy' : 'accuracy',
            'f1'       : 'f1_weighted',
            'precision': 'precision_weighted',
            'recall'   : 'recall_weighted',
        }
    cv_res = cross_validate(model, X, y, cv=cv, scoring=scoring)
    return {
        'Model'         : name,
        'Accuracy_mean' : cv_res['test_accuracy'].mean(),
        'Accuracy_std'  : cv_res['test_accuracy'].std(),
        'F1_mean'       : cv_res['test_f1'].mean(),
        'F1_std'        : cv_res['test_f1'].std(),
        'Precision_mean': cv_res['test_precision'].mean(),
        'Recall_mean'   : cv_res['test_recall'].mean(),
        '_f1_scores'    : cv_res['test_f1'],
    }
# --- Baseline: the A4 champion Random Forest ------------------------------
champion_params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_champion = RandomForestClassifier(**champion_params)

champ_cv = evaluate_cv(
    rf_champion, X_train_scaled, y_train, cv_strategy,
    name='A4 Champion – Random Forest'
)
rf_champion.fit(X_train_scaled, y_train)
champ_pred = rf_champion.predict(X_test_scaled)
champ_test_f1 = f1_score(y_test, champ_pred, average='weighted')
print('A4 CHAMPION (Random Forest)')
print(f'CV F1: {champ_cv["F1_mean"]:.4f} +/- {champ_cv["F1_std"]:.4f}')
print(f'Test F1: {champ_test_f1:.4f}')
# --- Soft-voting ensemble over six diverse base learners ------------------
# NOTE(fix): class_weight='balanced' was previously passed to XGBClassifier,
# but XGBoost has no such parameter — it was silently ignored (its warning
# is suppressed by the filterwarnings call above). Removing it does not
# change the fitted model; use sample_weight / scale_pos_weight to handle
# imbalance in XGBoost.
soft_voting = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(
            n_estimators=200, max_depth=15, min_samples_split=5,
            min_samples_leaf=2, class_weight='balanced_subsample',
            random_state=RANDOM_STATE, n_jobs=-1)),
        ('lr', LogisticRegression(
            max_iter=1000, class_weight='balanced',
            random_state=RANDOM_STATE)),
        ('xgb', xgb.XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            subsample=0.8, colsample_bytree=0.8,
            random_state=RANDOM_STATE, n_jobs=-1)),
        ('lgb', lgb.LGBMClassifier(
            n_estimators=200, learning_rate=0.1, class_weight='balanced',
            subsample=0.8, colsample_bytree=0.8,
            random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)),
        ('knn', KNeighborsClassifier(n_neighbors=7)),
        ('lda', LinearDiscriminantAnalysis()),
    ],
    voting='soft',  # average predicted class probabilities across learners
    n_jobs=-1,
)
sv_cv = evaluate_cv(soft_voting, X_train_scaled, y_train, cv_strategy, name='Soft Voting')
print(f'Soft Voting CV F1: {sv_cv["F1_mean"]:.4f} +/- {sv_cv["F1_std"]:.4f}')
all_results = [champ_cv, sv_cv]

# Tabulate the CV summaries (dropping the private raw per-fold scores),
# sorted so the best weighted F1 comes first.
summary_rows = []
for res in all_results:
    summary_rows.append({k: v for k, v in res.items() if k != '_f1_scores'})
results_df = (
    pd.DataFrame(summary_rows)
    .sort_values('F1_mean', ascending=False)
    .reset_index(drop=True)
)
print('5-FOLD CROSS-VALIDATION SUMMARY')
print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
                  'Precision_mean','Recall_mean']].to_string(index=False))
# Statistical Significance Test (corrected resampled t-test)
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """Nadeau–Bengio corrected resampled paired t-test on per-fold scores.

    The variance of the mean score difference is inflated by the factor
    (1/k + n_test/n_train) to account for the overlap between CV training
    folds, which makes the naive paired t-test overconfident.

    Parameters
    ----------
    scores_a, scores_b : array-like of length k
        Per-fold scores of the two models (paired by fold).
    n_train, n_test : int
        Training/test sizes of a single CV fold.

    Returns
    -------
    (t_stat, p_value) : tuple of float
        Two-sided test; positive t means model A outperforms model B.
    """
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)
    k = len(scores_a)
    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    var_corr = (1.0 / k + n_test / n_train) * s_sq
    if var_corr == 0.0:
        # Identical difference in every fold: avoid a 0/0 or x/0 division.
        # Zero mean difference -> no effect at all (t=0, p=1); otherwise
        # the evidence is (degenerately) infinite.
        t_stat = 0.0 if d_bar == 0.0 else float(np.copysign(np.inf, d_bar))
        return t_stat, 1.0 if d_bar == 0.0 else 0.0
    t_stat = d_bar / np.sqrt(var_corr)
    # t.sf == 1 - t.cdf but numerically stable for large |t|.
    p_value = 2.0 * stats.t.sf(abs(t_stat), df=k - 1)
    return float(t_stat), float(p_value)
# --- Corrected paired t-tests: each ensemble vs the A4 champion -----------
n_total = len(X_train_scaled)
n_test_fold = n_total // N_SPLITS
n_train_fold = n_total - n_test_fold

result_map = {r['Model']: r['_f1_scores'] for r in all_results}
champ_scores = result_map['A4 Champion – Random Forest']
print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
for res in all_results:
    if 'Champion' in res['Model']:
        continue  # do not compare the champion against itself
    t, p = corrected_resampled_ttest(
        res['_f1_scores'], champ_scores, n_train_fold, n_test_fold
    )
    print(f' {res["Model"]:<35} t={t:+.3f} p={p:.4f}')
# Pick the model with the best CV F1 and run a single final evaluation on
# the held-out test set. (rf_champion was already fitted above; refitting
# here is redundant but cheap, and keeps the selection logic uniform.)
model_objects = {
    'Soft Voting'                : soft_voting,
    'A4 Champion – Random Forest': rf_champion,
}
best_row = results_df.iloc[0]
best_name = best_row['Model']
best_model = model_objects[best_name]
print(f'CHAMPION ENSEMBLE: {best_name}')
print(f'CV F1 : {best_row["F1_mean"]:.4f} +/- {best_row["F1_std"]:.4f}')

best_model.fit(X_train_scaled, y_train)
y_pred_best = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred_best, average='weighted')
test_acc = accuracy_score(y_test, y_pred_best)
test_prec = precision_score(y_test, y_pred_best, average='weighted', zero_division=0)
test_rec = recall_score(y_test, y_pred_best, average='weighted', zero_division=0)
improvement = (test_f1 - CHAMPION_F1) / CHAMPION_F1 * 100

print('\n TEST SET RESULTS')
print(f'F1-Score (weighted) : {test_f1:.4f}')
print(f'Accuracy : {test_acc:.4f}')
print(f'Precision : {test_prec:.4f}')
print(f'Recall : {test_rec:.4f}')
print(f'\n A4 original champion F1 : {CHAMPION_F1:.4f}')
def _test_metrics(name, model):
    """Fit on the training split, score on the held-out test set."""
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    return {
        'Model'      : name,
        'Test_F1'    : f1_score(y_test, preds, average='weighted'),
        'Test_Acc'   : accuracy_score(y_test, preds),
        'Test_Prec'  : precision_score(y_test, preds, average='weighted', zero_division=0),
        'Test_Recall': recall_score(y_test, preds, average='weighted', zero_division=0),
    }

# Side-by-side test-set comparison of every candidate model.
test_rows = [_test_metrics(name, model) for name, model in model_objects.items()]
test_results_df = pd.DataFrame(test_rows).sort_values('Test_F1', ascending=False)
print('TEST SET COMPARISON – ALL MODELS')
print(test_results_df.to_string(index=False))
print(f'CLASSIFICATION REPORT: {best_name}')
print(classification_report(y_test, y_pred_best, zero_division=0))
# --- Persist the winning model plus everything needed for inference -------
best_row = results_df.iloc[0]
artifact = {
    'model'          : best_model,
    'model_name'     : best_name,
    'scaler'         : scaler,
    'feature_columns': feature_columns,
    'cv_metrics': {
        'f1_mean'      : float(best_row['F1_mean']),
        'f1_std'       : float(best_row['F1_std']),
        'accuracy_mean': float(best_row['Accuracy_mean']),
    },
    'test_metrics': {
        'f1'       : float(test_f1),
        'accuracy' : float(test_acc),
        'precision': float(test_prec),
        'recall'   : float(test_rec),
    },
    'a4_champion_f1' : CHAMPION_F1,
    'improvement_pct': float(improvement),
}
out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
with out_path.open('wb') as f:
    pickle.dump(artifact, f)
print(f'Saved: {out_path}')