# Source: github_sync / A5b / cv_baseline.py
# Author: Bachstelze — commit 99097e8 ("update pickle with classes")
import os
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
from sklearn.model_selection import (
StratifiedKFold, cross_validate
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
classification_report, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (
RandomForestClassifier,
VotingClassifier,
BaggingClassifier,
StackingClassifier,
)
import xgboost as xgb
import lightgbm as lgb
# Global script configuration.
warnings.filterwarnings('ignore')  # NOTE(review): suppresses ALL warnings (sklearn/xgboost included) — consider narrowing
np.random.seed(42)  # seed numpy's global RNG for any library-level randomness
# Paths: assumes the script runs one directory below the repository root — TODO confirm.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')
OUT_DIR = Path('models')  # pickled model artifacts are written here
OUT_DIR.mkdir(exist_ok=True)
RANDOM_STATE = 42  # shared seed for every estimator and the CV splitter
N_SPLITS = 5  # stratified k-fold count
CHAMPION_F1 = 0.6110 # reference weighted-F1 of the A4 assignment's champion model
# Load the two raw CSVs: per-subject movement features and weak-link scores.
movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))
weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))
print('Movement features shape:', movement_features_df.shape)
print('Weak link scores shape:', weaklink_scores_df.shape)
# Columns dropped as duplicates of other NASM deviation columns
# (presumably established in an earlier assignment — TODO confirm).
DUPLICATE_NASM_COLS = [
    'No_1_NASM_Deviation',
    'No_2_NASM_Deviation',
    'No_3_NASM_Deviation',
    'No_4_NASM_Deviation',
    'No_5_NASM_Deviation',
]
movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)
print('Shape after duplicate removal:', movement_features_df.shape)
# The 14 NASM weak-link score columns used to derive the classification target.
weaklink_categories = [
    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',
    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',
    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',
    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',
    'RightKneeMovesOutward', 'RightShoulderElevation',
]
# Target label = name of the highest-scoring weak-link column per row
# (ties resolve to the first matching column, per pandas idxmax semantics).
weaklink_scores_df['WeakestLink'] = (
    weaklink_scores_df[weaklink_categories].idxmax(axis=1)
)
print('Weakest Link class distribution:')
print(weaklink_scores_df['WeakestLink'].value_counts())
# Merge Datasets: attach the WeakestLink target to the movement features by ID.
target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()
merged_df = movement_features_df.merge(target_df, on='ID', how='inner')
print('Merged dataset shape:', merged_df.shape)
# Every column except identifiers/targets becomes a model feature.
EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']
feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]
X = merged_df[feature_columns].values
y = merged_df['WeakestLink'].values
print(f'Feature matrix shape : {X.shape}')
print(f'Number of features : {len(feature_columns)}')
print(f'Number of classes : {len(np.unique(y))}')
# NOTE(review): the scaler is fit on the FULL dataset before cross-validation,
# leaking test-fold statistics into training; wrapping StandardScaler + model
# in a Pipeline inside cross_validate would avoid this. CV scores below are
# therefore likely slightly optimistic.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f'Total samples : {X.shape[0]}')
cv_strategy = StratifiedKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
)
def evaluate_cv(model, X, y, cv, name='Model'):
    """Cross-validate *model* and summarise weighted classification metrics.

    Runs sklearn's ``cross_validate`` with accuracy plus weighted F1,
    precision and recall, and returns a flat summary dict. The raw
    per-fold F1 array is kept under the private key ``_f1_scores`` so
    callers can run paired significance tests later.
    """
    metric_map = {
        'accuracy': 'accuracy',
        'f1': 'f1_weighted',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
    }
    fold_scores = cross_validate(model, X, y, cv=cv, scoring=metric_map)

    summary = {'Model': name}
    summary['Accuracy_mean'] = fold_scores['test_accuracy'].mean()
    summary['Accuracy_std'] = fold_scores['test_accuracy'].std()
    summary['F1_mean'] = fold_scores['test_f1'].mean()
    summary['F1_std'] = fold_scores['test_f1'].std()
    summary['Precision_mean'] = fold_scores['test_precision'].mean()
    summary['Recall_mean'] = fold_scores['test_recall'].mean()
    summary['_f1_scores'] = fold_scores['test_f1']
    return summary
# A4 champion configuration: the tuned Random Forest from the previous
# assignment, re-evaluated here under the same 5-fold protocol as the ensembles.
rf_champion = RandomForestClassifier(
    n_estimators=200, max_depth=15,
    min_samples_split=5, min_samples_leaf=2,
    class_weight='balanced',  # re-weight classes inversely to their frequency
    random_state=RANDOM_STATE, n_jobs=-1
)
champ_cv = evaluate_cv(
    rf_champion, X_scaled, y, cv_strategy,
    name='A4 Champion – Random Forest'
)
# Soft-voting ensemble of heterogeneous base learners: final prediction is the
# argmax of the averaged class probabilities across all six estimators.
# FIX: the original passed class_weight='balanced' to xgb.XGBClassifier, but
# XGBoost's sklearn wrapper has no such parameter — the kwarg was silently
# ignored by the booster (and the "not used" warning was hidden by the global
# warnings filter). Removed; XGB imbalance handling would need sample_weight
# or scale_pos_weight instead.
# NOTE(review): xgboost >= 1.6 rejects string class labels — if `y` holds the
# WeakestLink strings, confirm the installed version accepts them or
# label-encode upstream.
soft_voting = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(
            n_estimators=200, max_depth=15,
            min_samples_split=5, min_samples_leaf=2,
            class_weight='balanced_subsample',
            random_state=RANDOM_STATE, n_jobs=-1)),
        ('lr', LogisticRegression(
            max_iter=1000, class_weight='balanced',
            random_state=RANDOM_STATE)),
        ('xgb', xgb.XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            subsample=0.8, colsample_bytree=0.8,
            random_state=RANDOM_STATE, n_jobs=-1)),
        ('lgb', lgb.LGBMClassifier(
            n_estimators=200, learning_rate=0.1, class_weight='balanced',
            subsample=0.8, colsample_bytree=0.8,
            random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)),
        ('knn', KNeighborsClassifier(n_neighbors=7)),
        ('lda', LinearDiscriminantAnalysis()),
    ],
    voting='soft',  # average predicted probabilities, then argmax
    n_jobs=-1,
)
sv_cv = evaluate_cv(soft_voting, X_scaled, y, cv_strategy, name='Soft Voting')
# Collect each model's summary (minus the raw fold scores) into a table
# ranked by mean weighted F1, best first.
all_results = [champ_cv, sv_cv]
summary_rows = []
for res in all_results:
    public_fields = {key: val for key, val in res.items() if key != '_f1_scores'}
    summary_rows.append(public_fields)
results_df = pd.DataFrame(summary_rows)
results_df = results_df.sort_values('F1_mean', ascending=False)
results_df = results_df.reset_index(drop=True)
print('5-FOLD CROSS-VALIDATION SUMMARY')
print(results_df[['Model','F1_mean','F1_std','Accuracy_mean',
    'Precision_mean','Recall_mean']].to_string(index=False))
# Statistical Significance Test (t-test)
def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):
    """Nadeau & Bengio (2003) corrected resampled t-test on paired CV scores.

    A naive paired t-test over CV folds underestimates variance because
    the folds' training sets overlap; the ``(1/k + n_test/n_train)``
    factor corrects for that.

    Parameters
    ----------
    scores_a, scores_b : array-like of shape (k,)
        Fold-aligned per-fold scores of the two models being compared.
    n_train, n_test : int
        Number of training / test samples in each fold.

    Returns
    -------
    (t_stat, p_value) : tuple of float
        Two-sided test. When the per-fold differences have zero variance:
        returns (0.0, 1.0) if the models scored identically, or
        (+/-inf, 0.0) if one model was better by a constant margin
        (the original formula divided by zero here).
    """
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)
    k = len(scores_a)
    diff = scores_a - scores_b
    d_bar = diff.mean()
    s_sq = diff.var(ddof=1)
    # Variance inflation for overlapping training sets (Nadeau & Bengio).
    var_corr = (1.0 / k + n_test / n_train) * s_sq
    if var_corr == 0.0:
        # Degenerate case: identical per-fold differences.
        if d_bar == 0.0:
            return 0.0, 1.0
        return float(np.sign(d_bar)) * float('inf'), 0.0
    t_stat = d_bar / np.sqrt(var_corr)
    # stats.t.sf is numerically safer than 1 - cdf for large |t|.
    p_value = 2.0 * stats.t.sf(abs(t_stat), df=k - 1)
    return float(t_stat), float(p_value)
# Per-fold sample counts for the Nadeau–Bengio correction. n_test is
# approximated as an even split; actual stratified folds may differ by one.
n_total = len(X_scaled)
n_test_fold = n_total // N_SPLITS
n_train_fold = n_total - n_test_fold
# Map model name -> raw per-fold F1 array for pairwise comparison.
result_map = {r['Model']: r['_f1_scores'] for r in all_results}
champ_scores = result_map['A4 Champion – Random Forest']
print('STATISTICAL SIGNIFICANCE TESTS vs A4 Champion')
for r in all_results:
    if 'Champion' in r['Model']:
        continue  # don't compare the champion against itself
    t, p = corrected_resampled_ttest(
        r['_f1_scores'], champ_scores, n_train_fold, n_test_fold
    )
    print(f' {r["Model"]:<35} t={t:+.3f} p={p:.4f}')
# Champion selection: take the top row of the CV ranking, refit that model on
# the full dataset, and pickle it together with its preprocessing artefacts.
model_objects = {
    'Soft Voting' : soft_voting,
    'A4 Champion – Random Forest': rf_champion,
}
# Keys above must match the `name=` strings passed to evaluate_cv.
best_name = results_df.iloc[0]['Model']
best_model = model_objects[best_name]
print(f'CHAMPION ENSEMBLE: {best_name}')
print(f'CV F1 : {results_df.iloc[0]["F1_mean"]:.4f} +/- {results_df.iloc[0]["F1_std"]:.4f}')
# Fit best model on all data for final deployment
best_model.fit(X_scaled, y)
# Bundle everything needed for inference: the fitted model, the scaler it was
# trained behind, the exact feature order, class labels, and its CV metrics.
artifact = {
    'model' : best_model,
    'model_name' : best_name,
    'scaler' : scaler,
    'feature_columns' : feature_columns,
    'classes': list(best_model.classes_),
    'cv_metrics': {
        'f1_mean' : float(results_df.iloc[0]['F1_mean']),
        'f1_std' : float(results_df.iloc[0]['F1_std']),
        'accuracy_mean': float(results_df.iloc[0]['Accuracy_mean']),
        'precision_mean': float(results_df.iloc[0]['Precision_mean']),
        'recall_mean' : float(results_df.iloc[0]['Recall_mean']),
    },
    'a4_champion_f1' : CHAMPION_F1,  # A4 reference score, kept for comparison
}
out_path = OUT_DIR / 'ensemble_classification_champion.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(artifact, f)
print(f'Saved: {out_path}')