diff --git "a/notebooks/03_modelisation.ipynb" "b/notebooks/03_modelisation.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/03_modelisation.ipynb" @@ -0,0 +1,5134 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "70148918-76b1-4657-8d02-442a8883e0c0", + "metadata": {}, + "source": [ + "# Project 4 - Identifiez les causes d'attrition au sein d'une ESN" + ] + }, + { + "cell_type": "markdown", + "id": "e389e683-cad2-4ef9-a3c8-0b838fdbb5ed", + "metadata": {}, + "source": [ + "# Étape 3 : Modélisation\n", + "**Objectif**: construire un premier modèle de classification\n", + "\n", + "**Contenu :**\n", + "\n", + "- Chargement des données traitées\n", + "\n", + "- Entraînement de modèles de base (un modèle Dummy, une régression logistique, un modèle non-linéaire RandomForest)\n", + "\n", + "- Évaluation : accuracy, precision, rappel, f1-score, roc_auc\n", + "\n", + "- Visualisation de la matrice de confusion\n", + "\n", + "- Premières conclusions sur les performances" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "368ef874-cdb9-4742-b214-56acca76bc11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('prep',\n",
+ " ColumnTransformer(transformers=[('ord',\n",
+ " OrdinalEncoder(categories=[['Aucun',\n",
+ " 'Occasionnel',\n",
+ " 'Frequent']],\n",
+ " handle_unknown='use_encoded_value',\n",
+ " unknown_value=-1),\n",
+ " ['frequence_deplacement']),\n",
+ " ('ohe',\n",
+ " OneHotEncoder(handle_unknown='ignore',\n",
+ " sparse_output=False),\n",
+ " ['genre', 'statut_marital',\n",
+ " 'departement', 'poste',\n",
+ " 'domaine_etude']),\n",
+ " ('num', 'pass...\n",
+ " 'nombre_participation_pee',\n",
+ " 'nb_formations_suivies',\n",
+ " 'distance_domicile_travail',\n",
+ " 'niveau_education',\n",
+ " 'annees_depuis_la_derniere_promotion',\n",
+ " 'annes_sous_responsable_actuel',\n",
+ " 'satisfaction_globale',\n",
+ " 'exp_moins_3_years'])])),\n",
+ " ('clf',\n",
+ " RandomForestClassifier(class_weight='balanced', max_depth=8,\n",
+ " min_samples_leaf=5,\n",
+ " min_samples_split=10, n_estimators=200,\n",
+ " n_jobs=-1, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | \n", + " | steps | \n", + "[('prep', ...), ('clf', ...)] | \n", + "
| \n", + " | transform_input | \n", + "None | \n", + "
| \n", + " | memory | \n", + "None | \n", + "
| \n", + " | verbose | \n", + "False | \n", + "
| \n", + " | transformers | \n", + "[('ord', ...), ('ohe', ...), ...] | \n", + "
| \n", + " | remainder | \n", + "'drop' | \n", + "
| \n", + " | sparse_threshold | \n", + "0.3 | \n", + "
| \n", + " | n_jobs | \n", + "None | \n", + "
| \n", + " | transformer_weights | \n", + "None | \n", + "
| \n", + " | verbose | \n", + "False | \n", + "
| \n", + " | verbose_feature_names_out | \n", + "True | \n", + "
| \n", + " | force_int_remainder_cols | \n", + "'deprecated' | \n", + "
['frequence_deplacement']
| \n", + " | categories | \n", + "[['Aucun', 'Occasionnel', ...]] | \n", + "
| \n", + " | dtype | \n", + "<class 'numpy.float64'> | \n", + "
| \n", + " | handle_unknown | \n", + "'use_encoded_value' | \n", + "
| \n", + " | unknown_value | \n", + "-1 | \n", + "
| \n", + " | encoded_missing_value | \n", + "nan | \n", + "
| \n", + " | min_frequency | \n", + "None | \n", + "
| \n", + " | max_categories | \n", + "None | \n", + "
['genre', 'statut_marital', 'departement', 'poste', 'domaine_etude']
| \n", + " | categories | \n", + "'auto' | \n", + "
| \n", + " | drop | \n", + "None | \n", + "
| \n", + " | sparse_output | \n", + "False | \n", + "
| \n", + " | dtype | \n", + "<class 'numpy.float64'> | \n", + "
| \n", + " | handle_unknown | \n", + "'ignore' | \n", + "
| \n", + " | min_frequency | \n", + "None | \n", + "
| \n", + " | max_categories | \n", + "None | \n", + "
| \n", + " | feature_name_combiner | \n", + "'concat' | \n", + "
['age', 'revenu_mensuel', 'nombre_experiences_precedentes', 'annees_dans_le_poste_actuel', 'note_evaluation_precedente', 'note_evaluation_actuelle', 'heure_supplementaires', 'augementation_salaire_precedente', 'nombre_participation_pee', 'nb_formations_suivies', 'distance_domicile_travail', 'niveau_education', 'annees_depuis_la_derniere_promotion', 'annes_sous_responsable_actuel', 'satisfaction_globale', 'exp_moins_3_years']
passthrough
| \n", + " | n_estimators | \n", + "200 | \n", + "
| \n", + " | criterion | \n", + "'gini' | \n", + "
| \n", + " | max_depth | \n", + "8 | \n", + "
| \n", + " | min_samples_split | \n", + "10 | \n", + "
| \n", + " | min_samples_leaf | \n", + "5 | \n", + "
| \n", + " | min_weight_fraction_leaf | \n", + "0.0 | \n", + "
| \n", + " | max_features | \n", + "'sqrt' | \n", + "
| \n", + " | max_leaf_nodes | \n", + "None | \n", + "
| \n", + " | min_impurity_decrease | \n", + "0.0 | \n", + "
| \n", + " | bootstrap | \n", + "True | \n", + "
| \n", + " | oob_score | \n", + "False | \n", + "
| \n", + " | n_jobs | \n", + "-1 | \n", + "
| \n", + " | random_state | \n", + "42 | \n", + "
| \n", + " | verbose | \n", + "0 | \n", + "
| \n", + " | warm_start | \n", + "False | \n", + "
| \n", + " | class_weight | \n", + "'balanced' | \n", + "
| \n", + " | ccp_alpha | \n", + "0.0 | \n", + "
| \n", + " | max_samples | \n", + "None | \n", + "
| \n", + " | monotonic_cst | \n", + "None | \n", + "
Pipeline(steps=[('prep',\n",
+ " ColumnTransformer(transformers=[('ord',\n",
+ " OrdinalEncoder(categories=[['Aucun',\n",
+ " 'Occasionnel',\n",
+ " 'Frequent']],\n",
+ " handle_unknown='use_encoded_value',\n",
+ " unknown_value=-1),\n",
+ " ['frequence_deplacement']),\n",
+ " ('ohe',\n",
+ " OneHotEncoder(handle_unknown='ignore',\n",
+ " sparse_output=False),\n",
+ " ['genre', 'statut_marital',\n",
+ " 'departement', 'poste',\n",
+ " 'domaine_etude']),\n",
+ " ('num', 'pass...\n",
+ " 'nombre_participation_pee',\n",
+ " 'nb_formations_suivies',\n",
+ " 'distance_domicile_travail',\n",
+ " 'niveau_education',\n",
+ " 'annees_depuis_la_derniere_promotion',\n",
+ " 'annes_sous_responsable_actuel',\n",
+ " 'satisfaction_globale',\n",
+ " 'exp_moins_3_years'])])),\n",
+ " ('clf',\n",
+ " RandomForestClassifier(class_weight='balanced', max_depth=8,\n",
+ " min_samples_leaf=5,\n",
+ " min_samples_split=10, n_estimators=200,\n",
+ " n_jobs=-1, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | \n", + " | steps | \n", + "[('prep', ...), ('clf', ...)] | \n", + "
| \n", + " | transform_input | \n", + "None | \n", + "
| \n", + " | memory | \n", + "None | \n", + "
| \n", + " | verbose | \n", + "False | \n", + "
| \n", + " | transformers | \n", + "[('ord', ...), ('ohe', ...), ...] | \n", + "
| \n", + " | remainder | \n", + "'drop' | \n", + "
| \n", + " | sparse_threshold | \n", + "0.3 | \n", + "
| \n", + " | n_jobs | \n", + "None | \n", + "
| \n", + " | transformer_weights | \n", + "None | \n", + "
| \n", + " | verbose | \n", + "False | \n", + "
| \n", + " | verbose_feature_names_out | \n", + "True | \n", + "
| \n", + " | force_int_remainder_cols | \n", + "'deprecated' | \n", + "
['frequence_deplacement']
| \n", + " | categories | \n", + "[['Aucun', 'Occasionnel', ...]] | \n", + "
| \n", + " | dtype | \n", + "<class 'numpy.float64'> | \n", + "
| \n", + " | handle_unknown | \n", + "'use_encoded_value' | \n", + "
| \n", + " | unknown_value | \n", + "-1 | \n", + "
| \n", + " | encoded_missing_value | \n", + "nan | \n", + "
| \n", + " | min_frequency | \n", + "None | \n", + "
| \n", + " | max_categories | \n", + "None | \n", + "
['genre', 'statut_marital', 'departement', 'poste', 'domaine_etude']
| \n", + " | categories | \n", + "'auto' | \n", + "
| \n", + " | drop | \n", + "None | \n", + "
| \n", + " | sparse_output | \n", + "False | \n", + "
| \n", + " | dtype | \n", + "<class 'numpy.float64'> | \n", + "
| \n", + " | handle_unknown | \n", + "'ignore' | \n", + "
| \n", + " | min_frequency | \n", + "None | \n", + "
| \n", + " | max_categories | \n", + "None | \n", + "
| \n", + " | feature_name_combiner | \n", + "'concat' | \n", + "
['age', 'revenu_mensuel', 'nombre_experiences_precedentes', 'annees_dans_le_poste_actuel', 'note_evaluation_precedente', 'note_evaluation_actuelle', 'heure_supplementaires', 'augementation_salaire_precedente', 'nombre_participation_pee', 'nb_formations_suivies', 'distance_domicile_travail', 'niveau_education', 'annees_depuis_la_derniere_promotion', 'annes_sous_responsable_actuel', 'satisfaction_globale', 'exp_moins_3_years']
passthrough
| \n", + " | n_estimators | \n", + "200 | \n", + "
| \n", + " | criterion | \n", + "'gini' | \n", + "
| \n", + " | max_depth | \n", + "8 | \n", + "
| \n", + " | min_samples_split | \n", + "10 | \n", + "
| \n", + " | min_samples_leaf | \n", + "5 | \n", + "
| \n", + " | min_weight_fraction_leaf | \n", + "0.0 | \n", + "
| \n", + " | max_features | \n", + "'sqrt' | \n", + "
| \n", + " | max_leaf_nodes | \n", + "None | \n", + "
| \n", + " | min_impurity_decrease | \n", + "0.0 | \n", + "
| \n", + " | bootstrap | \n", + "True | \n", + "
| \n", + " | oob_score | \n", + "False | \n", + "
| \n", + " | n_jobs | \n", + "-1 | \n", + "
| \n", + " | random_state | \n", + "42 | \n", + "
| \n", + " | verbose | \n", + "0 | \n", + "
| \n", + " | warm_start | \n", + "False | \n", + "
| \n", + " | class_weight | \n", + "'balanced' | \n", + "
| \n", + " | ccp_alpha | \n", + "0.0 | \n", + "
| \n", + " | max_samples | \n", + "None | \n", + "
| \n", + " | monotonic_cst | \n", + "None | \n", + "