{ "cells": [ { "cell_type": "markdown", "id": "70148918-76b1-4657-8d02-442a8883e0c0", "metadata": {}, "source": [ "# Projet 4 - Identifiez les causes d'attrition au sein d'une ESN" ] }, { "cell_type": "markdown", "id": "e389e683-cad2-4ef9-a3c8-0b838fdbb5ed", "metadata": {}, "source": [ "# Étape 3 : Modélisation\n", "**Objectif** : construire un premier modèle de classification\n", "\n", "**Contenu :**\n", "\n", "- Chargement des données traitées\n", "\n", "- Entraînement de modèles de base (un modèle Dummy, une régression logistique, un modèle non linéaire RandomForest)\n", "\n", "- Évaluation : accuracy, précision, rappel, f1-score, roc_auc\n", "\n", "- Visualisation de la matrice de confusion\n", "\n", "- Premières conclusions sur les performances" ] }, { "cell_type": "code", "execution_count": 2, "id": "7cad420e-a77b-4ec4-b5f7-38b360ae9ed7", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 44, "id": "368ef874-cdb9-4742-b214-56acca76bc11", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | age | \n", "genre | \n", "revenu_mensuel | \n", "statut_marital | \n", "departement | \n", "poste | \n", "nombre_experiences_precedentes | \n", "annee_experience_totale | \n", "annees_dans_l_entreprise | \n", "annees_dans_le_poste_actuel | \n", "... | \n", "augementation_salaire_precedente | \n", "attrition | \n", "nombre_participation_pee | \n", "nb_formations_suivies | \n", "distance_domicile_travail | \n", "niveau_education | \n", "domaine_etude | \n", "frequence_deplacement | \n", "annees_depuis_la_derniere_promotion | \n", "annes_sous_responsable_actuel | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "41 | \n", "F | \n", "5993 | \n", "Célibataire | \n", "Commercial | \n", "Cadre Commercial | \n", "8 | \n", "8 | \n", "6 | \n", "4 | \n", "... | \n", "11 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "2 | \n", "Infra & Cloud | \n", "Occasionnel | \n", "0 | \n", "5 | \n", "
| 1 | \n", "49 | \n", "M | \n", "5130 | \n", "Marié(e) | \n", "Consulting | \n", "Assistant de Direction | \n", "1 | \n", "10 | \n", "10 | \n", "7 | \n", "... | \n", "23 | \n", "0 | \n", "1 | \n", "3 | \n", "8 | \n", "1 | \n", "Infra & Cloud | \n", "Frequent | \n", "1 | \n", "7 | \n", "
| 2 | \n", "37 | \n", "M | \n", "2090 | \n", "Célibataire | \n", "Consulting | \n", "Consultant | \n", "6 | \n", "7 | \n", "0 | \n", "0 | \n", "... | \n", "15 | \n", "1 | \n", "0 | \n", "3 | \n", "2 | \n", "2 | \n", "Autre | \n", "Occasionnel | \n", "0 | \n", "0 | \n", "
| 3 | \n", "33 | \n", "F | \n", "2909 | \n", "Marié(e) | \n", "Consulting | \n", "Assistant de Direction | \n", "1 | \n", "8 | \n", "8 | \n", "7 | \n", "... | \n", "11 | \n", "0 | \n", "0 | \n", "3 | \n", "3 | \n", "4 | \n", "Infra & Cloud | \n", "Frequent | \n", "3 | \n", "0 | \n", "
| 4 | \n", "27 | \n", "M | \n", "3468 | \n", "Marié(e) | \n", "Consulting | \n", "Consultant | \n", "9 | \n", "6 | \n", "2 | \n", "2 | \n", "... | \n", "12 | \n", "0 | \n", "1 | \n", "3 | \n", "2 | \n", "1 | \n", "Transformation Digitale | \n", "Occasionnel | \n", "2 | \n", "2 | \n", "
5 rows × 27 columns
\n", "Pipeline(steps=[('prep',\n",
" ColumnTransformer(transformers=[('ord',\n",
" OrdinalEncoder(categories=[['Aucun',\n",
" 'Occasionnel',\n",
" 'Frequent']],\n",
" handle_unknown='use_encoded_value',\n",
" unknown_value=-1),\n",
" ['frequence_deplacement']),\n",
" ('ohe',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False),\n",
" ['genre', 'statut_marital',\n",
" 'departement', 'poste',\n",
" 'domaine_etude']),\n",
" ('num', 'pass...\n",
" 'nombre_participation_pee',\n",
" 'nb_formations_suivies',\n",
" 'distance_domicile_travail',\n",
" 'niveau_education',\n",
" 'annees_depuis_la_derniere_promotion',\n",
" 'annes_sous_responsable_actuel',\n",
" 'satisfaction_globale',\n",
" 'exp_moins_3_years'])])),\n",
" ('clf',\n",
" RandomForestClassifier(class_weight='balanced', max_depth=8,\n",
" min_samples_leaf=5,\n",
" min_samples_split=10, n_estimators=200,\n",
" n_jobs=-1, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | \n", " | steps | \n", "[('prep', ...), ('clf', ...)] | \n", "
| \n", " | transform_input | \n", "None | \n", "
| \n", " | memory | \n", "None | \n", "
| \n", " | verbose | \n", "False | \n", "
| \n", " | transformers | \n", "[('ord', ...), ('ohe', ...), ...] | \n", "
| \n", " | remainder | \n", "'drop' | \n", "
| \n", " | sparse_threshold | \n", "0.3 | \n", "
| \n", " | n_jobs | \n", "None | \n", "
| \n", " | transformer_weights | \n", "None | \n", "
| \n", " | verbose | \n", "False | \n", "
| \n", " | verbose_feature_names_out | \n", "True | \n", "
| \n", " | force_int_remainder_cols | \n", "'deprecated' | \n", "
['frequence_deplacement']
| \n", " | categories | \n", "[['Aucun', 'Occasionnel', ...]] | \n", "
| \n", " | dtype | \n", "<class 'numpy.float64'> | \n", "
| \n", " | handle_unknown | \n", "'use_encoded_value' | \n", "
| \n", " | unknown_value | \n", "-1 | \n", "
| \n", " | encoded_missing_value | \n", "nan | \n", "
| \n", " | min_frequency | \n", "None | \n", "
| \n", " | max_categories | \n", "None | \n", "
['genre', 'statut_marital', 'departement', 'poste', 'domaine_etude']
| \n", " | categories | \n", "'auto' | \n", "
| \n", " | drop | \n", "None | \n", "
| \n", " | sparse_output | \n", "False | \n", "
| \n", " | dtype | \n", "<class 'numpy.float64'> | \n", "
| \n", " | handle_unknown | \n", "'ignore' | \n", "
| \n", " | min_frequency | \n", "None | \n", "
| \n", " | max_categories | \n", "None | \n", "
| \n", " | feature_name_combiner | \n", "'concat' | \n", "
['age', 'revenu_mensuel', 'nombre_experiences_precedentes', 'annees_dans_le_poste_actuel', 'note_evaluation_precedente', 'note_evaluation_actuelle', 'heure_supplementaires', 'augementation_salaire_precedente', 'nombre_participation_pee', 'nb_formations_suivies', 'distance_domicile_travail', 'niveau_education', 'annees_depuis_la_derniere_promotion', 'annes_sous_responsable_actuel', 'satisfaction_globale', 'exp_moins_3_years']
passthrough
| \n", " | n_estimators | \n", "200 | \n", "
| \n", " | criterion | \n", "'gini' | \n", "
| \n", " | max_depth | \n", "8 | \n", "
| \n", " | min_samples_split | \n", "10 | \n", "
| \n", " | min_samples_leaf | \n", "5 | \n", "
| \n", " | min_weight_fraction_leaf | \n", "0.0 | \n", "
| \n", " | max_features | \n", "'sqrt' | \n", "
| \n", " | max_leaf_nodes | \n", "None | \n", "
| \n", " | min_impurity_decrease | \n", "0.0 | \n", "
| \n", " | bootstrap | \n", "True | \n", "
| \n", " | oob_score | \n", "False | \n", "
| \n", " | n_jobs | \n", "-1 | \n", "
| \n", " | random_state | \n", "42 | \n", "
| \n", " | verbose | \n", "0 | \n", "
| \n", " | warm_start | \n", "False | \n", "
| \n", " | class_weight | \n", "'balanced' | \n", "
| \n", " | ccp_alpha | \n", "0.0 | \n", "
| \n", " | max_samples | \n", "None | \n", "
| \n", " | monotonic_cst | \n", "None | \n", "
Pipeline(steps=[('prep',\n",
" ColumnTransformer(transformers=[('ord',\n",
" OrdinalEncoder(categories=[['Aucun',\n",
" 'Occasionnel',\n",
" 'Frequent']],\n",
" handle_unknown='use_encoded_value',\n",
" unknown_value=-1),\n",
" ['frequence_deplacement']),\n",
" ('ohe',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False),\n",
" ['genre', 'statut_marital',\n",
" 'departement', 'poste',\n",
" 'domaine_etude']),\n",
" ('num', 'pass...\n",
" 'nombre_participation_pee',\n",
" 'nb_formations_suivies',\n",
" 'distance_domicile_travail',\n",
" 'niveau_education',\n",
" 'annees_depuis_la_derniere_promotion',\n",
" 'annes_sous_responsable_actuel',\n",
" 'satisfaction_globale',\n",
" 'exp_moins_3_years'])])),\n",
" ('clf',\n",
" RandomForestClassifier(class_weight='balanced', max_depth=8,\n",
" min_samples_leaf=5,\n",
" min_samples_split=10, n_estimators=200,\n",
" n_jobs=-1, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | \n", " | steps | \n", "[('prep', ...), ('clf', ...)] | \n", "
| \n", " | transform_input | \n", "None | \n", "
| \n", " | memory | \n", "None | \n", "
| \n", " | verbose | \n", "False | \n", "
| \n", " | transformers | \n", "[('ord', ...), ('ohe', ...), ...] | \n", "
| \n", " | remainder | \n", "'drop' | \n", "
| \n", " | sparse_threshold | \n", "0.3 | \n", "
| \n", " | n_jobs | \n", "None | \n", "
| \n", " | transformer_weights | \n", "None | \n", "
| \n", " | verbose | \n", "False | \n", "
| \n", " | verbose_feature_names_out | \n", "True | \n", "
| \n", " | force_int_remainder_cols | \n", "'deprecated' | \n", "
['frequence_deplacement']
| \n", " | categories | \n", "[['Aucun', 'Occasionnel', ...]] | \n", "
| \n", " | dtype | \n", "<class 'numpy.float64'> | \n", "
| \n", " | handle_unknown | \n", "'use_encoded_value' | \n", "
| \n", " | unknown_value | \n", "-1 | \n", "
| \n", " | encoded_missing_value | \n", "nan | \n", "
| \n", " | min_frequency | \n", "None | \n", "
| \n", " | max_categories | \n", "None | \n", "
['genre', 'statut_marital', 'departement', 'poste', 'domaine_etude']
| \n", " | categories | \n", "'auto' | \n", "
| \n", " | drop | \n", "None | \n", "
| \n", " | sparse_output | \n", "False | \n", "
| \n", " | dtype | \n", "<class 'numpy.float64'> | \n", "
| \n", " | handle_unknown | \n", "'ignore' | \n", "
| \n", " | min_frequency | \n", "None | \n", "
| \n", " | max_categories | \n", "None | \n", "
| \n", " | feature_name_combiner | \n", "'concat' | \n", "
['age', 'revenu_mensuel', 'nombre_experiences_precedentes', 'annees_dans_le_poste_actuel', 'note_evaluation_precedente', 'note_evaluation_actuelle', 'heure_supplementaires', 'augementation_salaire_precedente', 'nombre_participation_pee', 'nb_formations_suivies', 'distance_domicile_travail', 'niveau_education', 'annees_depuis_la_derniere_promotion', 'annes_sous_responsable_actuel', 'satisfaction_globale', 'exp_moins_3_years']
passthrough
| \n", " | n_estimators | \n", "200 | \n", "
| \n", " | criterion | \n", "'gini' | \n", "
| \n", " | max_depth | \n", "8 | \n", "
| \n", " | min_samples_split | \n", "10 | \n", "
| \n", " | min_samples_leaf | \n", "5 | \n", "
| \n", " | min_weight_fraction_leaf | \n", "0.0 | \n", "
| \n", " | max_features | \n", "'sqrt' | \n", "
| \n", " | max_leaf_nodes | \n", "None | \n", "
| \n", " | min_impurity_decrease | \n", "0.0 | \n", "
| \n", " | bootstrap | \n", "True | \n", "
| \n", " | oob_score | \n", "False | \n", "
| \n", " | n_jobs | \n", "-1 | \n", "
| \n", " | random_state | \n", "42 | \n", "
| \n", " | verbose | \n", "0 | \n", "
| \n", " | warm_start | \n", "False | \n", "
| \n", " | class_weight | \n", "'balanced' | \n", "
| \n", " | ccp_alpha | \n", "0.0 | \n", "
| \n", " | max_samples | \n", "None | \n", "
| \n", " | monotonic_cst | \n", "None | \n", "