File size: 5,069 Bytes
de1102d
aac75d5
de1102d
aac75d5
 
de1102d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aac75d5
 
 
 
 
 
de1102d
 
 
aac75d5
de1102d
 
 
 
 
 
 
aac75d5
 
de1102d
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


def load_raw_data(
    sondage_path="../raw_data/extrait_sondage.csv",
    eval_path="../raw_data/extrait_eval.csv",
    sirh_path="../raw_data/extrait_sirh.csv",
):
    """Load the three raw extracts, normalize their key columns, and merge.

    Returns a single "central" DataFrame: survey x evaluation joined on
    ``employee_id``, then joined to the HR extract on ``id_employee``.
    All technical key columns are dropped from the result.
    """
    survey_df = pd.read_csv(sondage_path)
    evaluation_df = pd.read_csv(eval_path)
    hr_df = pd.read_csv(sirh_path)

    # Normalizers for the raw string-encoded fields (same cleanup as
    # exploration.py): "12 %" -> 12.0, "E_42" -> 42, survey code -> int id.
    def _pct_to_float(value):
        return float(str(value).replace(" %", "")) if isinstance(value, str) else value

    def _eval_to_id(value):
        return int(str(value).replace("E_", "")) if isinstance(value, str) else value

    def _survey_to_id(value):
        return int(value) if isinstance(value, (str, int)) else None

    evaluation_df["augementation_salaire_precedente"] = evaluation_df[
        "augementation_salaire_precedente"
    ].apply(_pct_to_float)
    evaluation_df["employee_id"] = evaluation_df["eval_number"].apply(_eval_to_id)
    survey_df["employee_id"] = survey_df["code_sondage"].apply(_survey_to_id)

    # Inner joins: keep only employees present in all three extracts.
    central_df = survey_df.merge(evaluation_df, on="employee_id", how="inner").merge(
        hr_df, left_on="employee_id", right_on="id_employee", how="inner"
    )
    return central_df.drop(
        columns=["code_sondage", "eval_number", "id_employee", "employee_id"],
        errors="ignore",
    )


def preprocess_data(raw_data_paths=None):
    """
    Full pipeline: cleaning, feature engineering, encoding, scaling.

    Parameters
    ----------
    raw_data_paths : dict, optional
        Keyword arguments forwarded to ``load_raw_data``.  When omitted,
        the pre-merged file ``../output/central_df.csv`` is loaded instead.

    Returns
    -------
    tuple
        ``(X, y, scaler, onehot, ordinal)``: feature matrix, binary target
        (1 == "Oui"), and the three fitted transformers needed to apply the
        same transformation at API inference time.

    Design choices: no PCA (keeps features interpretable); 1% winsorization
    of numeric columns containing |z| > 3 outliers; one-hot encoding for
    unordered categoricals, ordinal encoding for ordered ones.
    """
    if raw_data_paths:
        central_df = load_raw_data(**raw_data_paths)
    else:
        central_df = pd.read_csv("../output/central_df.csv")  # pre-merged file

    # --- Cleaning: duplicates, constant columns, outliers -------------------
    central_df.drop_duplicates(inplace=True)
    # BUGFIX: drop_duplicates leaves gaps in the index, while the encoded
    # frames built below get a fresh RangeIndex; pd.concat(axis=1) aligns on
    # index, so without this reset rows would misalign and inject NaNs.
    central_df.reset_index(drop=True, inplace=True)
    columns_to_drop = (
        ["ayant_enfants"] if len(central_df["ayant_enfants"].unique()) == 1 else []
    )  # constant column carries no signal
    central_df.drop(columns=columns_to_drop, inplace=True)
    quantitative_cols = central_df.select_dtypes(include=["int64", "float64"]).columns
    for col in quantitative_cols:
        # Clip to the 1st/99th percentile only when the column varies and
        # actually contains |z| > 3 outliers.
        if (
            central_df[col].std() > 0
            and np.sum(np.abs(stats.zscore(central_df[col])) > 3) > 0
        ):
            central_df[col] = winsorize(central_df[col], limits=[0.01, 0.01])

    # --- Feature engineering (ratios / means; +1 avoids division by zero) ---
    tenure = central_df["annees_dans_l_entreprise"] + 1
    central_df["revenu_par_anciennete"] = central_df["revenu_mensuel"] / tenure
    central_df["experience_par_anciennete"] = (
        central_df["annee_experience_totale"] / tenure
    )
    central_df["satisfaction_moyenne"] = central_df[
        [
            "satisfaction_employee_environnement",
            "satisfaction_employee_nature_travail",
            "satisfaction_employee_equipe",
            "satisfaction_employee_equilibre_pro_perso",
        ]
    ].mean(axis=1)
    # Other candidates can be added here if SHAP flags them as relevant.
    central_df["promo_par_anciennete"] = (
        central_df["annees_depuis_la_derniere_promotion"] / tenure
    )

    # --- Encoding: one-hot for unordered, ordinal for ordered categoricals --
    cat_non_ord = ["genre", "statut_marital", "departement", "poste", "domaine_etude"]
    onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_non_ord = pd.DataFrame(
        onehot.fit_transform(central_df[cat_non_ord]),
        columns=onehot.get_feature_names_out(cat_non_ord),
        index=central_df.index,  # keep rows aligned for the concat below
    )
    cat_ord = ["frequence_deplacement"]  # Aucun=0, Occasionnel=1, Frequent=2
    ordinal = OrdinalEncoder(categories=[["Aucun", "Occasionnel", "Frequent"]])
    encoded_ord = pd.DataFrame(
        ordinal.fit_transform(central_df[cat_ord]),
        columns=cat_ord,
        index=central_df.index,
    )

    # --- Assembly (target column included until the X/y split) ---------------
    engineered_cols = [
        "revenu_par_anciennete",
        "experience_par_anciennete",
        "satisfaction_moyenne",
        "promo_par_anciennete",
    ]
    df_engineered = pd.concat(
        [
            central_df[quantitative_cols],
            central_df[engineered_cols],
            encoded_non_ord,
            encoded_ord,
            central_df["a_quitte_l_entreprise"],
        ],
        axis=1,
    )

    # --- Scaling (quantitative + engineered + ordinal) -----------------------
    # NOTE(review): the scaler is fitted on the full dataset; if a train/test
    # split happens downstream this leaks test-set statistics into training —
    # consider fitting on the training split only.
    cols_to_scale = quantitative_cols.tolist() + engineered_cols + cat_ord
    scaler = StandardScaler()
    df_engineered[cols_to_scale] = scaler.fit_transform(df_engineered[cols_to_scale])

    # --- X / y split ---------------------------------------------------------
    y = (df_engineered["a_quitte_l_entreprise"] == "Oui").astype(int)
    X = df_engineered.drop("a_quitte_l_entreprise", axis=1)
    return X, y, scaler, onehot, ordinal  # transformers returned for API inference