# NOTE(review): removed non-Python viewer artifacts that made this file
# unparseable (status lines "Spaces: / Running", "File size: 5,069 Bytes",
# a row of commit hashes, and a 1..126 line-number gutter).
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
def load_raw_data(
    sondage_path="../raw_data/extrait_sondage.csv",
    eval_path="../raw_data/extrait_eval.csv",
    sirh_path="../raw_data/extrait_sirh.csv",
):
    """Load and inner-merge the three raw extracts (survey, evaluation, HR).

    Parameters
    ----------
    sondage_path : str
        Survey CSV; key column ``code_sondage``.
    eval_path : str
        Evaluation CSV; key column ``eval_number`` (e.g. ``"E_12"``).
    sirh_path : str
        HR (SIRH) CSV; key column ``id_employee``.

    Returns
    -------
    pandas.DataFrame
        Merged frame with all technical key columns dropped.
    """
    sondage = pd.read_csv(sondage_path)
    eval_df = pd.read_csv(eval_path)
    sirh = pd.read_csv(sirh_path)

    # Initial cleaning: "12 %" -> 12.0 (non-string values pass through).
    eval_df["augementation_salaire_precedente"] = eval_df[
        "augementation_salaire_precedente"
    ].apply(lambda x: float(str(x).replace(" %", "")) if isinstance(x, str) else x)

    # "E_12" -> 12 ; non-string values pass through unchanged.
    eval_df["employee_id"] = eval_df["eval_number"].apply(
        lambda x: int(str(x).replace("E_", "")) if isinstance(x, str) else x
    )

    def _to_int(x):
        # BUG FIX: the previous `isinstance(x, (str, int))` check mapped
        # every key to None, because pandas hands numpy integer scalars to
        # apply() and `isinstance(np.int64(1), int)` is False — the inner
        # merges below then matched nothing. Convert via int() and only
        # fall back to None for genuinely non-convertible values.
        try:
            return int(x)
        except (TypeError, ValueError):
            return None

    sondage["employee_id"] = sondage["code_sondage"].apply(_to_int)

    # Merge on the shared employee id (inner: keep fully matched rows only).
    central_df = pd.merge(sondage, eval_df, on="employee_id", how="inner")
    central_df = pd.merge(
        central_df, sirh, left_on="employee_id", right_on="id_employee", how="inner"
    )
    # Drop technical key columns; errors="ignore" tolerates absent ones.
    central_df.drop(
        ["code_sondage", "eval_number", "id_employee", "employee_id"],
        axis=1,
        inplace=True,
        errors="ignore",
    )
    return central_df
def preprocess_data(raw_data_paths=None):
    """
    Full pipeline: cleaning, feature engineering, encoding, scaling.

    Parameters
    ----------
    raw_data_paths : dict | None
        Keyword arguments forwarded to ``load_raw_data``. When None/empty,
        the pre-merged frame is read from ``../output/central_df.csv``.

    Returns
    -------
    tuple
        ``(X, y, scaler, onehot, ordinal)`` — feature matrix, binary target
        (1 = "Oui" / left the company), and the fitted transformers so the
        inference API can reuse them.

    Design choices: no PCA (interpretability); winsorize outlier-bearing
    numeric columns at 1% per tail; one-hot for unordered categoricals.
    """
    if raw_data_paths:
        central_df = load_raw_data(**raw_data_paths)
    else:
        central_df = pd.read_csv("../output/central_df.csv")  # pre-merged file

    # --- Cleaning: duplicates, constant columns, outliers ---
    central_df.drop_duplicates(inplace=True)
    # BUG FIX: drop_duplicates leaves gaps in the index; the pd.concat
    # below would then misalign against the freshly RangeIndex-ed encoded
    # frames and produce spurious NaN rows. Re-index before anything else.
    central_df.reset_index(drop=True, inplace=True)

    # Drop the column only when present AND constant (KeyError-safe;
    # the original unconditionally indexed central_df["ayant_enfants"]).
    if (
        "ayant_enfants" in central_df.columns
        and central_df["ayant_enfants"].nunique(dropna=False) == 1
    ):
        central_df.drop(columns=["ayant_enfants"], inplace=True)

    quantitative_cols = central_df.select_dtypes(include=["int64", "float64"]).columns
    for col in quantitative_cols:
        # Winsorize (1% each tail) only columns that actually contain
        # |z| > 3 outliers; std > 0 guards zscore against a zero divisor.
        if (
            central_df[col].std() > 0
            and np.sum(np.abs(stats.zscore(central_df[col])) > 3) > 0
        ):
            central_df[col] = winsorize(central_df[col], limits=[0.01, 0.01])

    # --- Engineering: ratios and means (+1 avoids division by zero) ---
    central_df["revenu_par_anciennete"] = central_df["revenu_mensuel"] / (
        central_df["annees_dans_l_entreprise"] + 1
    )
    central_df["experience_par_anciennete"] = central_df["annee_experience_totale"] / (
        central_df["annees_dans_l_entreprise"] + 1
    )
    central_df["satisfaction_moyenne"] = central_df[
        [
            "satisfaction_employee_environnement",
            "satisfaction_employee_nature_travail",
            "satisfaction_employee_equipe",
            "satisfaction_employee_equilibre_pro_perso",
        ]
    ].mean(axis=1)
    # Add more here if SHAP shows them useful (e.g. promo_par_anciennete).
    central_df["promo_par_anciennete"] = central_df[
        "annees_depuis_la_derniere_promotion"
    ] / (central_df["annees_dans_l_entreprise"] + 1)

    # --- Encoding: one-hot for unordered categoricals, ordinal otherwise ---
    cat_non_ord = ["genre", "statut_marital", "departement", "poste", "domaine_etude"]
    onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_non_ord = pd.DataFrame(
        onehot.fit_transform(central_df[cat_non_ord]),
        columns=onehot.get_feature_names_out(cat_non_ord),
        index=central_df.index,  # keep row alignment for the concat below
    )
    cat_ord = ["frequence_deplacement"]  # Ordinal: Aucun=0, Occasionnel=1, Frequent=2
    ordinal = OrdinalEncoder(categories=[["Aucun", "Occasionnel", "Frequent"]])
    encoded_ord = pd.DataFrame(
        ordinal.fit_transform(central_df[cat_ord]),
        columns=cat_ord,
        index=central_df.index,
    )

    # --- Assembly (target column included, split off at the end) ---
    engineered_cols = [
        "revenu_par_anciennete",
        "experience_par_anciennete",
        "satisfaction_moyenne",
        "promo_par_anciennete",
    ]
    df_engineered = pd.concat(
        [
            central_df[quantitative_cols],
            central_df[engineered_cols],
            encoded_non_ord,
            encoded_ord,
            central_df["a_quitte_l_entreprise"],
        ],
        axis=1,
    )

    # --- Scaling: quantitative + engineered + ordinal-encoded columns ---
    cols_to_scale = quantitative_cols.tolist() + engineered_cols + cat_ord
    scaler = StandardScaler()
    df_engineered[cols_to_scale] = scaler.fit_transform(df_engineered[cols_to_scale])

    # --- X / y split (binary target: 1 = left the company) ---
    y = (df_engineered["a_quitte_l_entreprise"] == "Oui").astype(int)
    X = df_engineered.drop("a_quitte_l_entreprise", axis=1)
    return X, y, scaler, onehot, ordinal  # transformers returned for API inference