ASI-Engineer committed on
Commit
de1102d
·
verified ·
1 Parent(s): e7134b7

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. .flake8 +2 -3
  2. ml_model/preprocess.py +127 -0
  3. ml_model/train_model.py +58 -0
.flake8 CHANGED
@@ -1,5 +1,6 @@
1
  [flake8]
2
  # Exclude dirs pour ignorer libs tierces et noise (venv, git, etc.)
 
3
  exclude =
4
  .venv,
5
  .git,
@@ -11,6 +12,4 @@ exclude =
11
  build,
12
  dist
13
  # Max line pour compat Black (default 88 vs PEP8 79)
14
- max-line-length = 88
15
- # Ignore E501 si trop strict (optionnel, retire si tu veux fixer lines)
16
- ignore = E501
 
1
  [flake8]
2
  # Exclude dirs pour ignorer libs tierces et noise (venv, git, etc.)
3
+ ignore = W503, E501
4
  exclude =
5
  .venv,
6
  .git,
 
12
  build,
13
  dist
14
  # Max line pour compat Black (default 88 vs PEP8 79)
15
+ max-line-length = 88
 
 
ml_model/preprocess.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
4
+ from scipy.stats.mstats import winsorize
5
+ from scipy import stats
6
+
7
+
8
def load_raw_data(
    sondage_path="../raw_data/extrait_sondage.csv",
    eval_path="../raw_data/extrait_eval.csv",
    sirh_path="../raw_data/extrait_sirh.csv",
):
    """Load the three raw CSV extracts and merge them into one DataFrame.

    The survey, evaluation and HR extracts are read with ``pandas.read_csv``,
    a common integer ``employee_id`` key is derived for the survey and
    evaluation frames, and the three frames are inner-joined (the HR extract
    is joined on its ``id_employee`` column).

    Args:
        sondage_path: Path to the survey CSV (must contain ``code_sondage``).
        eval_path: Path to the evaluation CSV (must contain ``eval_number``
            and ``augementation_salaire_precedente``).
        sirh_path: Path to the HR CSV (must contain ``id_employee``).

    Returns:
        The merged DataFrame with the identifier columns (``code_sondage``,
        ``eval_number``, ``id_employee``, ``employee_id``) removed.

    Raises:
        ValueError: If a string value cannot be parsed as a number after the
            ``%`` sign or the ``E_`` prefix has been stripped.
    """

    def _parse_percent(value):
        # Accept "11 %", "11%" and " 11 % " alike; non-string values
        # (already numeric, or NaN) are passed through untouched.
        if isinstance(value, str):
            return float(value.replace("%", "").strip())
        return value

    def _parse_eval_id(value):
        # Evaluation numbers look like "E_42"; keep only the integer part.
        if isinstance(value, str):
            return int(value.replace("E_", ""))
        return value

    def _parse_survey_id(value):
        # Anything that is neither a string nor an int (e.g. NaN) has no key.
        return int(value) if isinstance(value, (str, int)) else None

    sondage = pd.read_csv(sondage_path)
    eval_df = pd.read_csv(eval_path)
    sirh = pd.read_csv(sirh_path)

    # Initial cleaning: normalise the percentage column and build join keys.
    eval_df["augementation_salaire_precedente"] = eval_df[
        "augementation_salaire_precedente"
    ].apply(_parse_percent)
    eval_df["employee_id"] = eval_df["eval_number"].apply(_parse_eval_id)
    sondage["employee_id"] = sondage["code_sondage"].apply(_parse_survey_id)

    # Inner joins: only employees present in all three extracts are kept.
    central_df = pd.merge(sondage, eval_df, on="employee_id", how="inner")
    central_df = pd.merge(
        central_df, sirh, left_on="employee_id", right_on="id_employee", how="inner"
    )

    # Identifier columns carry no predictive signal; drop those that exist.
    return central_df.drop(
        columns=["code_sondage", "eval_number", "id_employee", "employee_id"],
        errors="ignore",
    )
39
+
40
+
41
def preprocess_data(raw_data_paths=None):
    """Run the full preprocessing pipeline and return model-ready data.

    Steps, in order: load the data, remove duplicate rows, drop the
    ``ayant_enfants`` column when it is constant, winsorize (1 % per tail)
    every numeric column that contains a value with |z-score| > 3, derive
    ratio/mean features, one-hot encode the unordered categoricals, ordinal
    encode ``frequence_deplacement`` and standard-scale the numeric, derived
    and ordinal features. No PCA is applied, to keep features interpretable.

    Args:
        raw_data_paths: Optional mapping of keyword arguments forwarded to
            ``load_raw_data``. When falsy, the pre-merged file
            ``../output/central_df.csv`` is read instead.

    Returns:
        A 5-tuple ``(X, y, scaler, onehot, ordinal)`` where ``X`` is the
        feature DataFrame, ``y`` is 1 where ``a_quitte_l_entreprise`` equals
        ``"Oui"`` and 0 otherwise, and the last three items are the fitted
        ``StandardScaler``, ``OneHotEncoder`` and ``OrdinalEncoder``
        (returned so they can be reused at inference time).

    NOTE(review): the scaler and encoders are fitted on the whole dataset,
    i.e. before any train/test split performed by the caller, so test-set
    statistics leak into the features. Fixing that requires an interface
    change and is left to a follow-up.
    """
    target_col = "a_quitte_l_entreprise"

    if raw_data_paths:
        central_df = load_raw_data(**raw_data_paths)
    else:
        central_df = pd.read_csv("../output/central_df.csv")  # Pre-merged data

    # --- Cleaning -----------------------------------------------------------
    # Reset the index after de-duplication: the encoded frames built below
    # are aligned on the index by pd.concat, so gaps would misalign rows.
    central_df = central_df.drop_duplicates().reset_index(drop=True)

    # A column with a single distinct value carries no information.
    if (
        "ayant_enfants" in central_df.columns
        and central_df["ayant_enfants"].nunique(dropna=False) == 1
    ):
        central_df = central_df.drop(columns=["ayant_enfants"])

    quantitative_cols = central_df.select_dtypes(
        include=["int64", "float64"]
    ).columns.tolist()
    for col in quantitative_cols:
        values = central_df[col]
        # Only touch non-constant columns that actually contain outliers.
        if values.std() > 0 and np.any(np.abs(stats.zscore(values)) > 3):
            central_df[col] = winsorize(values, limits=[0.01, 0.01])

    # --- Feature engineering --------------------------------------------------
    # The "+ 1" avoids a division by zero for employees with zero seniority.
    tenure = central_df["annees_dans_l_entreprise"] + 1
    central_df["revenu_par_anciennete"] = central_df["revenu_mensuel"] / tenure
    central_df["experience_par_anciennete"] = (
        central_df["annee_experience_totale"] / tenure
    )
    central_df["satisfaction_moyenne"] = central_df[
        [
            "satisfaction_employee_environnement",
            "satisfaction_employee_nature_travail",
            "satisfaction_employee_equipe",
            "satisfaction_employee_equilibre_pro_perso",
        ]
    ].mean(axis=1)
    central_df["promo_par_anciennete"] = (
        central_df["annees_depuis_la_derniere_promotion"] / tenure
    )
    engineered_cols = [
        "revenu_par_anciennete",
        "experience_par_anciennete",
        "satisfaction_moyenne",
        "promo_par_anciennete",
    ]

    # --- Encoding -------------------------------------------------------------
    cat_non_ord = ["genre", "statut_marital", "departement", "poste", "domaine_etude"]
    onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_non_ord = pd.DataFrame(
        onehot.fit_transform(central_df[cat_non_ord]),
        columns=onehot.get_feature_names_out(cat_non_ord),
        index=central_df.index,
    )
    cat_ord = ["frequence_deplacement"]  # Aucun=0, Occasionnel=1, Frequent=2
    ordinal = OrdinalEncoder(categories=[["Aucun", "Occasionnel", "Frequent"]])
    encoded_ord = pd.DataFrame(
        ordinal.fit_transform(central_df[cat_ord]),
        columns=cat_ord,
        index=central_df.index,
    )

    # --- Assembly -------------------------------------------------------------
    # The derived columns were created after quantitative_cols was captured,
    # so they must be added explicitly. dict.fromkeys de-duplicates while
    # preserving order, in case the input file already contained them.
    numeric_features = list(dict.fromkeys(quantitative_cols + engineered_cols))
    df_engineered = pd.concat(
        [
            central_df[numeric_features],
            encoded_non_ord,
            encoded_ord,
            central_df[target_col],
        ],
        axis=1,
    )  # Target column included until the final X/y split

    # --- Scaling (numeric + ordinal + derived features) -----------------------
    cols_to_scale = list(dict.fromkeys(quantitative_cols + cat_ord + engineered_cols))
    scaler = StandardScaler()
    df_engineered[cols_to_scale] = scaler.fit_transform(df_engineered[cols_to_scale])

    # --- X / y split ----------------------------------------------------------
    y = (df_engineered[target_col] == "Oui").astype(int)
    X = df_engineered.drop(columns=target_col)
    return X, y, scaler, onehot, ordinal
ml_model/train_model.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.model_selection import train_test_split, RandomizedSearchCV
2
+ from sklearn.metrics import classification_report, confusion_matrix
3
+ from imblearn.over_sampling import SMOTE
4
+ from imblearn.pipeline import Pipeline
5
+ from xgboost import XGBClassifier
6
+ from scipy.stats import uniform, randint
7
+
8
+
9
def train_model(X, y):
    """Tune and train an XGBoost classifier with SMOTE oversampling.

    The data is split 80/20 (stratified on ``y``). A pipeline made of SMOTE
    followed by ``XGBClassifier`` is tuned with ``RandomizedSearchCV``
    (1000 sampled configurations, 5-fold CV, F1 scoring). SMOTE sits inside
    the pipeline so that it is applied to the training folds during
    cross-validation. The best pipeline is then evaluated on the held-out
    test set and the results are printed.

    Args:
        X: Feature matrix.
        y: Binary labels (0 = negative class, 1 = positive class).

    Returns:
        A 3-tuple ``(best_model, best_params, cv_f1)``: the refitted best
        pipeline, its hyper-parameters and its mean cross-validated F1 score.

    Raises:
        ValueError: If the training split contains no positive sample, since
            the class ratio used for ``scale_pos_weight`` is then undefined.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Class imbalance is measured on the training labels only, so the
    # held-out test labels do not influence the search space.
    n_negative = sum(1 for label in y_train if label == 0)
    n_positive = sum(1 for label in y_train if label == 1)
    if n_positive == 0:
        raise ValueError(
            "train_model requires at least one positive sample (label 1) "
            "in the training split."
        )
    ratio = n_negative / n_positive

    pipeline = Pipeline(
        [("sampler", SMOTE(random_state=42)), ("clf", XGBClassifier(random_state=42))]
    )

    # scipy's uniform(loc, scale) samples from [loc, loc + scale] and
    # randint(low, high) from the integers in [low, high).
    param_dist = {
        "clf__max_depth": randint(3, 15),
        "clf__n_estimators": randint(100, 1000),
        "clf__learning_rate": uniform(0.001, 0.5),
        "clf__subsample": uniform(0.4, 0.6),
        "clf__reg_alpha": uniform(0, 3),
        "clf__gamma": uniform(0, 10),
        "clf__colsample_bytree": uniform(0.5, 0.5),
        "clf__min_child_weight": randint(1, 15),
        # NOTE(review): this spans [1, 1 + ratio]; if the intended upper
        # bound is the class ratio itself, use uniform(1, ratio - 1).
        "clf__scale_pos_weight": uniform(1, ratio),
        "clf__tree_method": ["auto", "hist"],  # CPU
    }

    search = RandomizedSearchCV(
        pipeline,
        param_dist,
        n_iter=1000,
        cv=5,
        scoring="f1",
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    best_params = search.best_params_
    cv_f1 = search.best_score_

    # Held-out evaluation, printed for information only.
    y_pred = best_model.predict(X_test)
    print("Meilleurs params:", best_params)
    print("Meilleur CV F1:", cv_f1)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    return best_model, best_params, cv_f1