Spaces:

ASI-Engineer
/

oc_p5-dev

Sleeping

App Files Files Community

ASI-Engineer committed on Dec 21, 2025

Commit

de1102d

verified ·

1 Parent(s): e7134b7

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

.flake8 +2 -3
ml_model/preprocess.py +127 -0
ml_model/train_model.py +58 -0

.flake8 CHANGED Viewed

@@ -1,5 +1,6 @@
 [flake8]
 # Exclude dirs pour ignorer libs tierces et noise (venv, git, etc.)
 exclude =
     .venv,
     .git,
@@ -11,6 +12,4 @@ exclude =
     build,
     dist
 # Max line pour compat Black (default 88 vs PEP8 79)
-max-line-length = 88
-# Ignore E501 si trop strict (optionnel, retire si tu veux fixer lines)
-ignore = E501

 [flake8]
 # Exclude dirs pour ignorer libs tierces et noise (venv, git, etc.)
+ignore = W503, E501
 exclude =
     .venv,
     .git,
     build,
     dist
 # Max line pour compat Black (default 88 vs PEP8 79)
+max-line-length = 88

ml_model/preprocess.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
+from scipy.stats.mstats import winsorize
+from scipy import stats
+def load_raw_data(
+    sondage_path="../raw_data/extrait_sondage.csv",
+    eval_path="../raw_data/extrait_eval.csv",
+    sirh_path="../raw_data/extrait_sirh.csv",
+):
+    """Charge et merge raw data (comme exploration.py/preparation.py)."""
+    sondage = pd.read_csv(sondage_path)
+    eval_df = pd.read_csv(eval_path)
+    sirh = pd.read_csv(sirh_path)
+    # Nettoyage initial (comme exploration.py)
+    eval_df["augementation_salaire_precedente"] = eval_df[
+        "augementation_salaire_precedente"
+    ].apply(lambda x: float(str(x).replace(" %", "")) if isinstance(x, str) else x)
+    eval_df["employee_id"] = eval_df["eval_number"].apply(
+        lambda x: int(str(x).replace("E_", "")) if isinstance(x, str) else x
+    )
+    sondage["employee_id"] = sondage["code_sondage"].apply(
+        lambda x: int(x) if isinstance(x, (str, int)) else None
+    )
+    # Merge (assume sur employee_id ; ajuste si clé diff.)
+    central_df = pd.merge(sondage, eval_df, on="employee_id", how="inner")
+    central_df = pd.merge(
+        central_df, sirh, left_on="employee_id", right_on="id_employee", how="inner"
+    )
+    central_df.drop(
+        ["code_sondage", "eval_number", "id_employee", "employee_id"],
+        axis=1,
+        inplace=True,
+        errors="ignore",
+    )
+    return central_df
+def preprocess_data(raw_data_paths=None):
+    """
+    Pipeline complet : Nettoyage, engineering, encoding, scaling (de preparation/improvement.py).
+    Retourne X (features), y (binaire), scaler (pour inférence API).
+    Choix : Sans PCA pour interprétabilité ; winsorize outliers (1%) ; OneHot cat. non-ordonnées.
+    """
+    if raw_data_paths:
+        central_df = load_raw_data(**raw_data_paths)
+    else:
+        central_df = pd.read_csv("../output/central_df.csv")  # Si pré-fusionné
+    # Nettoyage (duplicatas, constantes, outliers)
+    central_df.drop_duplicates(inplace=True)
+    columns_to_drop = (
+        ["ayant_enfants"] if len(central_df["ayant_enfants"].unique()) == 1 else []
+    )  # Constante
+    central_df.drop(columns=columns_to_drop, inplace=True)
+    quantitative_cols = central_df.select_dtypes(include=["int64", "float64"]).columns
+    for col in quantitative_cols:
+        if (
+            central_df[col].std() > 0
+            and np.sum(np.abs(stats.zscore(central_df[col])) > 3) > 0
+        ):
+            central_df[col] = winsorize(central_df[col], limits=[0.01, 0.01])
+    # Engineering (comme improvement.py : ratios, moyennes ; +1 évite div0)
+    central_df["revenu_par_anciennete"] = central_df["revenu_mensuel"] / (
+        central_df["annees_dans_l_entreprise"] + 1
+    )
+    central_df["experience_par_anciennete"] = central_df["annee_experience_totale"] / (
+        central_df["annees_dans_l_entreprise"] + 1
+    )
+    central_df["satisfaction_moyenne"] = central_df[
+        [
+            "satisfaction_employee_environnement",
+            "satisfaction_employee_nature_travail",
+            "satisfaction_employee_equipe",
+            "satisfaction_employee_equilibre_pro_perso",
+        ]
+    ].mean(axis=1)
+    # Autres (ajoute si pertinents via SHAP : e.g., 'promo_par_anciennete')
+    central_df["promo_par_anciennete"] = central_df[
+        "annees_depuis_la_derniere_promotion"
+    ] / (central_df["annees_dans_l_entreprise"] + 1)
+    # Encoding (catégorielles : OneHot non-ord., Ordinal ord.)
+    cat_non_ord = ["genre", "statut_marital", "departement", "poste", "domaine_etude"]
+    onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
+    encoded_non_ord = pd.DataFrame(
+        onehot.fit_transform(central_df[cat_non_ord]),
+        columns=onehot.get_feature_names_out(cat_non_ord),
+    )
+    cat_ord = ["frequence_deplacement"]  # Ordinal : Aucun=0, Occasionnel=1, Frequent=2
+    ordinal = OrdinalEncoder(categories=[["Aucun", "Occasionnel", "Frequent"]])
+    encoded_ord = pd.DataFrame(
+        ordinal.fit_transform(central_df[cat_ord]), columns=cat_ord
+    )
+    # Assemblage
+    df_engineered = pd.concat(
+        [
+            central_df[quantitative_cols],
+            encoded_non_ord,
+            encoded_ord,
+            central_df["a_quitte_l_entreprise"],
+        ],
+        axis=1,
+    )  # Inclut cible
+    # Scaling (quantitatives + ordinal)
+    cols_to_scale = (
+        quantitative_cols.tolist()
+        + cat_ord
+        + [
+            "revenu_par_anciennete",
+            "experience_par_anciennete",
+            "satisfaction_moyenne",
+            "promo_par_anciennete",
+        ]
+    )
+    scaler = StandardScaler()
+    df_engineered[cols_to_scale] = scaler.fit_transform(df_engineered[cols_to_scale])
+    # Séparation X/y
+    y = (df_engineered["a_quitte_l_entreprise"] == "Oui").astype(int)
+    X = df_engineered.drop("a_quitte_l_entreprise", axis=1)
+    return X, y, scaler, onehot, ordinal  # Retourne encoders/scaler pour inférence API

ml_model/train_model.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from sklearn.model_selection import train_test_split, RandomizedSearchCV
+from sklearn.metrics import classification_report, confusion_matrix
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline
+from xgboost import XGBClassifier
+from scipy.stats import uniform, randint
+def train_model(X, y):
+    """
+    Train/tune XGBoost avec SMOTE (de optimisation.py/improvement.py).
+    Retourne best_model, best_params, cv_f1.
+    Choix : RandomizedSearch (efficace large grille) ; SMOTE in-pipeline (gère CV) ; F1 scoring (déséquilibre).
+    """
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42, stratify=y
+    )
+    ratio = sum(y == 0) / sum(y == 1)
+    pipeline = Pipeline(
+        [("sampler", SMOTE(random_state=42)), ("clf", XGBClassifier(random_state=42))]
+    )
+    param_dist = {
+        "clf__max_depth": randint(3, 15),
+        "clf__n_estimators": randint(100, 1000),
+        "clf__learning_rate": uniform(0.001, 0.5),
+        "clf__subsample": uniform(0.4, 0.6),
+        "clf__reg_alpha": uniform(0, 3),
+        "clf__gamma": uniform(0, 10),
+        "clf__colsample_bytree": uniform(0.5, 0.5),
+        "clf__min_child_weight": randint(1, 15),
+        "clf__scale_pos_weight": uniform(1, ratio),
+        "clf__tree_method": ["auto", "hist"],  # CPU
+    }
+    random = RandomizedSearchCV(
+        pipeline,
+        param_dist,
+        n_iter=1000,
+        cv=5,
+        scoring="f1",
+        n_jobs=-1,
+        random_state=42,
+    )
+    random.fit(X_train, y_train)
+    best_model = random.best_estimator_
+    best_params = random.best_params_
+    cv_f1 = random.best_score_
+    # Éval test (pédagogique)
+    y_pred = best_model.predict(X_test)
+    print("Meilleurs params:", best_params)
+    print("Meilleur CV F1:", cv_f1)
+    print(classification_report(y_test, y_pred))
+    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
+    return best_model, best_params, cv_f1