Spaces:

ASI-Engineer
/

oc_p5-dev

Sleeping

App Files Files Community

oc_p5-dev / tests /test_mlflow_quick.py

ASI-Engineer

Upload folder using huggingface_hub

aac75d5 verified 4 months ago

raw

history blame

4.79 kB

	#!/usr/bin/env python3
	"""
	Test rapide MLflow avec n_iter=10 au lieu de 1000.
	"""
	import os

	import mlflow
	from sklearn.model_selection import RandomizedSearchCV

	from ml_model.preprocess import preprocess_data
	from ml_model.train_model import train_model

	# Configure MLflow pour utiliser SQLite (nécessaire pour Model Registry)
	mlflow.set_tracking_uri("sqlite:///mlflow.db")

	# Patch temporaire pour test rapide
	import ml_model.train_model as train_module

	original_train = train_module.train_model


	def quick_train(X, y):
	"""Version rapide avec n_iter=10"""
	import mlflow
	import mlflow.sklearn
	from imblearn.over_sampling import SMOTE
	from imblearn.pipeline import Pipeline as ImbPipeline
	from scipy.stats import randint, uniform
	from sklearn.metrics import classification_report, confusion_matrix
	from sklearn.model_selection import RandomizedSearchCV, train_test_split
	from xgboost import XGBClassifier

	X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.2, random_state=42, stratify=y
	)
	ratio = sum(y == 0) / sum(y == 1)

	pipeline = ImbPipeline(
	[("sampler", SMOTE(random_state=42)), ("clf", XGBClassifier(random_state=42))]
	)
	param_dist = {
	"clf__max_depth": randint(3, 15),
	"clf__n_estimators": randint(100, 1000),
	"clf__learning_rate": uniform(0.001, 0.5),
	"clf__subsample": uniform(0.4, 0.6),
	"clf__reg_alpha": uniform(0, 3),
	"clf__gamma": uniform(0, 10),
	"clf__colsample_bytree": uniform(0.5, 0.5),
	"clf__min_child_weight": randint(1, 15),
	"clf__scale_pos_weight": uniform(1, ratio),
	"clf__tree_method": ["auto", "hist"],
	}

	random = RandomizedSearchCV(
	pipeline,
	param_dist,
	n_iter=10, # 🚀 Test rapide !
	cv=3, # Réduit aussi le CV
	scoring="f1",
	n_jobs=-1,
	random_state=42,
	)

	with mlflow.start_run(run_name="XGBoost_Quick_Test"):
	random.fit(X_train, y_train)

	best_model = random.best_estimator_ # type: ignore[assignment]
	best_params = random.best_params_
	cv_f1 = random.best_score_

	mlflow.log_params(best_params)
	mlflow.log_metric("cv_f1", cv_f1)

	y_pred = best_model.predict(X_test) # type: ignore[attr-defined]
	report = classification_report(y_test, y_pred, output_dict=True) # type: ignore[arg-type]

	mlflow.log_metric("test_precision", float(report["1"]["precision"])) # type: ignore[index]
	mlflow.log_metric("test_recall", float(report["1"]["recall"])) # type: ignore[index]
	mlflow.log_metric("test_f1", float(report["1"]["f1-score"])) # type: ignore[index]

	# Log model et enregistre dans Model Registry
	model_info = mlflow.sklearn.log_model(best_model, "model") # type: ignore[attr-defined]
	mlflow.register_model(
	model_uri=model_info.model_uri, name="XGBoost_Employee_Turnover"
	)

	print("Meilleurs params:", best_params)
	print("Meilleur CV F1:", cv_f1)
	print(classification_report(y_test, y_pred))
	print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

	return best_model, best_params, cv_f1


	if __name__ == "__main__":
	print("🔄 Chargement et prétraitement des données...")
	data_paths = {
	"sondage_path": "data/extrait_sondage.csv",
	"eval_path": "data/extrait_eval.csv",
	"sirh_path": "data/extrait_sirh.csv",
	}

	X, y, scaler, onehot, ordinal = preprocess_data(raw_data_paths=data_paths)
	print(f"✅ Données prétraitées :")
	print(f" X shape: {X.shape}, y shape: {y.shape}")
	print(f" Distribution y: {y.value_counts().to_dict()}")

	print("\n🚀 Lancement du test rapide (n_iter=10, cv=3)...\n")

	best_model, best_params, cv_f1 = quick_train(X, y)

	print(f"\n✅ Test terminé ! CV F1-score: {cv_f1:.4f}")

	print("\n📁 Vérification des artifacts MLflow dans ./mlruns :")
	if os.path.exists("./mlruns"):
	for root, dirs, files in os.walk("./mlruns"):
	level = root.replace("./mlruns", "").count(os.sep)
	if level < 3: # Limite la profondeur
	indent = " " * 2 * level
	print(f"{indent}{os.path.basename(root)}/")
	if level == 2: # Affiche fichiers dans les runs
	subindent = " " * 2 * (level + 1)
	for file in files[:3]:
	print(f"{subindent}{file}")
	if len(files) > 3:
	print(f"{subindent}... (+{len(files) - 3} fichiers)")

	print("\n💡 Pour visualiser les runs MLflow, exécutez :")
	print(" mlflow ui")
	print(" Puis ouvrez http://localhost:5000")