Spaces:

Hodfa71
/

argus-mlops

Sleeping

App Files Files Community

argus-mlops / src /models /trainer.py

hodfa840

Fix scroll reset for HF Spaces double-iframe context

1aa566a about 1 month ago

raw

history blame contribute delete

4.5 kB

	"""Model training with MLflow experiment tracking."""
	from __future__ import annotations

	import time
	from pathlib import Path
	from typing import Optional

	import mlflow
	import mlflow.sklearn
	import numpy as np
	import pandas as pd
	from sklearn.ensemble import GradientBoostingRegressor
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

	from src.data.preprocessing import Preprocessor
	from src.utils.config import settings, resolve
	from src.utils.logging_config import get_logger

	# Module-level logger obtained from the project's get_logger helper, keyed by
	# this module's import name.
	log = get_logger(__name__)


	class ModelTrainer:
	    """Train and evaluate a GradientBoosting model with MLflow tracking.

	    Creating an instance builds a ``Preprocessor`` and points MLflow at the
	    tracking location and experiment named in ``settings.mlflow``.
	    """

	    def __init__(self) -> None:
	        """Create the preprocessor and configure MLflow tracking."""
	        self.preprocessor = Preprocessor()
	        self._setup_mlflow()

	    def train(
	        self,
	        df: pd.DataFrame,
	        run_name: Optional[str] = None,
	        tags: Optional[dict] = None,
	    ) -> dict:
	        """Train a new model on `df` inside a fresh MLflow run.

	        Args:
	            df: Data handed to ``self.preprocessor.transform_with_target``;
	                it must yield a target (``y``).
	            run_name: MLflow run name. Defaults to ``train_<unix seconds>``.
	            tags: Tags set on the MLflow run. ``None`` sets no tags.

	        Returns:
	            A dict with: model, metrics, feature_importances, run_id,
	            artifact_uri, preprocessor.

	        Raises:
	            ValueError: If the preprocessor returns no target for `df`.
	        """
	        X, y = self.preprocessor.transform_with_target(df)
	        if y is None:
	            raise ValueError("Training DataFrame must contain the target column.")

	        hp = settings.model.hyperparams
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y,
	            test_size=settings.model.evaluation.test_size,
	            random_state=hp.random_state,
	        )

	        model = GradientBoostingRegressor(
	            n_estimators=hp.n_estimators,
	            max_depth=hp.max_depth,
	            learning_rate=hp.learning_rate,
	            subsample=hp.subsample,
	            min_samples_split=hp.min_samples_split,
	            random_state=hp.random_state,
	        )

	        with mlflow.start_run(run_name=run_name or f"train_{int(time.time())}") as run:
	            mlflow.set_tags(tags or {})
	            # Log every hyperparameter the estimator is built with, so a run
	            # can be reproduced from its recorded params alone.
	            mlflow.log_params({
	                "n_estimators": hp.n_estimators,
	                "max_depth": hp.max_depth,
	                "learning_rate": hp.learning_rate,
	                "subsample": hp.subsample,
	                "min_samples_split": hp.min_samples_split,
	                "random_state": hp.random_state,
	                "train_samples": len(X_train),
	                "test_samples": len(X_test),
	            })

	            log.info("Training GradientBoosting on %d samples ...", len(X_train))
	            t0 = time.perf_counter()
	            model.fit(X_train, y_train)
	            train_time = time.perf_counter() - t0

	            # Metrics are computed on the held-out split only.
	            metrics = self._evaluate(model, X_test, y_test)
	            metrics["train_time_sec"] = round(train_time, 2)

	            mlflow.log_metrics(metrics)
	            mlflow.sklearn.log_model(model, artifact_path="model")

	            fi = self._feature_importances(model)
	            # The JSON is written to a fixed project path first and then
	            # attached to the run as an artifact.
	            fi_path = self._write_feature_importances(
	                fi, resolve("data/logs/feature_importances.json")
	            )
	            mlflow.log_artifact(str(fi_path))

	            run_id = run.info.run_id
	            artifact_uri = mlflow.get_artifact_uri("model")

	        log.info(
	            "Training complete — RMSE=%.4f, MAE=%.4f, R2=%.4f (run_id=%s)",
	            metrics["rmse"], metrics["mae"], metrics["r2"], run_id,
	        )

	        return {
	            "model": model,
	            "metrics": metrics,
	            "feature_importances": fi,
	            "run_id": run_id,
	            "artifact_uri": artifact_uri,
	            "preprocessor": self.preprocessor,
	        }

	    @staticmethod
	    def _write_feature_importances(fi: pd.DataFrame, destination) -> Path:
	        """Write `fi` as a JSON list of records to `destination`.

	        The parent directory is created when missing; ``DataFrame.to_json``
	        does not create directories, so a fresh checkout without the log
	        folder would otherwise fail after the model is already trained.

	        Args:
	            fi: Feature-importance table to serialise.
	            destination: Target file, as a ``str`` or path-like object.

	        Returns:
	            The destination as a ``Path``.
	        """
	        target = Path(destination)
	        target.parent.mkdir(parents=True, exist_ok=True)
	        fi.to_json(target, orient="records", indent=2)
	        return target

	    def _evaluate(
	        self,
	        model: GradientBoostingRegressor,
	        X: pd.DataFrame,
	        y: pd.Series,
	    ) -> dict:
	        """Score `model` on `X`/`y`.

	        Returns:
	            A dict with ``rmse``, ``mae`` and ``r2``, each rounded to four
	            decimal places.
	        """
	        y_pred = model.predict(X)
	        rmse = float(np.sqrt(mean_squared_error(y, y_pred)))
	        mae = float(mean_absolute_error(y, y_pred))
	        r2 = float(r2_score(y, y_pred))
	        return {"rmse": round(rmse, 4), "mae": round(mae, 4), "r2": round(r2, 4)}

	    def _feature_importances(self, model: GradientBoostingRegressor) -> pd.DataFrame:
	        """Pair the preprocessor's feature names with the model's importances.

	        Returns:
	            A DataFrame with ``feature`` and ``importance`` columns, sorted by
	            importance in descending order with a fresh 0..n-1 index.
	        """
	        names = self.preprocessor.feature_names()
	        importances = model.feature_importances_
	        return (
	            pd.DataFrame({"feature": names, "importance": importances})
	            .sort_values("importance", ascending=False)
	            .reset_index(drop=True)
	        )

	    def _setup_mlflow(self) -> None:
	        """Point MLflow at the configured tracking location and experiment."""
	        # NOTE(review): .as_uri() implies resolve() returns a pathlib-style
	        # object; pathlib raises ValueError from as_uri() on relative paths,
	        # so this presumably relies on resolve() being absolute - confirm.
	        tracking_uri = resolve(settings.mlflow.tracking_uri)
	        mlflow.set_tracking_uri(tracking_uri.as_uri())
	        mlflow.set_experiment(settings.mlflow.experiment_name)
	        log.info("MLflow tracking -> %s", tracking_uri)