import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

from src.models.prepare import prepare_data
from src.utils.mlflow_utils import setup_mlflow


def train_models(df: pd.DataFrame) -> dict:
    """Train several regressors on *df* and record each one as an MLflow run.

    The frame is handed to ``prepare_data``, which supplies the train/test
    split and a preprocessor. Each candidate estimator is wrapped in a
    ``Pipeline`` behind that preprocessor, fitted on the training split,
    scored on the test split, and logged to MLflow (one run per estimator,
    named after it) together with the fitted pipeline.

    Args:
        df: Input data passed unchanged to ``prepare_data``.

    Returns:
        A dict keyed by model name; each value is
        ``{"RMSE": <root mean squared error>, "R2": <r2 score>}`` computed
        on the test split.
    """
    X_train, X_test, y_train, y_test, preprocessor = prepare_data(df)

    models = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "RandomForest": RandomForestRegressor(random_state=42, n_jobs=-1),
    }

    # Presumably selects/creates the MLflow experiment the runs below are
    # recorded under -- confirm against src.utils.mlflow_utils.
    setup_mlflow("AQI_Prediction")

    results = {}

    for name, model in models.items():
        with mlflow.start_run(run_name=name):
            pipeline = Pipeline([
                ("preprocessor", preprocessor),
                ("model", model),
            ])

            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

            # mean_squared_error returns the MSE; take the square root so the
            # value reported as "rmse"/"RMSE" really is the root mean squared
            # error. ``** 0.5`` is used instead of ``squared=False`` or
            # ``root_mean_squared_error`` so this works on any sklearn version.
            rmse = float(mean_squared_error(y_test, y_pred)) ** 0.5
            r2 = r2_score(y_test, y_pred)

            mlflow.log_param("model", name)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2_score", r2)
            mlflow.sklearn.log_model(pipeline, "model")

            results[name] = {"RMSE": rmse, "R2": r2}
            print(f"{name} → RMSE: {rmse:.2f}, R2: {r2:.4f}")

    return results