Spaces:

mboukabous
/

train_regression

Sleeping

File size: 8,325 Bytes

829e3ac

"""
This script trains regression models using scikit-learn.
It includes data loading, preprocessing, optional log transformation,
hyperparameter tuning, model evaluation, and saving of models, metrics,
and visualizations.

Usage:
    python train_regression_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
                                     --target_variable TARGET_VARIABLE

Optional arguments:
    --test_size TEST_SIZE
    --random_state RANDOM_STATE
    --log_transform
    --cv_folds CV_FOLDS
    --scoring_metric SCORING_METRIC
    --model_path MODEL_PATH
    --results_path RESULTS_PATH
    --visualize
    --drop_columns COLUMN_NAMES   (comma-separated list, e.g. Id,Name)

Example:
    python train_regression_model.py --model_module linear_regression
                                     --data_path data/house_prices/train.csv
                                     --target_variable SalePrice --drop_columns Id
                                     --log_transform --visualize
"""

import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score, mean_absolute_error
import joblib
from timeit import default_timer as timer

def _save_and_show(fig, output_path):
    """Write ``fig`` to ``output_path``, display it, then release it."""
    fig.savefig(output_path)
    plt.show()
    # pyplot keeps a reference to every figure it created until it is closed;
    # closing here stops figures piling up when show() does not block.
    plt.close(fig)


def main(args):
    """Train, evaluate and persist a regression model described by ``args``.

    Steps, in order: load the CSV, drop the requested columns, split into
    train/test sets, optionally log-transform the target, run hyperparameter
    tuning, score the tuned model on the test set, and write the model,
    a metrics CSV and (optionally) plots to disk.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``model_module``, ``data_path``, ``target_variable``,
            ``drop_columns``, ``test_size``, ``random_state``,
            ``log_transform``, ``cv_folds``, ``scoring_metric``,
            ``model_path``, ``results_path`` and ``visualize``.
            ``model_path`` and ``results_path`` are overwritten with their
            defaults when falsy.

    Raises:
        KeyError: If the target column is absent after dropping columns, or
            a column named in ``drop_columns`` does not exist.
        ValueError: If the target is not numeric, or ``log_transform`` is
            requested while the target contains values <= -1.
    """
    # Change to the root directory of the project (the parent of this file's
    # directory). Every relative path used below, including args.data_path,
    # is therefore resolved against the project root.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning and the model modules. These are
    # project-local, so they can only be imported once sys.path is patched.
    from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
    model_module_path = f"models.supervised.regression.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Get the model estimator, parameters grid, and the scoring metric.
    # Precedence: CLI flag, then the module's `default_scoring`, then RMSE.
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(
        model_module, 'default_scoring', 'neg_root_mean_squared_error')
    model_name = estimator.__class__.__name__

    # Set default paths if not provided
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset
    df = pd.read_csv(args.data_path)

    # Drop specified columns. Empty tokens (e.g. a trailing comma) are ignored
    # and a name is stripped of surrounding whitespace only when the raw token
    # is not itself a column, so "Id, Name" works and exact names still win.
    if args.drop_columns:
        requested = [name for name in args.drop_columns.split(',') if name.strip()]
        columns_to_drop = [name if name in df.columns else name.strip()
                           for name in requested]
        df = df.drop(columns=columns_to_drop)

    # Define target variable and features
    target_variable = args.target_variable
    if target_variable not in df.columns:
        raise KeyError(
            f"Target variable '{target_variable}' was not found in the dataset "
            f"columns (after dropping requested columns).")
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # Ensure target variable is numeric
    if not np.issubdtype(y.dtype, np.number):
        raise ValueError(f"The target variable '{target_variable}' must be numeric for regression tasks.")

    # log1p yields NaN/-inf for values <= -1; fail early with a clear message
    # instead of letting invalid targets reach the tuning step.
    if args.log_transform and (y <= -1).any():
        raise ValueError(
            f"--log_transform requires every '{target_variable}' value to be "
            f"greater than -1.")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state)

    # Visualize target variable distribution
    if args.visualize:
        fig = plt.figure(figsize=(6, 4))
        sns.histplot(y_train, kde=True)
        plt.title(f'{target_variable} Distribution Before Transformation')
        _save_and_show(fig, os.path.join(args.results_path, 'target_distribution_before.png'))

    # Optional: Apply log transformation (training target only; the test
    # target stays on its original scale for evaluation).
    if args.log_transform:
        y_train_transformed = np.log1p(y_train)
        if args.visualize:
            fig = plt.figure(figsize=(6, 4))
            sns.histplot(y_train_transformed, kde=True, color='green')
            plt.title(f'{target_variable} Distribution After Log Transform')
            _save_and_show(fig, os.path.join(args.results_path, 'target_distribution_after.png'))
    else:
        y_train_transformed = y_train

    # Start the timer
    start_time = timer()

    # Perform hyperparameter tuning. The helper returns the tuned model and
    # its parameters; only the model is used here.
    best_model, _best_params = regression_hyperparameter_tuning(
        X_train, y_train_transformed, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)

    # End the timer and calculate how long it took
    train_time = timer() - start_time

    # Evaluate the best model on the test set
    y_pred_transformed = best_model.predict(X_test)

    # Reverse transformation if applied, so predictions are on the original scale
    if args.log_transform:
        y_pred = np.expm1(y_pred_transformed)
    else:
        y_pred = y_pred_transformed
    # Compare against the untouched test target rather than a
    # log1p -> expm1 round-trip, which would only add rounding error.
    y_test_actual = y_test

    # Save the trained model
    os.makedirs(args.model_path, exist_ok=True)
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Calculate metrics
    rmse = root_mean_squared_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    mae = mean_absolute_error(y_test_actual, y_pred)
    mse = mean_squared_error(y_test_actual, y_pred)
    print(f"\n{model_name} Regression Metrics on Test Set:")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- R² Score: {r2:.4f}")
    print(f"- MAE: {mae:.4f}")
    print(f"- MSE: {mse:.4f}")
    print(f"- Training time: {train_time:.4f} seconds")

    # Save metrics to CSV (one row, one column per metric)
    metrics = {'RMSE': [rmse], 'R2': [r2], 'MAE': [mae], 'MSE': [mse], 'train_time': [train_time]}
    metrics_output_path = os.path.join(args.results_path, 'metrics.csv')
    pd.DataFrame(metrics).to_csv(metrics_output_path, index=False)
    print(f"\nMetrics saved to {metrics_output_path}")

    if args.visualize:
        # Plot Actual vs. Predicted, with the y = x line as the ideal fit
        plot_output_path = os.path.join(args.results_path, 'actual_vs_predicted.png')
        fig = plt.figure(figsize=(8, 6))
        plt.scatter(y_test_actual, y_pred, alpha=0.6, color='blue')
        value_range = [y_test_actual.min(), y_test_actual.max()]
        plt.plot(value_range, value_range, 'r--')
        plt.xlabel(f'Actual {target_variable}')
        plt.ylabel(f'Predicted {target_variable}')
        plt.title(f'Actual vs. Predicted {target_variable}')
        _save_and_show(fig, plot_output_path)
        print(f"Visualization saved to {plot_output_path}")

if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword options) pairs.
    # They are registered in the order listed below.
    argument_specs = (
        # Model module argument
        ('--model_module', {'type': str, 'required': True,
                            'help': 'Name of the regression model module to import.'}),
        # Data arguments
        ('--data_path', {'type': str, 'required': True,
                         'help': 'Path to the dataset file including data name.'}),
        ('--target_variable', {'type': str, 'required': True,
                               'help': 'Name of the target variable.'}),
        ('--drop_columns', {'type': str, 'default': '',
                            'help': 'Columns to drop from the dataset.'}),
        # Model arguments
        ('--test_size', {'type': float, 'default': 0.2,
                         'help': 'Proportion for test split.'}),
        ('--random_state', {'type': int, 'default': 42,
                            'help': 'Random seed.'}),
        ('--log_transform', {'action': 'store_true',
                             'help': 'Apply log transformation to the target variable.'}),
        ('--cv_folds', {'type': int, 'default': 5,
                        'help': 'Number of cross-validation folds.'}),
        ('--scoring_metric', {'type': str, 'default': None,
                              'help': 'Scoring metric for model evaluation.'}),
        # Output arguments
        ('--model_path', {'type': str, 'default': None,
                          'help': 'Path to save the trained model.'}),
        ('--results_path', {'type': str, 'default': None,
                            'help': 'Path to save results and metrics.'}),
        ('--visualize', {'action': 'store_true',
                         'help': 'Generate and save visualizations.'}),
    )

    parser = argparse.ArgumentParser(description="Train a regression model.")
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)

    # Parse the command line and hand the namespace to the training routine.
    args = parser.parse_args()
    main(args)