# File: train_regression/scripts/train_regression_model.py
# Uploaded by: mboukabous
# Commit: 829e3ac ("Add application file")
"""
This script trains regression models using scikit-learn.
It includes data loading, preprocessing, optional log transformation,
hyperparameter tuning, model evaluation, and saving of models, metrics,
and visualizations.
Usage:
python train_regression_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
--target_variable TARGET_VARIABLE
Optional arguments:
--test_size TEST_SIZE
--random_state RANDOM_STATE
--log_transform
--cv_folds CV_FOLDS
--scoring_metric SCORING_METRIC
--model_path MODEL_PATH
--results_path RESULTS_PATH
--visualize
--drop_columns COLUMN_NAMES
Example:
python train_regression_model.py --model_module linear_regression
--data_path data/house_prices/train.csv
--target_variable SalePrice --drop_columns Id
--log_transform --visualize
"""
import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score, mean_absolute_error
import joblib
from timeit import default_timer as timer
def _parse_drop_columns(raw_columns, available_columns):
    """Split the comma-separated ``--drop_columns`` value into column names.

    An entry that matches an existing column exactly is kept verbatim, so
    column names that really contain surrounding whitespace still work.
    Otherwise the entry is stripped, which lets users write ``"Id, Name"``;
    entries that are empty after stripping (e.g. from a trailing comma) are
    ignored.
    """
    selected = []
    for raw_name in raw_columns.split(','):
        if raw_name in available_columns:
            selected.append(raw_name)
            continue
        trimmed = raw_name.strip()
        if trimmed:
            selected.append(trimmed)
    return selected


def _save_show_close(fig, output_file):
    """Save *fig* to *output_file*, display it, then release its memory."""
    fig.savefig(output_file)
    plt.show()
    # Figures stay registered with pyplot until closed explicitly.
    plt.close(fig)


def main(args):
    """Train, evaluate and persist a regression model described by *args*.

    The function changes the working directory to the project root (the
    parent of this script's directory), imports the requested model module,
    tunes it on a train split, evaluates the tuned model on the test split
    and writes the model, a metrics CSV and (optionally) plots to disk.

    Args:
        args: Parsed command-line namespace. Attributes read:
            ``model_module``, ``data_path``, ``target_variable``,
            ``drop_columns``, ``test_size``, ``random_state``,
            ``log_transform``, ``cv_folds``, ``scoring_metric``,
            ``model_path``, ``results_path`` and ``visualize``.
            ``model_path`` and ``results_path`` are overwritten with
            defaults derived from the estimator class name when falsy.

    Raises:
        ValueError: If the target column is not numeric, or if
            ``log_transform`` is requested while the target contains
            values <= -1 (where ``log1p`` is undefined or infinite).
    """
    # Change to the root directory of the project so relative paths and the
    # project-local packages below resolve from there.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning and the model modules (only importable
    # once the project root is on sys.path).
    from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
    model_module_path = f"models.supervised.regression.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Get the model estimator, parameters grid, and the scoring metric
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(
        model_module, 'default_scoring', 'neg_root_mean_squared_error')
    model_name = estimator.__class__.__name__

    # Set default paths if not provided
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset
    df = pd.read_csv(args.data_path)

    # Drop specified columns
    if args.drop_columns:
        columns_to_drop = _parse_drop_columns(args.drop_columns, df.columns)
        if columns_to_drop:
            df = df.drop(columns=columns_to_drop)

    # Define target variable and features
    target_variable = args.target_variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # Ensure target variable is numeric
    if not np.issubdtype(y.dtype, np.number):
        raise ValueError(f"The target variable '{target_variable}' must be numeric for regression tasks.")

    # Fail early with a clear message instead of feeding NaN/-inf targets
    # into the tuner: log1p is only finite for values greater than -1.
    if args.log_transform and (y <= -1).any():
        raise ValueError(
            f"The target variable '{target_variable}' contains values <= -1; "
            "--log_transform (log1p) cannot be applied.")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state)

    # Visualize target variable distribution
    if args.visualize:
        fig = plt.figure(figsize=(6, 4))
        sns.histplot(y_train, kde=True)
        plt.title(f'{target_variable} Distribution Before Transformation')
        _save_show_close(fig, os.path.join(args.results_path, 'target_distribution_before.png'))

    # Optional: Apply log transformation to the training target
    if args.log_transform:
        y_train_transformed = np.log1p(y_train)
        if args.visualize:
            fig = plt.figure(figsize=(6, 4))
            sns.histplot(y_train_transformed, kde=True, color='green')
            plt.title(f'{target_variable} Distribution After Log Transform')
            _save_show_close(fig, os.path.join(args.results_path, 'target_distribution_after.png'))
    else:
        y_train_transformed = y_train

    # Time the hyperparameter search
    start_time = timer()
    best_model, _ = regression_hyperparameter_tuning(
        X_train, y_train_transformed, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)
    train_time = timer() - start_time

    # Evaluate the best model on the test set, mapping predictions back to
    # the original scale when the model was trained on log1p targets.
    y_pred_transformed = best_model.predict(X_test)
    y_pred = np.expm1(y_pred_transformed) if args.log_transform else y_pred_transformed
    # Compare against the untouched test targets; re-deriving them through
    # expm1(log1p(.)) would only add floating-point round-off.
    y_test_actual = y_test

    # Save the trained model
    os.makedirs(args.model_path, exist_ok=True)
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Calculate metrics
    rmse = root_mean_squared_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    mae = mean_absolute_error(y_test_actual, y_pred)
    mse = mean_squared_error(y_test_actual, y_pred)
    print(f"\n{model_name} Regression Metrics on Test Set:")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- R² Score: {r2:.4f}")
    print(f"- MAE: {mae:.4f}")
    print(f"- MSE: {mse:.4f}")
    print(f"- Training time: {train_time:.4f} seconds")

    # Save metrics to CSV (one row, one column per metric)
    metrics = {'RMSE': [rmse], 'R2': [r2], 'MAE': [mae], 'MSE': [mse], 'train_time': [train_time]}
    metrics_file = os.path.join(args.results_path, 'metrics.csv')
    pd.DataFrame(metrics).to_csv(metrics_file, index=False)
    print(f"\nMetrics saved to {metrics_file}")

    if args.visualize:
        # Plot Actual vs. Predicted with the y = x reference line
        plot_file = os.path.join(args.results_path, 'actual_vs_predicted.png')
        fig = plt.figure(figsize=(8, 6))
        plt.scatter(y_test_actual, y_pred, alpha=0.6, color='blue')
        value_range = [y_test_actual.min(), y_test_actual.max()]
        plt.plot(value_range, value_range, 'r--')
        plt.xlabel(f'Actual {target_variable}')
        plt.ylabel(f'Predicted {target_variable}')
        plt.title(f'Actual vs. Predicted {target_variable}')
        _save_show_close(fig, plot_file)
        print(f"Visualization saved to {plot_file}")
if __name__ == "__main__":
    # Command-line interface, declared as (flag, add_argument options) pairs.
    # The list order is the order in which the options are registered.
    argument_specs = [
        # Model module argument
        ('--model_module', dict(type=str, required=True,
                                help='Name of the regression model module to import.')),
        # Data arguments
        ('--data_path', dict(type=str, required=True,
                             help='Path to the dataset file including data name.')),
        ('--target_variable', dict(type=str, required=True,
                                   help='Name of the target variable.')),
        ('--drop_columns', dict(type=str, default='',
                                help='Columns to drop from the dataset.')),
        # Model arguments
        ('--test_size', dict(type=float, default=0.2,
                             help='Proportion for test split.')),
        ('--random_state', dict(type=int, default=42,
                                help='Random seed.')),
        ('--log_transform', dict(action='store_true',
                                 help='Apply log transformation to the target variable.')),
        ('--cv_folds', dict(type=int, default=5,
                            help='Number of cross-validation folds.')),
        ('--scoring_metric', dict(type=str, default=None,
                                  help='Scoring metric for model evaluation.')),
        # Output arguments
        ('--model_path', dict(type=str, default=None,
                              help='Path to save the trained model.')),
        ('--results_path', dict(type=str, default=None,
                                help='Path to save results and metrics.')),
        ('--visualize', dict(action='store_true',
                             help='Generate and save visualizations.')),
    ]

    cli = argparse.ArgumentParser(description="Train a regression model.")
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)

    # Parse sys.argv and hand the resulting namespace to the training routine.
    main(cli.parse_args())