Spaces:
Sleeping
Sleeping
File size: 8,325 Bytes
829e3ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
"""
This script trains regression models using scikit-learn.
It includes data loading, preprocessing, optional log transformation,
hyperparameter tuning, model evaluation, and saving of models, metrics,
and visualizations.
Usage:
python train_regression_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
--target_variable TARGET_VARIABLE
Optional arguments:
--test_size TEST_SIZE
--random_state RANDOM_STATE
--log_transform
--cv_folds CV_FOLDS
--scoring_metric SCORING_METRIC
--model_path MODEL_PATH
--results_path RESULTS_PATH
--visualize
--drop_columns COLUMN_NAMES (comma-separated, e.g. Id,Name)
Example:
python train_regression_model.py --model_module linear_regression
--data_path data/house_prices/train.csv
--target_variable SalePrice --drop_columns Id
--log_transform --visualize
"""
import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score, mean_absolute_error
import joblib
from timeit import default_timer as timer
def _save_and_show_figure(fig, output_file):
    """Save ``fig`` to ``output_file``, display it, then release it."""
    fig.savefig(output_file)
    plt.show()
    # Close explicitly so figures do not pile up in pyplot's global registry.
    plt.close(fig)


def main(args):
    """Train, tune, evaluate and persist a regression model.

    The working directory is changed to the parent of this script's directory,
    so relative paths in ``args`` are resolved against that project root.

    Args:
        args: Parsed command-line namespace. Attributes read:
            ``model_module``, ``data_path``, ``target_variable``,
            ``drop_columns`` (comma-separated string), ``test_size``,
            ``random_state``, ``log_transform``, ``cv_folds``,
            ``scoring_metric``, ``model_path``, ``results_path`` and
            ``visualize``. ``model_path`` and ``results_path`` are filled in
            on the namespace with defaults derived from the estimator class
            name when they are falsy.

    Raises:
        KeyError: If the target variable is not a column of the dataset
            (after the requested columns were dropped).
        ValueError: If the target variable is not numeric, or if
            ``log_transform`` is requested while the target contains values
            less than or equal to -1 (where ``log1p`` is undefined).
    """
    # Change to the root directory of the project
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning and the model modules.
    # These imports must happen after project_root is on sys.path.
    from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
    model_module_path = f"models.supervised.regression.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Get the model estimator, parameters grid, and the scoring metric
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(
        model_module, 'default_scoring', 'neg_root_mean_squared_error')
    model_name = estimator.__class__.__name__

    # Set default paths if not provided
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset
    df = pd.read_csv(args.data_path)

    # Drop specified columns. Names are stripped and empty segments ignored so
    # that inputs such as "Id, Name" or "Id," behave as the user intends.
    if args.drop_columns:
        columns_to_drop = [
            name.strip() for name in args.drop_columns.split(',') if name.strip()
        ]
        df = df.drop(columns=columns_to_drop)

    # Define target variable and features
    target_variable = args.target_variable
    if target_variable not in df.columns:
        raise KeyError(
            f"Target variable '{target_variable}' was not found in the dataset columns.")
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # Ensure target variable is numeric
    if not np.issubdtype(y.dtype, np.number):
        raise ValueError(f"The target variable '{target_variable}' must be numeric for regression tasks.")

    # log1p yields -inf at -1 and NaN below it; fail early with a clear message
    # instead of letting invalid targets reach model fitting.
    if args.log_transform and (y <= -1).any():
        raise ValueError(
            f"Cannot apply log transformation: target variable '{target_variable}' "
            "contains values less than or equal to -1.")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state)

    # Visualize target variable distribution
    if args.visualize:
        fig = plt.figure(figsize=(6, 4))
        sns.histplot(y_train, kde=True)
        plt.title(f'{target_variable} Distribution Before Transformation')
        _save_and_show_figure(
            fig, os.path.join(args.results_path, 'target_distribution_before.png'))

    # Optional: Apply log transformation
    if args.log_transform:
        y_train_transformed = np.log1p(y_train)
        if args.visualize:
            fig = plt.figure(figsize=(6, 4))
            sns.histplot(y_train_transformed, kde=True, color='green')
            plt.title(f'{target_variable} Distribution After Log Transform')
            _save_and_show_figure(
                fig, os.path.join(args.results_path, 'target_distribution_after.png'))
    else:
        y_train_transformed = y_train

    # Time the hyperparameter search (which also fits the best model)
    start_time = timer()
    best_model, _best_params = regression_hyperparameter_tuning(
        X_train, y_train_transformed, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)
    train_time = timer() - start_time

    # Evaluate the best model on the test set, mapping predictions back to the
    # original target scale when the model was trained on log1p(target).
    y_pred = best_model.predict(X_test)
    if args.log_transform:
        y_pred = np.expm1(y_pred)
    # The untransformed test targets are the ground truth; round-tripping them
    # through log1p/expm1 would only add floating-point error.
    y_test_actual = y_test

    # Save the trained model
    os.makedirs(args.model_path, exist_ok=True)
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Calculate metrics
    rmse = root_mean_squared_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    mae = mean_absolute_error(y_test_actual, y_pred)
    mse = mean_squared_error(y_test_actual, y_pred)
    print(f"\n{model_name} Regression Metrics on Test Set:")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- R² Score: {r2:.4f}")
    print(f"- MAE: {mae:.4f}")
    print(f"- MSE: {mse:.4f}")
    print(f"- Training time: {train_time:.4f} seconds")

    # Save metrics to CSV
    metrics = {'RMSE': [rmse], 'R2': [r2], 'MAE': [mae], 'MSE': [mse], 'train_time': [train_time]}
    metrics_file = os.path.join(args.results_path, 'metrics.csv')
    pd.DataFrame(metrics).to_csv(metrics_file, index=False)
    print(f"\nMetrics saved to {metrics_file}")

    if args.visualize:
        # Plot Actual vs. Predicted, with a dashed y = x reference line
        plot_file = os.path.join(args.results_path, 'actual_vs_predicted.png')
        lower, upper = y_test_actual.min(), y_test_actual.max()
        fig = plt.figure(figsize=(8, 6))
        plt.scatter(y_test_actual, y_pred, alpha=0.6, color='blue')
        plt.plot([lower, upper], [lower, upper], 'r--')
        plt.xlabel(f'Actual {target_variable}')
        plt.ylabel(f'Predicted {target_variable}')
        plt.title(f'Actual vs. Predicted {target_variable}')
        _save_and_show_figure(fig, plot_file)
        print(f"Visualization saved to {plot_file}")
if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs,
    # listed in the order they are registered with the parser.
    cli_options = [
        # Model module argument
        ('--model_module', dict(
            type=str, required=True,
            help='Name of the regression model module to import.')),
        # Data arguments
        ('--data_path', dict(
            type=str, required=True,
            help='Path to the dataset file including data name.')),
        ('--target_variable', dict(
            type=str, required=True,
            help='Name of the target variable.')),
        ('--drop_columns', dict(
            type=str, default='',
            help='Columns to drop from the dataset.')),
        # Model arguments
        ('--test_size', dict(
            type=float, default=0.2,
            help='Proportion for test split.')),
        ('--random_state', dict(
            type=int, default=42,
            help='Random seed.')),
        ('--log_transform', dict(
            action='store_true',
            help='Apply log transformation to the target variable.')),
        ('--cv_folds', dict(
            type=int, default=5,
            help='Number of cross-validation folds.')),
        ('--scoring_metric', dict(
            type=str, default=None,
            help='Scoring metric for model evaluation.')),
        # Output arguments
        ('--model_path', dict(
            type=str, default=None,
            help='Path to save the trained model.')),
        ('--results_path', dict(
            type=str, default=None,
            help='Path to save results and metrics.')),
        ('--visualize', dict(
            action='store_true',
            help='Generate and save visualizations.')),
    ]

    parser = argparse.ArgumentParser(description="Train a regression model.")
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    # Parse the command line and hand the namespace to the training entry point.
    args = parser.parse_args()
    main(args)
|