Spaces:
Sleeping
Sleeping
File size: 10,294 Bytes
7c045bd 441e594 7c045bd 441e594 7c045bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
"""
This script trains classification models using scikit-learn.
It handles data loading, preprocessing, hyperparameter tuning,
model evaluation with classification metrics, and saving of models,
metrics, and visualizations.
Usage:
python train_classification_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
--target_variable TARGET_VARIABLE
Optional arguments:
--test_size TEST_SIZE
--random_state RANDOM_STATE
--cv_folds CV_FOLDS
--scoring_metric SCORING_METRIC
--model_path MODEL_PATH
--results_path RESULTS_PATH
--visualize
--drop_columns COLUMN_NAMES (comma-separated, e.g. Id,Name)
Example:
python train_classification_model.py --model_module logistic_regression
--data_path data/adult_income/train.csv
--target_variable income_bracket --drop_columns Id
--scoring_metric accuracy --visualize
"""
import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, ConfusionMatrixDisplay)
import joblib
from timeit import default_timer as timer
def main(args):
    """Train, evaluate, and save a scikit-learn classification model.

    The working directory is switched to the project root (the parent of the
    directory containing this file) before anything is read or written, so
    relative paths in ``args`` are resolved against that root. The requested
    model module is imported from ``models.supervised.classification``, tuned
    with ``classification_hyperparameter_tuning`` on a train split, scored on
    the held-out split, and the model, metrics, and optional plots are
    written to disk.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``model_module``, ``data_path``, ``target_variable``,
            ``drop_columns`` (comma-separated names), ``test_size``,
            ``random_state``, ``cv_folds``, ``scoring_metric``,
            ``model_path``, ``results_path`` and ``visualize``. When
            ``model_path`` or ``results_path`` is empty, a default derived
            from the estimator class name is written back onto ``args``.

    Returns:
        None. Results are printed and saved as ``best_model.pkl``,
        ``label_encoder.pkl`` (only when the target was encoded),
        ``metrics.csv`` and, with ``args.visualize``, two PNG plots.
    """
    # Change to the root directory of the project so project-local packages
    # (utils, models) are importable and relative paths share one base.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning helper and the requested model module.
    from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning
    model_module_path = f"models.supervised.classification.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # The model module must expose `estimator` and `param_grid`;
    # `default_scoring` is optional and only used when no metric was passed.
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'accuracy')
    model_name = estimator.__class__.__name__

    # Set default output paths if not provided.
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset.
    df = pd.read_csv(args.data_path)

    # Drop specified columns.
    if args.drop_columns:
        columns_to_drop = args.drop_columns.split(',')
        df = df.drop(columns=columns_to_drop)

    # Define target variable and features.
    target_variable = args.target_variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # pandas' dtype helpers also understand extension dtypes (nullable ints,
    # string dtype, categoricals), on which np.issubdtype raises TypeError.
    # Booleans are excluded so that they are still label-encoded.
    target_is_numeric = (pd.api.types.is_numeric_dtype(y)
                         and not pd.api.types.is_bool_dtype(y))

    if target_is_numeric and y.nunique(dropna=False) > 20:
        # Large number of unique values might indicate a regression-like problem.
        print(f"Warning: The target variable '{target_variable}' seems to have many unique numeric values. Ensure it's truly a classification problem.")

    # Encode the target variable if it is not numeric. The encoder is kept in
    # a local variable so the plotting step below uses the encoder of THIS run
    # rather than whatever label_encoder.pkl may be left on disk from an
    # earlier run with a different target.
    label_encoder = None
    if not target_is_numeric:
        from sklearn.preprocessing import LabelEncoder
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)
        # Save the label encoder so that predictions can be interpreted later.
        os.makedirs(args.model_path, exist_ok=True)
        joblib.dump(label_encoder, os.path.join(args.model_path, 'label_encoder.pkl'))
        print("LabelEncoder applied to target variable. Classes:", label_encoder.classes_)

    # Split the data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state)

    # Time the hyperparameter search (which also fits the best model).
    start_time = timer()
    best_model, _ = classification_hyperparameter_tuning(
        X_train, y_train, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)
    train_time = timer() - start_time

    # Evaluate the best model on the test set.
    y_pred = best_model.predict(X_test)

    # Weighted averaging keeps the scores defined for multi-class targets;
    # zero_division=0 avoids warnings for classes that were never predicted.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"\n{model_name} Classification Metrics on Test Set:")
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- Precision: {precision:.4f}")
    print(f"- Recall: {recall:.4f}")
    print(f"- F1 Score: {f1:.4f}")
    print(f"- Training Time: {train_time:.4f} seconds")

    # Save the trained model.
    os.makedirs(args.model_path, exist_ok=True)
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Save metrics to CSV (one row, one column per metric).
    metrics = {
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1 Score': [f1],
        'train_time': [train_time]
    }
    metrics_csv_path = os.path.join(args.results_path, 'metrics.csv')
    pd.DataFrame(metrics).to_csv(metrics_csv_path, index=False)
    print(f"\nMetrics saved to {metrics_csv_path}")

    if not args.visualize:
        return

    # Plot the four score metrics. They are selected by name so that bar
    # labels and heights cannot get out of step; train_time is not a 0-1
    # score and is therefore left out of the chart.
    score_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    score_values = [metrics[name][0] for name in score_names]
    metrics_png_path = os.path.join(args.results_path, 'classification_metrics.png')
    plt.figure(figsize=(8, 6))
    plt.bar(score_names, score_values, color='skyblue', alpha=0.8)
    plt.ylim(0, 1)
    plt.xlabel('Metrics')
    plt.ylabel('Scores')
    plt.title('Classification Metrics')
    plt.savefig(metrics_png_path)
    plt.show()
    plt.close()  # release the figure so repeated runs do not accumulate them
    print(f"Visualization saved to {metrics_png_path}")

    # Decode labels for the confusion matrix when the target was encoded.
    if label_encoder is not None:
        y_test_decoded = label_encoder.inverse_transform(y_test)
        y_pred_decoded = label_encoder.inverse_transform(y_pred)
        display_labels = label_encoder.classes_
    else:
        # No encoder was applied: use the original numeric labels.
        y_test_decoded = y_test
        y_pred_decoded = y_pred
        display_labels = None  # Numeric labels will be used by default

    # Passing labels= fixes the matrix to one row/column per known class,
    # so it always matches display_labels even when a class is missing from
    # both the test split and the predictions.
    conf_mat = confusion_matrix(y_test_decoded, y_pred_decoded, labels=display_labels)
    plt.figure(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(conf_mat, display_labels=display_labels)
    disp.plot(cmap="Blues", values_format="d", ax=plt.gca())
    plt.title("Confusion Matrix", fontsize=16, pad=20)
    plt.xticks(rotation=45, ha="right", fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel("Predicted Label", fontsize=14)
    plt.ylabel("True Label", fontsize=14)

    cm_path = os.path.join(args.results_path, "confusion_matrix.png")
    plt.savefig(cm_path, bbox_inches="tight")  # Ensures no clipping of labels
    plt.show()
    plt.close()
    print(f"Confusion matrix saved to {cm_path}")
if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs.
    # They are registered in this order, which is also the order shown by
    # --help: model selection, data, training settings, then outputs.
    cli_options = (
        ('--model_module', dict(
            type=str, required=True,
            help='Name of the classification model module to import.')),
        ('--data_path', dict(
            type=str, required=True,
            help='Path to the dataset file including data name.')),
        ('--target_variable', dict(
            type=str, required=True,
            help='Name of the target variable (categorical).')),
        ('--drop_columns', dict(
            type=str, default='',
            help='Columns to drop from the dataset.')),
        ('--test_size', dict(
            type=float, default=0.2,
            help='Proportion for test split.')),
        ('--random_state', dict(
            type=int, default=42,
            help='Random seed.')),
        ('--cv_folds', dict(
            type=int, default=5,
            help='Number of cross-validation folds.')),
        ('--scoring_metric', dict(
            type=str, default=None,
            help='Scoring metric for model evaluation (e.g., accuracy, f1, roc_auc).')),
        ('--model_path', dict(
            type=str, default=None,
            help='Path to save the trained model.')),
        ('--results_path', dict(
            type=str, default=None,
            help='Path to save results and metrics.')),
        ('--visualize', dict(
            action='store_true',
            help='Generate and save visualizations (classification metrics chart and confusion matrix).')),
    )

    cli = argparse.ArgumentParser(description="Train a classification model.")
    for flag, options in cli_options:
        cli.add_argument(flag, **options)

    # Parse sys.argv and hand the namespace to the training entry point.
    main(cli.parse_args())
|