# Source: Hugging Face Space "mboukabous/train_classificator" (Space status: Sleeping; file size: 10,294 bytes).


"""
This script trains classification models using scikit-learn.
It handles data loading, preprocessing, hyperparameter tuning,
model evaluation with classification metrics, and saving of models,
metrics, and visualizations.

Usage:
    python train_classification_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
                                         --target_variable TARGET_VARIABLE

Optional arguments:
    --test_size TEST_SIZE
    --random_state RANDOM_STATE
    --cv_folds CV_FOLDS
    --scoring_metric SCORING_METRIC
    --model_path MODEL_PATH
    --results_path RESULTS_PATH
    --visualize
    --drop_columns COLUMN_NAMES   (comma-separated list, e.g. Id,Name)

Example:
    python train_classification_model.py --model_module logistic_regression
                                         --data_path data/adult_income/train.csv
                                         --target_variable income_bracket --drop_columns Id
                                         --scoring_metric accuracy --visualize
"""

import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, ConfusionMatrixDisplay)
import joblib
from timeit import default_timer as timer

def _is_numeric_target(y):
    """Return True when the target Series holds numeric (non-boolean) values.

    Uses pandas' dtype helpers instead of ``np.issubdtype`` so that pandas
    extension dtypes (string, categorical, nullable ints) do not raise
    ``TypeError``. Booleans are treated as non-numeric so they are still
    label-encoded.
    """
    return pd.api.types.is_numeric_dtype(y) and not pd.api.types.is_bool_dtype(y)


def _resolve_drop_columns(raw, available):
    """Turn the comma-separated ``--drop_columns`` value into column names.

    An entry that matches a column exactly is used as-is; otherwise its
    whitespace-stripped form is used when that matches (so ``"Id, Name"``
    works). Empty entries are skipped. Unknown names are passed through
    unchanged so ``DataFrame.drop`` still raises ``KeyError`` for them.
    """
    resolved = []
    for entry in raw.split(','):
        stripped = entry.strip()
        if not stripped:
            continue
        if entry not in available and stripped in available:
            resolved.append(stripped)
        else:
            resolved.append(entry)
    return resolved


def _encode_target(y, model_path):
    """Label-encode a non-numeric target and persist the fitted encoder.

    Returns:
        A ``(y, encoder)`` tuple. ``encoder`` is ``None`` and ``y`` is returned
        untouched when the target is already numeric.
    """
    if _is_numeric_target(y):
        return y, None

    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(y)

    # Save label encoder so that predictions can be interpreted later
    os.makedirs(model_path, exist_ok=True)
    joblib.dump(encoder, os.path.join(model_path, 'label_encoder.pkl'))
    print("LabelEncoder applied to target variable. Classes:", encoder.classes_)
    return encoded, encoder


def _plot_scores(metrics, results_path):
    """Save (and show) a bar chart of every metric except ``train_time``."""
    # Pick names and values together so bars can never drift out of alignment.
    score_names = [name for name in metrics if name != 'train_time']
    score_values = [metrics[name][0] for name in score_names]

    fig = plt.figure(figsize=(8, 6))
    plt.bar(score_names, score_values, color='skyblue', alpha=0.8)
    plt.ylim(0, 1)
    plt.xlabel('Metrics')
    plt.ylabel('Scores')
    plt.title('Classification Metrics')
    plt.savefig(os.path.join(results_path, 'classification_metrics.png'))
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown
    print(f"Visualization saved to {os.path.join(results_path, 'classification_metrics.png')}")


def _plot_confusion_matrix(y_true, y_pred, class_ids, class_names, results_path):
    """Save (and show) the confusion matrix for the test-set predictions.

    Args:
        y_true: True labels, in the same encoding the model was trained on.
        y_pred: Predicted labels, same encoding as ``y_true``.
        class_ids: Every class value in that encoding. Passed as ``labels=``
            so the matrix has one row/column per class even when a class is
            missing from the test split.
        class_names: Tick labels, aligned one-to-one with ``class_ids``.
        results_path: Directory that receives ``confusion_matrix.png``.
    """
    conf_mat = confusion_matrix(y_true, y_pred, labels=class_ids)
    fig = plt.figure(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(conf_mat, display_labels=class_names)

    disp.plot(cmap="Blues", values_format="d", ax=plt.gca())
    plt.title("Confusion Matrix", fontsize=16, pad=20)
    plt.xticks(rotation=45, ha="right", fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel("Predicted Label", fontsize=14)
    plt.ylabel("True Label", fontsize=14)

    cm_path = os.path.join(results_path, "confusion_matrix.png")
    plt.savefig(cm_path, bbox_inches="tight")  # avoid clipping rotated tick labels
    plt.show()
    plt.close(fig)

    print(f"Confusion matrix saved to {cm_path}")


def main(args):
    """Train, tune, evaluate and persist a classification model.

    The working directory is switched to the project root (the parent of this
    file's directory), so relative paths in ``args`` resolve against it.

    Args:
        args: Parsed command-line namespace. Reads ``model_module``,
            ``data_path``, ``target_variable``, ``drop_columns``,
            ``test_size``, ``random_state``, ``cv_folds``, ``scoring_metric``,
            ``model_path``, ``results_path`` and ``visualize``.
            ``model_path`` and ``results_path`` are overwritten in place with
            per-model defaults when they are empty.

    Raises:
        KeyError: If the target column, or a column listed in
            ``drop_columns``, is not present in the dataset.
    """
    # Change to the root directory of the project
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning helper and the requested model module
    from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning
    model_module = importlib.import_module(
        f"models.supervised.classification.{args.model_module}")

    # Get the model estimator, parameter grid, and scoring metric
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'accuracy')
    model_name = estimator.__class__.__name__

    # Set default paths if not provided
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset
    df = pd.read_csv(args.data_path)

    # Drop specified columns
    if args.drop_columns:
        df = df.drop(columns=_resolve_drop_columns(args.drop_columns, df.columns))

    # Define target variable and features
    target_variable = args.target_variable
    if target_variable not in df.columns:
        raise KeyError(
            f"Target variable '{target_variable}' not found in dataset columns: "
            f"{list(df.columns)}")
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # Many distinct numeric values usually mean a regression target; warn only.
    if _is_numeric_target(y) and y.nunique(dropna=False) > 20:
        print(f"Warning: The target variable '{target_variable}' seems to have many unique numeric values. Ensure it's truly a classification problem.")

    # Encode target variable if not numeric. The encoder is kept in memory so
    # plotting never depends on a (possibly stale) pickle from an earlier run.
    y, label_encoder = _encode_target(y, args.model_path)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state)

    # Time the hyperparameter search (classification)
    start_time = timer()
    best_model, best_params = classification_hyperparameter_tuning(
        X_train, y_train, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)
    train_time = timer() - start_time

    # Evaluate the best model on the test set
    y_pred = best_model.predict(X_test)

    # Weighted averages keep the metrics meaningful for multi-class targets
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"\n{model_name} Classification Metrics on Test Set:")
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- Precision: {precision:.4f}")
    print(f"- Recall: {recall:.4f}")
    print(f"- F1 Score: {f1:.4f}")
    print(f"- Training Time: {train_time:.4f} seconds")

    # Save the trained model
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    os.makedirs(args.model_path, exist_ok=True)
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Save metrics to CSV
    metrics = {
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1 Score': [f1],
        'train_time': [train_time]
    }
    metrics_csv = os.path.join(args.results_path, 'metrics.csv')
    pd.DataFrame(metrics).to_csv(metrics_csv, index=False)
    print(f"\nMetrics saved to {metrics_csv}")

    if not args.visualize:
        return

    _plot_scores(metrics, args.results_path)

    if label_encoder is not None:
        # Encoded labels are 0..n-1, aligned with the encoder's class names.
        class_ids = np.arange(len(label_encoder.classes_))
        class_names = label_encoder.classes_
    else:
        # Numeric target: label the axes with the actual class values.
        class_ids = np.unique(y)
        class_names = class_ids
    _plot_confusion_matrix(y_test, y_pred, class_ids, class_names, args.results_path)

if __name__ == "__main__":
    # Command-line interface, declared as (flag, add_argument options) pairs.
    # Order matters: it is the order the options appear in ``--help``.
    cli_options = [
        # Model module argument
        ('--model_module', dict(
            type=str, required=True,
            help='Name of the classification model module to import.')),
        # Data arguments
        ('--data_path', dict(
            type=str, required=True,
            help='Path to the dataset file including data name.')),
        ('--target_variable', dict(
            type=str, required=True,
            help='Name of the target variable (categorical).')),
        ('--drop_columns', dict(
            type=str, default='',
            help='Columns to drop from the dataset.')),
        # Model arguments
        ('--test_size', dict(
            type=float, default=0.2,
            help='Proportion for test split.')),
        ('--random_state', dict(
            type=int, default=42,
            help='Random seed.')),
        ('--cv_folds', dict(
            type=int, default=5,
            help='Number of cross-validation folds.')),
        ('--scoring_metric', dict(
            type=str, default=None,
            help='Scoring metric for model evaluation (e.g., accuracy, f1, roc_auc).')),
        # Output arguments
        ('--model_path', dict(
            type=str, default=None,
            help='Path to save the trained model.')),
        ('--results_path', dict(
            type=str, default=None,
            help='Path to save results and metrics.')),
        ('--visualize', dict(
            action='store_true',
            help='Generate and save visualizations (classification metrics chart and confusion matrix).')),
    ]

    parser = argparse.ArgumentParser(description="Train a classification model.")
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    # Parse the command line and hand the namespace to the training routine.
    args = parser.parse_args()
    main(args)