Spaces:
Sleeping
Sleeping
| from sklearn.linear_model import LinearRegression, LogisticRegression | |
| from sklearn.naive_bayes import GaussianNB | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor | |
| from sklearn.svm import SVC, SVR | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.model_selection import train_test_split | |
| from xgboost import XGBClassifier, XGBRegressor | |
| from catboost import CatBoostClassifier | |
| from utils.metrics import classification_metrics, regression_metrics | |
| from utils.data_cleaner import prepare_data | |
| import pandas as pd | |
| import logging | |
# Module-wide logging setup: INFO threshold, "timestamp - level - message" layout.
# basicConfig only takes effect if the root logger has no handlers yet.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def _make_decision_tree_regressor():
    """Build the regression counterpart of the Decision Tree classifier."""
    # Imported locally because the file's top-level imports only bring in
    # DecisionTreeClassifier from sklearn.tree.
    from sklearn.tree import DecisionTreeRegressor

    return DecisionTreeRegressor(random_state=42)


def _model_factories():
    """Return a mapping of model name -> (classifier factory, regressor factory).

    A ``None`` entry means the model does not support that task type.
    Every factory is a zero-argument callable, so no estimator is constructed
    until one has actually been selected.
    """
    return {
        # max_iter raised from the default to help convergence.
        "Logistic Regression": (lambda: LogisticRegression(max_iter=1000), None),
        "Naive Bayes": (lambda: GaussianNB(), None),
        "Decision Tree": (
            lambda: DecisionTreeClassifier(random_state=42),
            _make_decision_tree_regressor,
        ),
        "Random Forest": (
            lambda: RandomForestClassifier(random_state=42),
            lambda: RandomForestRegressor(random_state=42),
        ),
        "SVM": (
            # probability=True so predict_proba is available (needed for ROC curves).
            lambda: SVC(probability=True, random_state=42),
            lambda: SVR(),
        ),
        "KNN": (lambda: KNeighborsClassifier(), None),
        "XGBoost": (
            lambda: XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42),
            lambda: XGBRegressor(random_state=42),
        ),
        # verbose=0 suppresses CatBoost's per-iteration training output.
        "CatBoost": (lambda: CatBoostClassifier(verbose=0, random_state=42), None),
        "Linear Regression": (None, lambda: LinearRegression()),
    }


def train_model(df, target_column, model_name):
    """Train the supervised model named by ``model_name`` and evaluate it.

    The data is passed through ``prepare_data``, which yields the features,
    the target and a flag telling whether the task is classification. The
    prepared data is split 80/20 (``random_state=42``), the model is fitted on
    the training part and evaluated on the held-out part.

    Args:
        df (pd.DataFrame): The input DataFrame containing features and target.
        target_column (str): The name of the target column.
        model_name (str): One of "Logistic Regression", "Naive Bayes",
            "Decision Tree", "Random Forest", "SVM", "KNN", "XGBoost",
            "CatBoost" or "Linear Regression".

    Returns:
        tuple: Always a 6-tuple.

        On success, ``(model, metrics, y_test, y_pred, y_pred_proba, X_test)``:
            - model: The trained model object.
            - metrics: Result of ``classification_metrics`` or
              ``regression_metrics`` on the test set.
            - y_test: Actual target values from the test set.
            - y_pred: Predicted target values for the test set.
            - y_pred_proba: Output of ``predict_proba`` for classification
              models that provide it, otherwise ``None``.
            - X_test: Feature values from the test set.

        On failure, ``(None, error_message, None, None, None, None)``: the
        model is ``None`` and the second element is a ``str`` describing the
        problem (unknown model, model/task mismatch, or an exception raised
        while preparing data or training). No exception propagates to the
        caller.
    """
    try:
        # Fail fast on an unknown model name before doing any data work.
        factories = _model_factories().get(model_name)
        if factories is None:
            return None, "Model not found.", None, None, None, None

        # The third value (label encoders) is not needed here.
        X, y, _, is_classification = prepare_data(df, target_column)

        classifier_factory, regressor_factory = factories
        build_model = classifier_factory if is_classification else regressor_factory
        if build_model is None:
            # The model exists but only for the other task type.
            supported_task = "regression" if is_classification else "classification"
            return None, f"{model_name} is for {supported_task} tasks.", None, None, None, None

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train the model
        model = build_model()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Prediction probabilities for classification models (needed for ROC curve)
        y_pred_proba = None
        if is_classification and hasattr(model, 'predict_proba'):
            y_pred_proba = model.predict_proba(X_test)

        # Calculate evaluation metrics
        if is_classification:
            metrics = classification_metrics(y_test, y_pred)
        else:
            metrics = regression_metrics(y_test, y_pred)

        logging.info("Successfully trained %s model.", model_name)
        return model, metrics, y_test, y_pred, y_pred_proba, X_test
    except Exception as e:
        # Top-level boundary for callers: log with traceback and report the
        # failure through the error slot instead of raising.
        logging.exception("An error occurred during model training for %s: %s", model_name, e)
        return None, f"An error occurred during model training: {e}", None, None, None, None