Spaces:

VashuTheGreat2
/

ML-Learner

Sleeping

File size: 9,771 Bytes

c01955c

import logging
import sys
import io
import base64
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.decomposition import PCA
from src.CodeRunAndModelTrain.models.model_train_models import Train as TrainSchema
from exception import MyException
from utils.main_utils import read_yaml_file_sync
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, LinearSVR, SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor,
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
)
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB


from src.CodeRunAndModelTrain.constants import MODEL_TRAIN_CONFIG
import os
class TrainController:
    def __init__(self):
        self.config = read_yaml_file_sync(MODEL_TRAIN_CONFIG)
        self.model_map = self._create_model_map()

    def _create_model_map(self):
        """
        Dynamically creates a mapping of model names to their respective scikit-learn classes
        based on the 'class' attribute in the YAML configuration.
        """
        model_map = {}
        # Iterate over both classification and regression model definitions in config
        for category in ["classification_models", "regression_models"]:
            if category in self.config:
                for model_key, model_info in self.config[category].items():
                    class_name = model_info.get("class")
                    if class_name:
                        # Look up the class name in the current module's namespace
                        cls = getattr(sys.modules[__name__], class_name, None)
                        if cls:
                            model_map[model_key] = cls
                        else:
                            logging.warning(f"Class {class_name} for model {model_key} not found in imports.")
        return model_map

    async def train(self, schema: TrainSchema):
        logging.info("Entering the train method")
        try:
            model_type = schema.type.lower()
            config_key = "classification_models" if model_type == "classification" else "regression_models"
            
            if config_key not in self.config:
                raise ValueError(f"Invalid model type: {model_type}")
            
            model_name = schema.model_name
            if model_name not in self.config[config_key]:
                raise ValueError(f"Model {model_name} not found in configuration for {model_type}")

            model_class = self.model_map.get(model_name)
            if not model_class:
                raise ValueError(f"Model class for {model_name} not implemented in model_map")

            model = model_class(**schema.model_params)
            
            if model_type == "classification":
                X, y = make_classification(**schema.make_dataset)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                res = await self.eval_classification(model, X_test, y_test, y_pred)
                return res
            
            elif model_type == "regression":
                X, y = make_regression(**schema.make_dataset)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                res = await self.eval_regression(model, X_test, y_test, y_pred)
                return res

        except Exception as e:
            raise MyException(e, sys)

    def _save_plot(self, name: str):
        """Helper to save plt figure to buffer and file, returning base64 string."""
        os.makedirs("fig", exist_ok=True)
        buf = io.BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)
        img_bytes = buf.read()
        img_str = base64.b64encode(img_bytes).decode('utf-8')
        
        with open(f"fig/{name}.png", "wb") as f:
            f.write(img_bytes)
        
        plt.close()
        return img_str

    async def eval_regression(self, model, X_test, y_true, y_pred, title="Regression performance"):
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true, y_pred)
        residuals = y_true - y_pred

        res = {
            "mae": mae,
            "mse": mse,
            "rmse": rmse,
            "r2": r2,
            "plots": []
        }

        # 1. Actual vs Predicted
        plt.figure(figsize=(6, 5))
        plt.scatter(y_true, y_pred, alpha=0.6, edgecolor="k")
        min_val, max_val = min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())
        plt.plot([min_val, max_val], [min_val, max_val], "r--", label="Perfect fit")
        plt.xlabel("True values")
        plt.ylabel("Predicted values")
        plt.title(f"{title} - Actual vs Predicted")
        plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()
        res['plots'].append(self._save_plot("regression_actual_vs_pred"))

        # 2. Residuals Plot
        plt.figure(figsize=(6, 5))
        plt.scatter(y_pred, residuals, alpha=0.6, edgecolor="k", color='orange')
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel("Predicted values")
        plt.ylabel("Residuals")
        plt.title(f"{title} - Residuals Plot")
        plt.grid(alpha=0.3); plt.tight_layout()
        res['plots'].append(self._save_plot("regression_residuals"))

        # 3. Error Distribution
        plt.figure(figsize=(6, 5))
        sns.histplot(residuals, kde=True, color='green')
        plt.xlabel("Residual Value")
        plt.title(f"{title} - Error Distribution")
        plt.grid(alpha=0.3); plt.tight_layout()
        res['plots'].append(self._save_plot("regression_error_dist"))
        
        return res

    async def eval_classification(self, model, X_test, y_true, y_pred, labels=None, title="Classification performance"):
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, average="weighted", zero_division=0)
        rec = recall_score(y_true, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

        res = {
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1": f1,
            "plots": []
        }

        # 1. Confusion Matrix
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        plt.figure(figsize=(6, 5))
        tick_labels = labels if labels is not None else "auto"
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
        plt.xlabel("Predicted"); plt.ylabel("True")
        plt.title(f"{title} - Confusion Matrix")
        plt.tight_layout()
        res['plots'].append(self._save_plot("classification_cm"))

        # 2. Feature Importance (if available)
        importance = None
        if hasattr(model, "feature_importances_"):
            importance = model.feature_importances_
        elif hasattr(model, "coef_"):
            importance = np.abs(model.coef_[0]) if model.coef_.ndim > 1 else np.abs(model.coef_)

        if importance is not None:
            plt.figure(figsize=(6, 5))
            indices = np.argsort(importance)
            plt.barh(range(len(importance)), importance[indices], align='center')
            plt.yticks(range(len(importance)), [f"Feature {i}" for i in indices])
            plt.xlabel("Relative Importance")
            plt.title(f"{title} - Feature Importance")
            plt.tight_layout()
            res['plots'].append(self._save_plot("classification_importance"))

        # 3. Decision regions (PCA)
        res['plots'].append(await self.plot_decision_regions_pca(model, X_test, y_true))
        
        return res

    async def plot_decision_regions_pca(self, model, X, y, title="Decision regions (PCA)", cmap="viridis"):
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)

        x_min, x_max = X_pca[:, 0].min() - 1.0, X_pca[:, 0].max() + 1.0
        y_min, y_max = X_pca[:, 1].min() - 1.0, X_pca[:, 1].max() + 1.0
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))

        grid_pca = np.c_[xx.ravel(), yy.ravel()]
        grid_original = pca.inverse_transform(grid_pca)
        Z = model.predict(grid_original)
        Z = Z.reshape(xx.shape).astype(float) if Z.dtype.kind in 'U S' else Z.reshape(xx.shape)

        plt.figure(figsize=(7, 6))
        plt.contourf(xx, yy, Z, alpha=0.3, cmap=cmap)
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap=cmap, edgecolor="k", s=40)
        plt.xlabel("PC1"); plt.ylabel("PC2")
        plt.title(title); plt.grid(alpha=0.2); plt.tight_layout()
        
        return self._save_plot("decision_regions")