import logging import sys import io import base64 import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.datasets import make_classification, make_regression from sklearn.metrics import ( mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report ) from sklearn.decomposition import PCA from src.CodeRunAndModelTrain.models.model_train_models import Train as TrainSchema from exception import MyException from utils.main_utils import read_yaml_file_sync from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier from sklearn.svm import SVR, LinearSVR, SVC, LinearSVC, NuSVC from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier from sklearn.ensemble import ( RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier ) from sklearn.neural_network import MLPRegressor, MLPClassifier from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifier from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB from src.CodeRunAndModelTrain.constants import MODEL_TRAIN_CONFIG import os class TrainController: def __init__(self): self.config = read_yaml_file_sync(MODEL_TRAIN_CONFIG) self.model_map = self._create_model_map() def _create_model_map(self): """ Dynamically creates a mapping of model names to their respective scikit-learn classes based on the 'class' attribute in the YAML configuration. """ model_map = {} # Iterate over both classification and regression model definitions in config for category in ["classification_models", "regression_models"]: if category in self.config: for model_key, model_info in self.config[category].items(): class_name = model_info.get("class") if class_name: # Look up the class name in the current module's namespace cls = getattr(sys.modules[__name__], class_name, None) if cls: model_map[model_key] = cls else: logging.warning(f"Class {class_name} for model {model_key} not found in imports.") return model_map async def train(self, schema: TrainSchema): logging.info("Entering the train method") try: model_type = schema.type.lower() config_key = "classification_models" if model_type == "classification" else "regression_models" if config_key not in self.config: raise ValueError(f"Invalid model type: {model_type}") model_name = schema.model_name if model_name not in self.config[config_key]: raise ValueError(f"Model {model_name} not found in configuration for {model_type}") model_class = self.model_map.get(model_name) if not model_class: raise ValueError(f"Model class for {model_name} not implemented in model_map") model = model_class(**schema.model_params) if model_type == "classification": X, y = make_classification(**schema.make_dataset) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model.fit(X_train, y_train) y_pred = model.predict(X_test) res = await self.eval_classification(model, X_test, y_test, y_pred) return res elif model_type == "regression": X, y = make_regression(**schema.make_dataset) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model.fit(X_train, y_train) y_pred = model.predict(X_test) res = await self.eval_regression(model, X_test, y_test, y_pred) return res except Exception as e: raise MyException(e, sys) def _save_plot(self, name: str): """Helper to save plt figure to buffer and file, returning base64 string.""" os.makedirs("fig", exist_ok=True) buf = io.BytesIO() plt.savefig(buf, format='png') buf.seek(0) img_bytes = buf.read() img_str = base64.b64encode(img_bytes).decode('utf-8') with open(f"fig/{name}.png", "wb") as f: f.write(img_bytes) plt.close() return img_str async def eval_regression(self, model, X_test, y_true, y_pred, title="Regression performance"): mae = mean_absolute_error(y_true, y_pred) mse = mean_squared_error(y_true, y_pred) rmse = np.sqrt(mse) r2 = r2_score(y_true, y_pred) residuals = y_true - y_pred res = { "mae": mae, "mse": mse, "rmse": rmse, "r2": r2, "plots": [] } # 1. Actual vs Predicted plt.figure(figsize=(6, 5)) plt.scatter(y_true, y_pred, alpha=0.6, edgecolor="k") min_val, max_val = min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max()) plt.plot([min_val, max_val], [min_val, max_val], "r--", label="Perfect fit") plt.xlabel("True values") plt.ylabel("Predicted values") plt.title(f"{title} - Actual vs Predicted") plt.legend(); plt.grid(alpha=0.3); plt.tight_layout() res['plots'].append(self._save_plot("regression_actual_vs_pred")) # 2. Residuals Plot plt.figure(figsize=(6, 5)) plt.scatter(y_pred, residuals, alpha=0.6, edgecolor="k", color='orange') plt.axhline(y=0, color='r', linestyle='--') plt.xlabel("Predicted values") plt.ylabel("Residuals") plt.title(f"{title} - Residuals Plot") plt.grid(alpha=0.3); plt.tight_layout() res['plots'].append(self._save_plot("regression_residuals")) # 3. Error Distribution plt.figure(figsize=(6, 5)) sns.histplot(residuals, kde=True, color='green') plt.xlabel("Residual Value") plt.title(f"{title} - Error Distribution") plt.grid(alpha=0.3); plt.tight_layout() res['plots'].append(self._save_plot("regression_error_dist")) return res async def eval_classification(self, model, X_test, y_true, y_pred, labels=None, title="Classification performance"): acc = accuracy_score(y_true, y_pred) prec = precision_score(y_true, y_pred, average="weighted", zero_division=0) rec = recall_score(y_true, y_pred, average="weighted", zero_division=0) f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0) res = { "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "plots": [] } # 1. Confusion Matrix cm = confusion_matrix(y_true, y_pred, labels=labels) plt.figure(figsize=(6, 5)) tick_labels = labels if labels is not None else "auto" sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels) plt.xlabel("Predicted"); plt.ylabel("True") plt.title(f"{title} - Confusion Matrix") plt.tight_layout() res['plots'].append(self._save_plot("classification_cm")) # 2. Feature Importance (if available) importance = None if hasattr(model, "feature_importances_"): importance = model.feature_importances_ elif hasattr(model, "coef_"): importance = np.abs(model.coef_[0]) if model.coef_.ndim > 1 else np.abs(model.coef_) if importance is not None: plt.figure(figsize=(6, 5)) indices = np.argsort(importance) plt.barh(range(len(importance)), importance[indices], align='center') plt.yticks(range(len(importance)), [f"Feature {i}" for i in indices]) plt.xlabel("Relative Importance") plt.title(f"{title} - Feature Importance") plt.tight_layout() res['plots'].append(self._save_plot("classification_importance")) # 3. Decision regions (PCA) res['plots'].append(await self.plot_decision_regions_pca(model, X_test, y_true)) return res async def plot_decision_regions_pca(self, model, X, y, title="Decision regions (PCA)", cmap="viridis"): pca = PCA(n_components=2) X_pca = pca.fit_transform(X) x_min, x_max = X_pca[:, 0].min() - 1.0, X_pca[:, 0].max() + 1.0 y_min, y_max = X_pca[:, 1].min() - 1.0, X_pca[:, 1].max() + 1.0 xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300)) grid_pca = np.c_[xx.ravel(), yy.ravel()] grid_original = pca.inverse_transform(grid_pca) Z = model.predict(grid_original) Z = Z.reshape(xx.shape).astype(float) if Z.dtype.kind in 'U S' else Z.reshape(xx.shape) plt.figure(figsize=(7, 6)) plt.contourf(xx, yy, Z, alpha=0.3, cmap=cmap) plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap=cmap, edgecolor="k", s=40) plt.xlabel("PC1"); plt.ylabel("PC2") plt.title(title); plt.grid(alpha=0.2); plt.tight_layout() return self._save_plot("decision_regions")