# Page-extraction header (not part of the program): Spaces: Sleeping;
# File size: 9,771 Bytes; revision c01955c. The line-number gutter (1-224) was dropped.
import logging
import sys
import io
import base64
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import (
mean_absolute_error, mean_squared_error, r2_score,
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report
)
from sklearn.decomposition import PCA
from src.CodeRunAndModelTrain.models.model_train_models import Train as TrainSchema
from exception import MyException
from utils.main_utils import read_yaml_file_sync
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, LinearSVR, SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (
RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor,
RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
)
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from src.CodeRunAndModelTrain.constants import MODEL_TRAIN_CONFIG
import os
class TrainController:
    """Train a configured scikit-learn model on a synthetic dataset and report on it.

    The YAML file at ``MODEL_TRAIN_CONFIG`` is expected to contain the sections
    ``classification_models`` and/or ``regression_models``; each maps a model
    key to an entry whose ``class`` value names an estimator class imported
    into this module. ``train`` builds the requested estimator, fits it on data
    produced by ``make_classification`` / ``make_regression`` and returns
    metrics together with base64-encoded PNG plots.
    """

    # Request model type -> name of the config section that lists its models.
    # Insertion order matters: it is the order in which ``_create_model_map``
    # walks the sections.
    _CONFIG_SECTION_BY_TYPE = {
        "classification": "classification_models",
        "regression": "regression_models",
    }

    def __init__(self):
        """Read the training configuration and build ``model_map`` from it."""
        self.config = read_yaml_file_sync(MODEL_TRAIN_CONFIG)
        self.model_map = self._create_model_map()

    @staticmethod
    def _resolve_class(model_info):
        """Return the class named by ``model_info["class"]``, or ``None``.

        The name is looked up among this module's attributes. Anything that is
        not a class (an imported module, a function, ...) is rejected so that a
        configuration typo cannot turn an arbitrary object into a "model".
        """
        if not isinstance(model_info, dict):
            return None
        class_name = model_info.get("class")
        if not class_name:
            return None
        candidate = getattr(sys.modules[__name__], str(class_name), None)
        return candidate if isinstance(candidate, type) else None

    def _create_model_map(self):
        """
        Dynamically creates a mapping of model names to their respective classes
        based on the 'class' attribute in the YAML configuration.

        The mapping is flat (one namespace for both sections), so a key that
        appears in both sections keeps the class from the section visited last;
        a warning is logged when that happens. ``train`` does not depend on this
        map for that reason and resolves classes per section instead.

        Returns:
            dict: model key -> class, for every entry whose class was resolved.
        """
        model_map = {}
        for category in self._CONFIG_SECTION_BY_TYPE.values():
            if category not in self.config:
                continue
            # An empty YAML section loads as None; treat it as "no models".
            section = self.config[category] or {}
            for model_key, model_info in section.items():
                class_name = model_info.get("class") if isinstance(model_info, dict) else None
                if not class_name:
                    continue
                cls = self._resolve_class(model_info)
                if cls is None:
                    logging.warning(
                        "Class %s for model %s not found in imports.", class_name, model_key
                    )
                    continue
                if model_key in model_map and model_map[model_key] is not cls:
                    logging.warning(
                        "Model key %s is defined in more than one section; "
                        "model_map keeps %s from %s.",
                        model_key, class_name, category,
                    )
                model_map[model_key] = cls
        return model_map

    async def train(self, schema: "TrainSchema"):
        """Build, fit and evaluate the model described by ``schema``.

        Args:
            schema: Request object. This method reads ``schema.type``
                ("classification" or "regression", case-insensitive),
                ``schema.model_name`` (a key of the matching config section),
                ``schema.model_params`` (keyword arguments for the estimator)
                and ``schema.make_dataset`` (keyword arguments for
                ``make_classification`` / ``make_regression``).

        Returns:
            dict: The result of ``eval_classification`` or ``eval_regression``.

        Raises:
            MyException: Wraps every error raised while validating the request,
                generating data, fitting or evaluating.
        """
        logging.info("Entering the train method")
        try:
            model_type = schema.type.lower()
            # Unknown types are rejected outright instead of being treated as
            # regression and then silently returning None.
            config_key = self._CONFIG_SECTION_BY_TYPE.get(model_type)
            if config_key is None or config_key not in self.config:
                raise ValueError(f"Invalid model type: {model_type}")

            model_name = schema.model_name
            section = self.config[config_key] or {}
            if model_name not in section:
                raise ValueError(f"Model {model_name} not found in configuration for {model_type}")

            # Resolve from the requested section's own entry so that a key
            # shared by both sections cannot hand back the wrong estimator kind.
            model_class = self._resolve_class(section[model_name])
            if model_class is None:
                raise ValueError(f"Model class for {model_name} not implemented in model_map")
            model = model_class(**(schema.model_params or {}))

            if model_type == "classification":
                make_dataset, evaluate = make_classification, self.eval_classification
            else:
                make_dataset, evaluate = make_regression, self.eval_regression

            X, y = make_dataset(**(schema.make_dataset or {}))
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            return await evaluate(model, X_test, y_test, y_pred)
        except Exception as e:
            raise MyException(e, sys) from e

    def _save_plot(self, name: str):
        """Save the current pyplot figure as ``fig/<name>.png`` and return it as base64.

        The figure is closed even when rendering or writing fails, so a failed
        request does not leave an open figure behind.

        Args:
            name: File stem used for the PNG inside the ``fig`` directory.

        Returns:
            str: The PNG bytes, base64-encoded and decoded as UTF-8 text.
        """
        try:
            os.makedirs("fig", exist_ok=True)
            buf = io.BytesIO()
            plt.savefig(buf, format='png')
            img_bytes = buf.getvalue()
            with open(f"fig/{name}.png", "wb") as f:
                f.write(img_bytes)
            return base64.b64encode(img_bytes).decode('utf-8')
        finally:
            plt.close()

    async def eval_regression(self, model, X_test, y_true, y_pred, title="Regression performance"):
        """Compute regression metrics and render three diagnostic plots.

        Args:
            model: The fitted estimator (not used by the current plots).
            X_test: Test features (not used by the current plots).
            y_true: Ground-truth targets (array supporting ``-``/``min``/``max``).
            y_pred: Predicted targets, same shape as ``y_true``.
            title: Prefix for every plot title.

        Returns:
            dict: ``mae``, ``mse``, ``rmse``, ``r2`` as floats and ``plots``, a
            list of base64 PNG strings in the order actual-vs-predicted,
            residuals, error distribution.
        """
        mse = mean_squared_error(y_true, y_pred)
        res = {
            "mae": float(mean_absolute_error(y_true, y_pred)),
            "mse": float(mse),
            "rmse": float(np.sqrt(mse)),
            "r2": float(r2_score(y_true, y_pred)),
            "plots": [],
        }
        residuals = y_true - y_pred

        # 1. Actual vs Predicted
        plt.figure(figsize=(6, 5))
        plt.scatter(y_true, y_pred, alpha=0.6, edgecolor="k")
        low = min(y_true.min(), y_pred.min())
        high = max(y_true.max(), y_pred.max())
        # Diagonal spanning the common value range: points on it are exact predictions.
        plt.plot([low, high], [low, high], "r--", label="Perfect fit")
        plt.xlabel("True values")
        plt.ylabel("Predicted values")
        plt.title(f"{title} - Actual vs Predicted")
        plt.legend()
        plt.grid(alpha=0.3)
        plt.tight_layout()
        res['plots'].append(self._save_plot("regression_actual_vs_pred"))

        # 2. Residuals Plot
        plt.figure(figsize=(6, 5))
        plt.scatter(y_pred, residuals, alpha=0.6, edgecolor="k", color='orange')
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel("Predicted values")
        plt.ylabel("Residuals")
        plt.title(f"{title} - Residuals Plot")
        plt.grid(alpha=0.3)
        plt.tight_layout()
        res['plots'].append(self._save_plot("regression_residuals"))

        # 3. Error Distribution
        plt.figure(figsize=(6, 5))
        sns.histplot(residuals, kde=True, color='green')
        plt.xlabel("Residual Value")
        plt.title(f"{title} - Error Distribution")
        plt.grid(alpha=0.3)
        plt.tight_layout()
        res['plots'].append(self._save_plot("regression_error_dist"))
        return res

    async def eval_classification(self, model, X_test, y_true, y_pred, labels=None, title="Classification performance"):
        """Compute classification metrics and render the diagnostic plots.

        Args:
            model: The fitted classifier; ``feature_importances_`` or ``coef_``
                is used for the importance plot when present.
            X_test: Test features, forwarded to the decision-region plot.
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            labels: Optional label order for the confusion matrix and its ticks.
            title: Prefix for the plot titles.

        Returns:
            dict: ``accuracy``, ``precision``, ``recall``, ``f1`` (the last
            three weighted by class support) as floats and ``plots``, a list of
            base64 PNG strings: confusion matrix, feature importance (only when
            the model exposes one), decision regions.
        """
        res = {
            "accuracy": float(accuracy_score(y_true, y_pred)),
            "precision": float(precision_score(y_true, y_pred, average="weighted", zero_division=0)),
            "recall": float(recall_score(y_true, y_pred, average="weighted", zero_division=0)),
            "f1": float(f1_score(y_true, y_pred, average="weighted", zero_division=0)),
            "plots": [],
        }

        # 1. Confusion Matrix
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        plt.figure(figsize=(6, 5))
        tick_labels = labels if labels is not None else "auto"
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title(f"{title} - Confusion Matrix")
        plt.tight_layout()
        res['plots'].append(self._save_plot("classification_cm"))

        # 2. Feature Importance (if available)
        importance = None
        if hasattr(model, "feature_importances_"):
            importance = np.asarray(model.feature_importances_)
        elif hasattr(model, "coef_"):
            coef = np.asarray(model.coef_)
            # A 2-D coef_ has one row per class (a single row for binary
            # problems). Averaging |coef| over rows equals the old coef_[0]
            # for one row and no longer ignores the other classes otherwise.
            importance = np.abs(coef).mean(axis=0) if coef.ndim > 1 else np.abs(coef)
        if importance is not None:
            plt.figure(figsize=(6, 5))
            order = np.argsort(importance)
            positions = np.arange(len(importance))
            plt.barh(positions, importance[order], align='center')
            plt.yticks(positions, [f"Feature {i}" for i in order])
            plt.xlabel("Relative Importance")
            plt.title(f"{title} - Feature Importance")
            plt.tight_layout()
            res['plots'].append(self._save_plot("classification_importance"))

        # 3. Decision regions (PCA)
        res['plots'].append(await self.plot_decision_regions_pca(model, X_test, y_true))
        return res

    async def plot_decision_regions_pca(self, model, X, y, title="Decision regions (PCA)", cmap="viridis"):
        """Plot the model's predictions over the first two principal components of ``X``.

        A 300x300 grid is laid over the 2-D PCA projection of ``X`` (padded by
        1.0 on every side), mapped back to the original feature space with
        ``inverse_transform`` and classified by ``model``; the projected points
        are drawn on top, coloured by ``y``.

        Args:
            model: Fitted classifier with a ``predict`` method.
            X: Feature matrix to project.
            y: Labels used to colour the projected points.
            title: Plot title.
            cmap: Matplotlib colormap name shared by regions and points.

        Returns:
            str: Base64 PNG of the figure (also written to ``fig/decision_regions.png``).
        """
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        x_min, x_max = X_pca[:, 0].min() - 1.0, X_pca[:, 0].max() + 1.0
        y_min, y_max = X_pca[:, 1].min() - 1.0, X_pca[:, 1].max() + 1.0
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
        grid_original = pca.inverse_transform(np.c_[xx.ravel(), yy.ravel()])

        grid_pred = np.asarray(model.predict(grid_original)).ravel()
        point_labels = np.asarray(y).ravel()
        if grid_pred.dtype.kind in "USO" or point_labels.dtype.kind in "USO":
            # contourf and scatter need numbers: replace each label by its
            # index in the sorted set of labels seen in predictions and points,
            # so both layers share one colour scale.
            n_grid = grid_pred.size
            _, codes = np.unique(
                np.concatenate([grid_pred, point_labels]), return_inverse=True
            )
            grid_pred, point_labels = codes[:n_grid], codes[n_grid:]
        Z = grid_pred.reshape(xx.shape)

        plt.figure(figsize=(7, 6))
        plt.contourf(xx, yy, Z, alpha=0.3, cmap=cmap)
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=point_labels, cmap=cmap, edgecolor="k", s=40)
        plt.xlabel("PC1")
        plt.ylabel("PC2")
        plt.title(title)
        plt.grid(alpha=0.2)
        plt.tight_layout()
        return self._save_plot("decision_regions")
# (end of extracted file)