# src/components/model_trainer.py
# commit 5aa701f — "adding mlflow registered model loading" (Kshitijk20)
import os, sys
from src.utils.ml_utils.model.estimator import NetworkSecurityModel
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
from src.entity.config_entity import Model_trainer_config
from src.utils.main_utils.utils import save_object, load_object
from src.utils.main_utils.utils import load_numpy_array_data, evaluate_models
from src.utils.ml_utils.metric.classification_metric import classification_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
import mlflow
import dagshub
import os
from dotenv import load_dotenv
# Load .env into os.environ so DagHub/MLflow credentials (if any) are
# available before tracking is initialized below.
load_dotenv()
# Only initialize DagHub once, and make it optional
# NOTE(review): the DAGSHUB_INITIALIZED env-var guard only dedupes within
# this process (and its children) — presumably that is the intent, since a
# module is imported once per process; confirm if cross-process dedup was
# expected.
try:
    if not os.getenv("DAGSHUB_INITIALIZED"):
        # dagshub.init points the MLflow tracking URI at the remote repo.
        dagshub.init(repo_owner='kshitijk146', repo_name='MLOPS_project_network_Security_system', mlflow=True)
        os.environ["DAGSHUB_INITIALIZED"] = "1"
        logging.info("✅ DagHub/MLflow initialized in model_trainer")
except Exception as e:
    # Best-effort: training must still run without remote tracking.
    logging.warning(f"⚠️ DagHub initialization failed: {e}. Continuing without MLflow tracking.")
class ModelTrainer:
    """Train candidate classifiers, pick the best, and persist the result.

    Responsibilities:
      * run hyperparameter search over several sklearn classifiers
        (delegated to ``evaluate_models``),
      * compute train/test classification metrics for the winner,
      * log model + preprocessor + metrics to MLflow,
      * save the combined ``NetworkSecurityModel`` artifact to disk.
    """

    def __init__(self, model_trainer_config: Model_trainer_config, data_transformation_artifact: DataTransformationArtifact):
        """Store the trainer config and the upstream transformation artifact.

        Args:
            model_trainer_config: paths/settings for the trained model output.
            data_transformation_artifact: locations of the transformed
                train/test arrays and the fitted preprocessor object.

        Raises:
            NetworkSecurityException: wraps any unexpected failure.
        """
        try:
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def track_mlflow(self, best_model, preprocessor, classificationmetric):
        """Log model, preprocessor, and metrics to MLflow.

        Args:
            best_model: fitted sklearn estimator to log.
            preprocessor: fitted transformation object to log alongside it.
            classificationmetric: object exposing ``f1_score``,
                ``precision_score`` and ``recall_score`` attributes.

        Returns:
            The MLflow run id, for later retrieval of the logged artifacts.
        """
        with mlflow.start_run() as run:
            mlflow.log_metric("f1_score", classificationmetric.f1_score)
            mlflow.log_metric("precision_score", classificationmetric.precision_score)
            mlflow.log_metric("recall_score", classificationmetric.recall_score)
            # Log both model and preprocessor so inference can be rebuilt
            # from the run alone.
            mlflow.sklearn.log_model(best_model, "model")
            mlflow.sklearn.log_model(preprocessor, "preprocessor")
            logging.info(f"✅ Models logged to MLflow - Run ID: {run.info.run_id}")
            return run.info.run_id

    def train_model(self, x_train, y_train, x_test, y_test):
        """Search over candidate models, train the best, and build the artifact.

        Args:
            x_train, y_train: training features / target.
            x_test, y_test: test features / target.

        Returns:
            ModelTrainerArtifact: trained-model path plus train/test metrics.

        Raises:
            NetworkSecurityException: wraps any failure during training,
                tracking, or persistence (consistent with the other methods).
        """
        try:
            # Candidate estimators; verbose=True keeps the original fit logging.
            models = {
                "KNN": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Random Forest": RandomForestClassifier(verbose=True),
                "AdaBoost": AdaBoostClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(verbose=True),
                "logistic regression": LogisticRegression(verbose=True)
            }
            # Small grids keyed by the model names above.
            params = {
                "KNN": {
                    'n_neighbors': [3, 5, 7],
                    'weights': ['uniform', 'distance'],
                    'metric': ['euclidean']
                },
                "Decision Tree": {
                    'criterion': ['gini', 'entropy'],
                    'max_depth': [None, 5, 10],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                },
                "Random Forest": {
                    'n_estimators': [50, 100],
                    'max_depth': [None, 5],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2],
                    'max_features': ['sqrt']
                },
                "AdaBoost": {
                    'n_estimators': [50, 100],
                    'learning_rate': [0.1, 1.0],
                },
                "Gradient Boosting": {
                    'n_estimators': [50, 100],
                    'learning_rate': [0.1],
                    'max_depth': [3, 5],
                    'min_samples_split': [2],
                    'min_samples_leaf': [1],
                    'max_features': ['sqrt']
                },
                "logistic regression": {
                    'C': [1.0, 10.0],
                    'penalty': ['l2'],
                    'solver': ['liblinear']
                }
            }
            model_report: dict = evaluate_models(
                x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test,
                models=models, params=params)

            # Highest-scoring model; replaces the redundant
            # max(sorted(values)) + list-index lookup. Ties resolve to the
            # first maximal entry, same as the original index() lookup.
            best_model_name = max(model_report, key=model_report.get)
            logging.info(f"best model name: {best_model_name}")
            # NOTE(review): assumes evaluate_models leaves each entry in
            # `models` fitted with its best hyperparameters — confirm.
            best_model = models[best_model_name]

            y_train_pred = best_model.predict(x_train)
            classification_train_metric = classification_score(y_true=y_train, y_pred=y_train_pred)
            y_test_pred = best_model.predict(x_test)
            classification_test_metric = classification_score(y_true=y_test, y_pred=y_test_pred)

            preprocessor = load_object(file_path=self.data_transformation_artifact.transformed_object_file_path)

            # Track to MLflow (logs model + preprocessor + train metrics).
            self.track_mlflow(best_model, preprocessor, classification_train_metric)

            model_dir_path = os.path.dirname(self.model_trainer_config.trained_model_file_path)
            os.makedirs(model_dir_path, exist_ok=True)

            # Bundle preprocessor + model so inference needs a single object.
            network_model = NetworkSecurityModel(preprocessing_object=preprocessor, trained_model_object=best_model)
            save_object(self.model_trainer_config.trained_model_file_path, obj=network_model)
            # Also drop a copy of the bare model at a fixed path
            # (presumably consumed by the deployment step — confirm).
            save_object("final_model/model.pkl", best_model)

            model_trainer_artifact = ModelTrainerArtifact(
                trained_model_file_path=self.model_trainer_config.trained_model_file_path,
                train_metric_artifact=classification_train_metric,
                test_metric_artifact=classification_test_metric,
            )
            logging.info(f"Model trainer artifact: {model_trainer_artifact}")
            return model_trainer_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_model_trainer(self) -> ModelTrainerArtifact:
        """Load the transformed train/test arrays and run model training.

        Returns:
            ModelTrainerArtifact: paths + metrics for the trained model.

        Raises:
            NetworkSecurityException: wraps any failure.
        """
        try:
            train_file_path = self.data_transformation_artifact.transformed_train_file_path
            test_file_path = self.data_transformation_artifact.transformed_test_file_path

            # Arrays hold the feature columns with the target as last column.
            train_array = load_numpy_array_data(train_file_path)
            test_array = load_numpy_array_data(test_file_path)
            x_train, y_train = train_array[:, :-1], train_array[:, -1]
            x_test, y_test = test_array[:, :-1], test_array[:, -1]

            # BUG FIX: the artifact was previously computed but never
            # returned, so callers received None despite the annotation.
            return self.train_model(x_train, y_train, x_test=x_test, y_test=y_test)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e