# src/components/model_trainer.py
# commit 5aa701f — "adding mlflow registered model loading" (Kshitijk20)
import os, sys
from src.utils.ml_utils.model.estimator import NetworkSecurityModel
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
from src.entity.config_entity import Model_trainer_config
from src.utils.main_utils.utils import save_object, load_object
from src.utils.main_utils.utils import load_numpy_array_data, evaluate_models
from src.utils.ml_utils.metric.classification_metric import classification_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
import mlflow
import dagshub
import os
from dotenv import load_dotenv
# Load .env into os.environ so DagHub/MLflow credentials (if any) are
# available before tracking is initialized below.
load_dotenv()
# Only initialize DagHub once, and make it optional
# NOTE(review): the DAGSHUB_INITIALIZED env-var guard only dedupes within
# this process (and its children) — presumably that is the intent, since a
# module is imported once per process; confirm if cross-process dedup was
# expected.
try:
    if not os.getenv("DAGSHUB_INITIALIZED"):
        # dagshub.init points the MLflow tracking URI at the remote repo.
        dagshub.init(repo_owner='kshitijk146', repo_name='MLOPS_project_network_Security_system', mlflow=True)
        os.environ["DAGSHUB_INITIALIZED"] = "1"
        logging.info("✅ DagHub/MLflow initialized in model_trainer")
except Exception as e:
    # Best-effort: training must still run without remote tracking.
    logging.warning(f"⚠️ DagHub initialization failed: {e}. Continuing without MLflow tracking.")
class ModelTrainer:
    """Train candidate classifiers, pick the best, and persist the result.

    Responsibilities:
      * run hyperparameter search over several sklearn classifiers
        (delegated to ``evaluate_models``),
      * compute train/test classification metrics for the winner,
      * log model + preprocessor + metrics to MLflow,
      * save the combined ``NetworkSecurityModel`` artifact to disk.
    """

    def __init__(self, model_trainer_config: Model_trainer_config, data_transformation_artifact: DataTransformationArtifact):
        """Store the trainer config and the upstream transformation artifact.

        Args:
            model_trainer_config: paths/settings for the trained model output.
            data_transformation_artifact: locations of the transformed
                train/test arrays and the fitted preprocessor object.

        Raises:
            NetworkSecurityException: wraps any unexpected failure.
        """
        try:
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def track_mlflow(self, best_model, preprocessor, classificationmetric):
        """Log model, preprocessor, and metrics to MLflow.

        Args:
            best_model: fitted sklearn estimator to log.
            preprocessor: fitted transformation object to log alongside it.
            classificationmetric: object exposing ``f1_score``,
                ``precision_score`` and ``recall_score`` attributes.

        Returns:
            The MLflow run id, for later retrieval of the logged artifacts.
        """
        with mlflow.start_run() as run:
            mlflow.log_metric("f1_score", classificationmetric.f1_score)
            mlflow.log_metric("precision_score", classificationmetric.precision_score)
            mlflow.log_metric("recall_score", classificationmetric.recall_score)
            # Log both model and preprocessor so inference can be rebuilt
            # from the run alone.
            mlflow.sklearn.log_model(best_model, "model")
            mlflow.sklearn.log_model(preprocessor, "preprocessor")
            logging.info(f"✅ Models logged to MLflow - Run ID: {run.info.run_id}")
            return run.info.run_id

    def train_model(self, x_train, y_train, x_test, y_test):
        """Search over candidate models, train the best, and build the artifact.

        Args:
            x_train, y_train: training features / target.
            x_test, y_test: test features / target.

        Returns:
            ModelTrainerArtifact: trained-model path plus train/test metrics.

        Raises:
            NetworkSecurityException: wraps any failure during training,
                tracking, or persistence (consistent with the other methods).
        """
        try:
            # Candidate estimators; verbose=True keeps the original fit logging.
            models = {
                "KNN": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Random Forest": RandomForestClassifier(verbose=True),
                "AdaBoost": AdaBoostClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(verbose=True),
                "logistic regression": LogisticRegression(verbose=True)
            }
            # Small grids keyed by the model names above.
            params = {
                "KNN": {
                    'n_neighbors': [3, 5, 7],
                    'weights': ['uniform', 'distance'],
                    'metric': ['euclidean']
                },
                "Decision Tree": {
                    'criterion': ['gini', 'entropy'],
                    'max_depth': [None, 5, 10],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                },
                "Random Forest": {
                    'n_estimators': [50, 100],
                    'max_depth': [None, 5],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2],
                    'max_features': ['sqrt']
                },
                "AdaBoost": {
                    'n_estimators': [50, 100],
                    'learning_rate': [0.1, 1.0],
                },
                "Gradient Boosting": {
                    'n_estimators': [50, 100],
                    'learning_rate': [0.1],
                    'max_depth': [3, 5],
                    'min_samples_split': [2],
                    'min_samples_leaf': [1],
                    'max_features': ['sqrt']
                },
                "logistic regression": {
                    'C': [1.0, 10.0],
                    'penalty': ['l2'],
                    'solver': ['liblinear']
                }
            }
            model_report: dict = evaluate_models(
                x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test,
                models=models, params=params)

            # Highest-scoring model; replaces the redundant
            # max(sorted(values)) + list-index lookup. Ties resolve to the
            # first maximal entry, same as the original index() lookup.
            best_model_name = max(model_report, key=model_report.get)
            logging.info(f"best model name: {best_model_name}")
            # NOTE(review): assumes evaluate_models leaves each entry in
            # `models` fitted with its best hyperparameters — confirm.
            best_model = models[best_model_name]

            y_train_pred = best_model.predict(x_train)
            classification_train_metric = classification_score(y_true=y_train, y_pred=y_train_pred)
            y_test_pred = best_model.predict(x_test)
            classification_test_metric = classification_score(y_true=y_test, y_pred=y_test_pred)

            preprocessor = load_object(file_path=self.data_transformation_artifact.transformed_object_file_path)

            # Track to MLflow (logs model + preprocessor + train metrics).
            self.track_mlflow(best_model, preprocessor, classification_train_metric)

            model_dir_path = os.path.dirname(self.model_trainer_config.trained_model_file_path)
            os.makedirs(model_dir_path, exist_ok=True)

            # Bundle preprocessor + model so inference needs a single object.
            network_model = NetworkSecurityModel(preprocessing_object=preprocessor, trained_model_object=best_model)
            save_object(self.model_trainer_config.trained_model_file_path, obj=network_model)
            # Also drop a copy of the bare model at a fixed path
            # (presumably consumed by the deployment step — confirm).
            save_object("final_model/model.pkl", best_model)

            model_trainer_artifact = ModelTrainerArtifact(
                trained_model_file_path=self.model_trainer_config.trained_model_file_path,
                train_metric_artifact=classification_train_metric,
                test_metric_artifact=classification_test_metric,
            )
            logging.info(f"Model trainer artifact: {model_trainer_artifact}")
            return model_trainer_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_model_trainer(self) -> ModelTrainerArtifact:
        """Load the transformed train/test arrays and run model training.

        Returns:
            ModelTrainerArtifact: paths + metrics for the trained model.

        Raises:
            NetworkSecurityException: wraps any failure.
        """
        try:
            train_file_path = self.data_transformation_artifact.transformed_train_file_path
            test_file_path = self.data_transformation_artifact.transformed_test_file_path

            # Arrays hold the feature columns with the target as last column.
            train_array = load_numpy_array_data(train_file_path)
            test_array = load_numpy_array_data(test_file_path)
            x_train, y_train = train_array[:, :-1], train_array[:, -1]
            x_test, y_test = test_array[:, :-1], test_array[:, -1]

            # BUG FIX: the artifact was previously computed but never
            # returned, so callers received None despite the annotation.
            return self.train_model(x_train, y_train, x_test=x_test, y_test=y_test)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e