Spaces:

Kshitijk20
/

NSS

Sleeping

File size: 7,231 Bytes

import os, sys
from src.utils.ml_utils.model.estimator import NetworkSecurityModel
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
from src.entity.config_entity import Model_trainer_config
from src.utils.main_utils.utils import save_object, load_object
from src.utils.main_utils.utils import load_numpy_array_data, evaluate_models
from src.utils.ml_utils.metric.classification_metric import classification_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier, 
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
import mlflow 
import dagshub
import os 
from dotenv import load_dotenv
load_dotenv()

# Best-effort DagsHub/MLflow wiring, executed at import time.
# The DAGSHUB_INITIALIZED environment variable acts as a guard so the
# initialisation is attempted only while that variable is unset/empty;
# any failure is logged as a warning instead of aborting the import.
try:
    if not os.environ.get("DAGSHUB_INITIALIZED"):
        dagshub.init(
            repo_owner='kshitijk146',
            repo_name='MLOPS_project_network_Security_system',
            mlflow=True,
        )
        # Mark the process environment so later checks skip the init.
        os.environ["DAGSHUB_INITIALIZED"] = "1"
        logging.info("✅ DagHub/MLflow initialized in model_trainer")
except Exception as exc:
    logging.warning(f"⚠️ DagHub initialization failed: {exc}. Continuing without MLflow tracking.")

class ModelTrainer:
    """Pick, evaluate and persist the best classifier for the pipeline.

    The trainer loads the transformed train/test arrays referenced by the
    data-transformation artifact, scores a fixed set of scikit-learn
    classifiers through ``evaluate_models``, logs the winning estimator to
    MLflow and saves it together with the preprocessing object.
    """

    def __init__(self, model_trainer_config: Model_trainer_config, data_transformation_artifact: DataTransformationArtifact):
        """Store the trainer configuration and the upstream artifact.

        Args:
            model_trainer_config: Supplies ``trained_model_file_path``.
            data_transformation_artifact: Supplies the transformed train/test
                file paths and the path of the saved preprocessing object.

        Raises:
            NetworkSecurityException: If storing the arguments fails.
        """
        try:
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def track_mlflow(self, best_model, preprocessor, classificationmetric):
        """Log model, preprocessor, and metrics to MLflow.

        Args:
            best_model: Estimator logged under the artifact path ``"model"``.
            preprocessor: Object logged under the artifact path
                ``"preprocessor"``.
            classificationmetric: Object exposing ``f1_score``,
                ``precision_score`` and ``recall_score`` attributes.

        Returns:
            The ID of the MLflow run created for this call.
        """
        with mlflow.start_run() as run:
            mlflow.log_metric("f1_score", classificationmetric.f1_score)
            mlflow.log_metric("precision_score", classificationmetric.precision_score)
            mlflow.log_metric("recall_score", classificationmetric.recall_score)

            # Log both model and preprocessor
            mlflow.sklearn.log_model(best_model, "model")
            mlflow.sklearn.log_model(preprocessor, "preprocessor")

            # Log run ID for easy retrieval
            run_id = run.info.run_id
            logging.info(f"✅ Models logged to MLflow - Run ID: {run_id}")
            return run_id

    @staticmethod
    def _candidate_models():
        """Return the ``(models, params)`` tables handed to ``evaluate_models``.

        Both dictionaries share the same keys: ``models`` maps a display name
        to an unfitted estimator, ``params`` maps it to that estimator's
        hyper-parameter grid.
        """
        models = {
            "KNN": KNeighborsClassifier(),
            "Decision Tree": DecisionTreeClassifier(),
            "Random Forest": RandomForestClassifier(verbose=True),
            "AdaBoost": AdaBoostClassifier(),
            "Gradient Boosting": GradientBoostingClassifier(verbose=True),
            "logistic regression": LogisticRegression(verbose=True),
        }
        params = {
            "KNN": {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean'],
            },
            "Decision Tree": {
                'criterion': ['gini', 'entropy'],
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2],
            },
            "Random Forest": {
                'n_estimators': [50, 100],
                'max_depth': [None, 5],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2],
                'max_features': ['sqrt'],
            },
            "AdaBoost": {
                'n_estimators': [50, 100],
                'learning_rate': [0.1, 1.0],
            },
            "Gradient Boosting": {
                'n_estimators': [50, 100],
                'learning_rate': [0.1],
                'max_depth': [3, 5],
                'min_samples_split': [2],
                'min_samples_leaf': [1],
                'max_features': ['sqrt'],
            },
            "logistic regression": {
                'C': [1.0, 10.0],
                'penalty': ['l2'],
                'solver': ['liblinear'],
            },
        }
        return models, params

    def train_model(self, x_train, y_train,x_test, y_test):
        """Score the candidate models, then track and persist the best one.

        Args:
            x_train: Training feature matrix.
            y_train: Training targets.
            x_test: Test feature matrix.
            y_test: Test targets.

        Returns:
            ModelTrainerArtifact holding the saved model path plus the train
            and test classification metrics of the selected model.

        Raises:
            ValueError: If ``evaluate_models`` returns an empty report.
        """
        models, params = self._candidate_models()

        model_report: dict = evaluate_models(
            x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, models=models, params=params)

        if not model_report:
            raise ValueError("evaluate_models returned an empty report; no model to select")

        # Highest score wins; on ties the first entry in report order is kept.
        best_model_name = max(model_report, key=model_report.get)
        logging.info(f"best model name: {best_model_name}")

        # NOTE(review): predict() is called without fitting here, so this
        # assumes evaluate_models fitted the estimators in `models` in place
        # -- TODO confirm against its implementation.
        best_model = models[best_model_name]

        y_train_pred = best_model.predict(x_train)
        classification_train_metric = classification_score(y_true=y_train, y_pred=y_train_pred)

        y_test_pred = best_model.predict(x_test)
        classification_test_metric = classification_score(y_true=y_test, y_pred=y_test_pred)

        preprocessor = load_object(file_path=self.data_transformation_artifact.transformed_object_file_path)

        # Track to MLflow (logs model + preprocessor). Only the train metrics
        # are logged.
        # NOTE(review): this runs before the model is written to disk, so a
        # tracking failure aborts training without a saved model -- confirm
        # this ordering is intended.
        self.track_mlflow(best_model, preprocessor, classification_train_metric)

        trained_model_path = self.model_trainer_config.trained_model_file_path
        os.makedirs(os.path.dirname(trained_model_path), exist_ok=True)

        # Persist the preprocessor + estimator bundle at the configured path,
        # and the bare estimator at a fixed relative path.
        network_model = NetworkSecurityModel(preprocessing_object=preprocessor, trained_model_object=best_model)
        save_object(trained_model_path, obj=network_model)
        save_object("final_model/model.pkl", best_model)

        model_trainer_artifact = ModelTrainerArtifact(
            trained_model_file_path=trained_model_path,
            train_metric_artifact=classification_train_metric,
            test_metric_artifact=classification_test_metric,
        )
        logging.info(f"Model trainer artifact: {model_trainer_artifact}")
        return model_trainer_artifact

    def initiate_model_trainer(self)-> ModelTrainerArtifact:
        """Load the transformed arrays, train, and return the resulting artifact.

        The last column of each loaded array is used as the target and the
        remaining columns as features.

        Returns:
            The ModelTrainerArtifact produced by ``train_model``.

        Raises:
            NetworkSecurityException: Wraps any error raised while loading the
                arrays or training.
        """
        try:
            train_file_path = self.data_transformation_artifact.transformed_train_file_path
            test_file_path = self.data_transformation_artifact.transformed_test_file_path

            # loading training array and testing array
            train_array = load_numpy_array_data(train_file_path)
            test_array = load_numpy_array_data(test_file_path)

            x_train, y_train = train_array[:, :-1], train_array[:, -1]
            x_test, y_test = test_array[:, :-1], test_array[:, -1]

            # Return the artifact: the method previously dropped it and
            # implicitly returned None despite its annotation.
            return self.train_model(x_train, y_train, x_test=x_test, y_test=y_test)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e