import os, sys
from src.utils.ml_utils.model.estimator import NetworkSecurityModel
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
from src.entity.config_entity import Model_trainer_config
from src.utils.main_utils.utils import save_object, load_object
from src.utils.main_utils.utils import load_numpy_array_data, evaluate_models
from src.utils.ml_utils.metric.classification_metric import classification_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
import mlflow
import dagshub
import os
from dotenv import load_dotenv
# Load environment variables (e.g. DagHub credentials) from a local .env file.
load_dotenv()
# Only initialize DagHub once, and make it optional
try:
    # Env-var flag guards against repeated initialization when this module is
    # imported more than once in the same process (or by child processes that
    # inherit the environment).
    if not os.getenv("DAGSHUB_INITIALIZED"):
        dagshub.init(repo_owner='kshitijk146', repo_name='MLOPS_project_network_Security_system', mlflow=True)
        os.environ["DAGSHUB_INITIALIZED"] = "1"
        logging.info("✅ DagHub/MLflow initialized in model_trainer")
except Exception as e:
    # Best-effort: tracking is optional, so training continues without MLflow
    # rather than failing at import time (e.g. no network / bad credentials).
    logging.warning(f"⚠️ DagHub initialization failed: {e}. Continuing without MLflow tracking.")
class ModelTrainer:
    """Train candidate classifiers, select the best one by validation score,
    log model + preprocessor to MLflow, and persist a deployable artifact.

    Annotations referencing project types are written as strings so this
    class can be defined without eagerly resolving those names.
    """

    def __init__(self, model_trainer_config: "Model_trainer_config",
                 data_transformation_artifact: "DataTransformationArtifact"):
        """Store the trainer configuration and the upstream transformation artifact.

        Raises:
            NetworkSecurityException: wraps any unexpected error.
        """
        try:
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def track_mlflow(self, best_model, preprocessor, classificationmetric):
        """Log model, preprocessor, and classification metrics to MLflow.

        Returns:
            The MLflow run id, so the logged artifacts can be retrieved later.
        """
        with mlflow.start_run() as run:
            mlflow.log_metric("f1_score", classificationmetric.f1_score)
            mlflow.log_metric("precision_score", classificationmetric.precision_score)
            mlflow.log_metric("recall_score", classificationmetric.recall_score)
            # Log both the estimator and the fitted preprocessor so inference
            # can rebuild the full pipeline from the MLflow run alone.
            mlflow.sklearn.log_model(best_model, "model")
            mlflow.sklearn.log_model(preprocessor, "preprocessor")
            # Log run ID for easy retrieval
            logging.info(f"✅ Models logged to MLflow - Run ID: {run.info.run_id}")
            return run.info.run_id

    def train_model(self, x_train, y_train, x_test, y_test):
        """Grid-search several classifiers, evaluate the best one, track it to
        MLflow, and persist model + preprocessor.

        Returns:
            ModelTrainerArtifact: trained-model path plus train/test metrics.

        Raises:
            NetworkSecurityException: wraps any failure during training
            (consistent with the other methods of this class).
        """
        try:
            models = {
                "KNN": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Random Forest": RandomForestClassifier(verbose=True),
                "AdaBoost": AdaBoostClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(verbose=True),
                "logistic regression": LogisticRegression(verbose=True),
            }
            # Hyperparameter grids, intentionally small to keep search cheap.
            params = {
                "KNN": {
                    'n_neighbors': [3, 5, 7],
                    'weights': ['uniform', 'distance'],
                    'metric': ['euclidean'],
                },
                "Decision Tree": {
                    'criterion': ['gini', 'entropy'],
                    'max_depth': [None, 5, 10],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2],
                },
                "Random Forest": {
                    'n_estimators': [50, 100],
                    'max_depth': [None, 5],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2],
                    'max_features': ['sqrt'],
                },
                "AdaBoost": {
                    'n_estimators': [50, 100],
                    'learning_rate': [0.1, 1.0],
                    # 'algorithm': ['SAMME.R']
                },
                "Gradient Boosting": {
                    'n_estimators': [50, 100],
                    'learning_rate': [0.1],
                    'max_depth': [3, 5],
                    'min_samples_split': [2],
                    'min_samples_leaf': [1],
                    'max_features': ['sqrt'],
                },
                "logistic regression": {
                    'C': [1.0, 10.0],
                    'penalty': ['l2'],
                    'solver': ['liblinear'],
                },
            }

            model_report: dict = evaluate_models(
                x_train=x_train, y_train=y_train,
                x_test=x_test, y_test=y_test,
                models=models, params=params,
            )
            # Best validation score (max alone suffices; sorted() was redundant).
            best_model_score = max(model_report.values())
            # Name of the model that produced that score.
            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
            logging.info(f"best model name: {best_model_name}")
            # NOTE(review): assumes evaluate_models fitted the estimators in
            # place — TODO confirm against src.utils.main_utils.utils.
            best_model = models[best_model_name]

            y_train_pred = best_model.predict(x_train)
            classification_train_metric = classification_score(y_true=y_train, y_pred=y_train_pred)
            y_test_pred = best_model.predict(x_test)
            classification_test_metric = classification_score(y_true=y_test, y_pred=y_test_pred)

            preprocessor = load_object(file_path=self.data_transformation_artifact.transformed_object_file_path)
            # Track to MLflow (logs model + preprocessor).
            self.track_mlflow(best_model, preprocessor, classification_train_metric)

            model_dir_path = os.path.dirname(self.model_trainer_config.trained_model_file_path)
            os.makedirs(model_dir_path, exist_ok=True)

            # Bundle preprocessor + estimator so inference applies the same
            # transformation that training used.
            network_model = NetworkSecurityModel(
                preprocessing_object=preprocessor, trained_model_object=best_model
            )
            save_object(self.model_trainer_config.trained_model_file_path, obj=network_model)
            # Also persist the raw estimator for the serving layer.
            save_object("final_model/model.pkl", best_model)

            model_trainer_artifact = ModelTrainerArtifact(
                trained_model_file_path=self.model_trainer_config.trained_model_file_path,
                train_metric_artifact=classification_train_metric,
                test_metric_artifact=classification_test_metric,
            )
            logging.info(f"Model trainer artifact: {model_trainer_artifact}")
            return model_trainer_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_model_trainer(self) -> "ModelTrainerArtifact":
        """Load transformed train/test arrays, split features/target, and train.

        BUG FIX: the original computed the artifact via train_model but never
        returned it, so callers always received None despite the declared
        return type. The result is now returned.

        Raises:
            NetworkSecurityException: wraps any failure.
        """
        try:
            train_file_path = self.data_transformation_artifact.transformed_train_file_path
            test_file_path = self.data_transformation_artifact.transformed_test_file_path

            # Load the transformed numpy arrays produced upstream.
            train_array = load_numpy_array_data(train_file_path)
            test_array = load_numpy_array_data(test_file_path)

            # Last column is the target; the rest are features.
            x_train, y_train, x_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            return self.train_model(x_train, y_train, x_test=x_test, y_test=y_test)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e