import sys, os
import json
import pandas as pd
from typing import Any
from pathlib import Path
from .data_preprocessing import DataPreprocessing
from src.core.constants import PARAMS_FILE
from src.core.logger import logging
from src.core.exception import AppException
from src.core.configuration import AppConfiguration
from src.utils import create_directory, read_yaml, load_obj
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
import gc
import dagshub
import mlflow
from mlflow.xgboost import log_model
from src.script.model_wrapper import CustomModel
from dotenv import load_dotenv
load_dotenv()
# get environment variables
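# Expected .env entries (names taken from the getenv calls below):
#   MLFLOW_URI     - MLflow tracking server URI
#   DAGSHUB_TOKEN  - DagsHub access token (used as the MLflow tracking password)
#   OWNER          - DagsHub username / repo owner (used as the MLflow tracking username)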
uri = os.getenv("MLFLOW_URI")
dagshub_token = os.getenv("DAGSHUB_TOKEN")
dagshub_username = os.getenv("OWNER")
if not dagshub_token or not dagshub_username:
    raise EnvironmentError("Dagshub environment variables are not set")
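# MLflow picks up these two variables for HTTP basic auth against the remote tracking server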
os.environ["MLFLOW_TRACKING_USERNAME"] = dagshub_username
os.environ["MLFLOW_TRACKING_PASSWORD"] = dagshub_token
mlflow.set_tracking_uri(uri) # type: ignore
# For local use
# =================================================================================
# repo_owner = os.getenv("OWNER")
# repo_name = os.getenv("REPO")
#
# mlflow.set_tracking_uri(uri)
# if repo_owner is None:
# raise EnvironmentError("Missing dagshub logging environment credentials.")
# dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)
# ==================================================================================
class ModelEvaluation:
    def __init__(self, config=AppConfiguration()):
        """
        Initializes the ModelEvaluation object by creating a model evaluation configuration.

        Args:
            config (AppConfiguration): The configuration object containing the application configuration.
        """
        try:
            self.evaluation_config = config.model_evaluation_config()
        except Exception as e:
            logging.error(f"Failed to create model evaluation configuration: {e}", exc_info=True)
            raise AppException(e, sys)

    def save_report(self, report: dict):
        """
        Saves the model evaluation metrics to a JSON file in the "reports" directory.

        Args:
            report (dict): A dictionary containing the model evaluation metrics.
        """
        try:
            create_directory(Path("reports"))
            with open("reports/metrics.json", 'w') as f:
                json.dump(report, f, indent=4)
        except Exception as e:
            logging.error(f"Failed to save model evaluation report: {e}", exc_info=True)
            raise AppException(e, sys)

    def save_experiment_info(self, model_name: str, run_id: str):
        """
        Saves the model name and the Mlflow run ID to a JSON file
        for logging purposes.

        Args:
            model_name (str): The name of the model
            run_id (str): The Mlflow run ID
        """
        try:
            experiment_info = {
                'model': model_name,
                'run_id': run_id,
            }
            exp_info_path = Path("reports/experiment.json")
            with open(exp_info_path, 'w') as f:
                json.dump(experiment_info, f, indent=4)
        except Exception as e:
            logging.error(f"Failed to save model experiment info: {e}", exc_info=True)
            raise AppException(e, sys)

    def evaluate(self, model: Any, model_name: str, vectorizer: Any, vectorizer_name: str,
                 eval_threshold: float, df: pd.DataFrame) -> tuple[dict, dict]:
        """
        Evaluates the given model and vectorizer on the given dataset.

        Args:
            model (Any): The model to evaluate.
            model_name (str): The name of the model.
            vectorizer (Any): The vectorizer to use.
            vectorizer_name (str): The name of the vectorizer.
            eval_threshold (float): Probability threshold used to turn predicted probabilities into class labels.
            df (pd.DataFrame): The test dataset to evaluate on.

        Returns:
            tuple: A tuple containing the evaluation report and the model parameters.
        """
        try:
            X_test = df.drop(columns='Label')
            # Perform feature extraction
            X_test = vectorizer.transform(X_test["Content"])
            logging.info(f"Evaluating model: {model_name}")
            # Predict using the model
            y_probs = model.predict_proba(X_test)[:, 1]
            y_pred = (y_probs >= eval_threshold).astype(int)
            # Get the true labels
            y_test = df['Label'].values
            evaluation_report = {
                "model": model_name,
                "vectorizer": vectorizer_name,
                "threshold": eval_threshold,
                "accuracy": accuracy_score(y_test, y_pred),  # type: ignore
                "precision": precision_score(y_test, y_pred),  # type: ignore
                "recall": recall_score(y_test, y_pred),  # type: ignore
                "f1 score": f1_score(y_test, y_pred),  # type: ignore
                "roc_auc": roc_auc_score(y_test, y_probs)  # type: ignore
            }
            # Get XGBoost model parameters
            model_params: dict = model.get_xgb_params()
            return (
                evaluation_report,
                model_params
            )
        except Exception as e:
            logging.error(f"Failed in model evaluation process: {e}", exc_info=True)
            raise AppException(e, sys)
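
# Example (sketch) of calling ModelEvaluation.evaluate on its own, assuming an already
# fitted classifier, a fitted vectorizer, and a preprocessed frame with "Content"/"Label"
# columns ("xgboost", "tfidf" and 0.5 below are placeholder values, not project names):
#
#   evaluator = ModelEvaluation()
#   report, params = evaluator.evaluate(
#       model=model, model_name="xgboost", vectorizer=vectorizer,
#       vectorizer_name="tfidf", eval_threshold=0.5, df=test_df,
#   )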

def initiate_model_evaluation():
    """
    Initiates the model evaluation process by creating a ModelEvaluation object,
    which then evaluates the model using the evaluate method and logs the
    evaluation metrics in Mlflow.
    """
    eval_obj = ModelEvaluation()
    preprocessor = DataPreprocessing()
    logging.info(f"{'='*20}Model Evaluation{'='*20}")
    # read pipeline parameters (model name, vectorizer name, evaluation threshold)
    config_params = read_yaml(PARAMS_FILE)
    model_name = config_params.model_training.model_name
    vectorizer_name = config_params.feature_engineering.vectorizer
    eval_threshold = config_params.model_evaluation.threshold
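    # The params file referenced by PARAMS_FILE is expected to expose at least the keys
    # read above (a sketch of the layout, not the full file):
    #   model_training.model_name, feature_engineering.vectorizer, model_evaluation.threshold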
    test_data_path = eval_obj.evaluation_config.test_data_path
    model_path = eval_obj.evaluation_config.models_dir
    test_df = pd.read_parquet(test_data_path)
    test_df.dropna(how='any', inplace=True)
    processed_test_df = preprocessor.preprocess(test_df, filename="preprocessed_test_data.feather")
    mlflow.set_experiment("DVC Pipeline Model Experiments")
    with mlflow.start_run(run_name=model_name) as run:
        try:
            model = load_obj(location_path=model_path, obj_name="model.joblib")
            vectorizer = load_obj(location_path=model_path, obj_name="vectorizer.joblib")
            evaluation_report, model_params = eval_obj.evaluate(model=model, model_name=model_name, vectorizer=vectorizer,
                                                                vectorizer_name=vectorizer_name, eval_threshold=eval_threshold,
                                                                df=processed_test_df)
            eval_obj.save_report(evaluation_report)
            logging.info("Logging model and different parameters in Mlflow")
            # Log model evaluation metrics in Mlflow
            # remove the model and vectorizer name from the report
            evaluation_report.pop("model", None)
            evaluation_report.pop("vectorizer", None)
            for metric_name, metric_score in evaluation_report.items():
                mlflow.log_metric(metric_name, metric_score)
            # log model parameters in Mlflow
            if model_params is not None:
                for param_name, param_value in model_params.items():
                    mlflow.log_param(param_name, param_value)
                mlflow.log_param("model", model_name)
                mlflow.log_param("vectorizer", vectorizer_name)
            else:
                logging.warning("No model parameters found. Skipping parameter logging")
            # CustomModel wraps the classifier and the vectorizer into a single Python Model object
            final_model = CustomModel(model=model, vectorizer=vectorizer)
            # Log the model and artifacts
            mlflow.pyfunc.log_model(
                artifact_path=model_name,
                python_model=final_model,
                artifacts={"vectorizer": os.path.join(model_path, "vectorizer.joblib"),
                           "model": os.path.join(model_path, "model.joblib"),
                           "booster": os.path.join(model_path, "booster.json"),
                           "metrics": os.path.join("reports", "metrics.json")}
            )
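            # The packaged pyfunc model can later be reloaded in one piece, e.g.:
            #   mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/{model_name}")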
            # save model info
            eval_obj.save_experiment_info(model_name=model_name, run_id=run.info.run_id)
            logging.info("Model evaluation completed")
            # free memory
            del test_df, processed_test_df, model, vectorizer
            gc.collect()
        except Exception as e:
            logging.error(f"Error during model evaluation: {e}", exc_info=True)
            raise AppException(e, sys)


if __name__ == "__main__":
    initiate_model_evaluation()
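
# Typically executed as a DVC pipeline stage or as a module; the module path below is an
# assumption based on the sibling imports above (src.script.model_wrapper):
#   python -m src.script.model_evaluation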