Upload 21 files
Browse files- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/explanability/__init__.py +0 -0
- src/explanability/__pycache__/__init__.cpython-310.pyc +0 -0
- src/explanability/__pycache__/shap_explainer.cpython-310.pyc +0 -0
- src/explanability/shap_explainer.py +20 -0
- src/monitoring/__init___.py +0 -0
- src/monitoring/__pycache__/db.cpython-310.pyc +0 -0
- src/monitoring/__pycache__/drift.cpython-310.pyc +0 -0
- src/monitoring/db.py +60 -0
- src/monitoring/drift.py +35 -0
- src/pipeline/__init__.py +0 -0
- src/pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
- src/pipeline/__pycache__/predict_pipeline.cpython-310.pyc +0 -0
- src/pipeline/__pycache__/retrain_pipeline.cpython-310.pyc +0 -0
- src/pipeline/__pycache__/train_pipeline.cpython-310.pyc +0 -0
- src/pipeline/predict_pipeline.py +47 -0
- src/pipeline/retrain_pipeline.py +36 -0
- src/pipeline/train_pipeline.py +103 -0
- src/utils/__init.py +0 -0
- src/utils/common.py +9 -0
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (142 Bytes). View file
|
|
|
src/explanability/__init__.py
ADDED
|
File without changes
|
src/explanability/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (156 Bytes). View file
|
|
|
src/explanability/__pycache__/shap_explainer.cpython-310.pyc
ADDED
|
Binary file (938 Bytes). View file
|
|
|
src/explanability/shap_explainer.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import shap
|
| 2 |
+
import joblib
|
| 3 |
+
import os
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
class ShapExplainer:
|
| 7 |
+
|
| 8 |
+
def __init__(self):
|
| 9 |
+
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 10 |
+
ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
|
| 11 |
+
|
| 12 |
+
self.model = joblib.load(os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
|
| 13 |
+
|
| 14 |
+
# 🔥 FIX: TreeExplainer is specifically built for tree-based models like XGBoost
|
| 15 |
+
self.explainer = shap.TreeExplainer(self.model)
|
| 16 |
+
|
| 17 |
+
def explain(self, data: pd.DataFrame):
|
| 18 |
+
# Generate SHAP values for the given data
|
| 19 |
+
shap_values = self.explainer(data)
|
| 20 |
+
return shap_values
|
src/monitoring/__init___.py
ADDED
|
File without changes
|
src/monitoring/__pycache__/db.cpython-310.pyc
ADDED
|
Binary file (2.35 kB). View file
|
|
|
src/monitoring/__pycache__/drift.cpython-310.pyc
ADDED
|
Binary file (1.15 kB). View file
|
|
|
src/monitoring/db.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sqlalchemy import create_engine, Column, Integer, Float
|
| 4 |
+
from sqlalchemy.orm import declarative_base, sessionmaker
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
# Load environment variables (.env)
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
# --- NEW: Cloud Database Connection via SQLAlchemy ---
|
| 11 |
+
DATABASE_URL = os.getenv("DATABASE_URL")
|
| 12 |
+
if not DATABASE_URL:
|
| 13 |
+
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 14 |
+
DB_PATH = os.path.join(BASE_DIR, "data", "fraud.db")
|
| 15 |
+
DATABASE_URL = f"sqlite:///{DB_PATH}"
|
| 16 |
+
|
| 17 |
+
# Initialize Engine and Session
|
| 18 |
+
engine = create_engine(DATABASE_URL)
|
| 19 |
+
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 20 |
+
Base = declarative_base()
|
| 21 |
+
|
| 22 |
+
# 🔹 Define the schema for the transactions table
|
| 23 |
+
class TransactionLog(Base):
|
| 24 |
+
__tablename__ = "transactions"
|
| 25 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 26 |
+
Time = Column(Float)
|
| 27 |
+
V1 = Column(Float); V2 = Column(Float); V3 = Column(Float); V4 = Column(Float)
|
| 28 |
+
V5 = Column(Float); V6 = Column(Float); V7 = Column(Float); V8 = Column(Float)
|
| 29 |
+
V9 = Column(Float); V10 = Column(Float); V11 = Column(Float); V12 = Column(Float)
|
| 30 |
+
V13 = Column(Float); V14 = Column(Float); V15 = Column(Float); V16 = Column(Float)
|
| 31 |
+
V17 = Column(Float); V18 = Column(Float); V19 = Column(Float); V20 = Column(Float)
|
| 32 |
+
V21 = Column(Float); V22 = Column(Float); V23 = Column(Float); V24 = Column(Float)
|
| 33 |
+
V25 = Column(Float); V26 = Column(Float); V27 = Column(Float); V28 = Column(Float)
|
| 34 |
+
Amount = Column(Float)
|
| 35 |
+
prediction = Column(Integer)
|
| 36 |
+
probability = Column(Float)
|
| 37 |
+
|
| 38 |
+
# 🔥 HUMAN-IN-THE-LOOP UPGRADE: The true label provided by a human analyst later
|
| 39 |
+
Actual_Class = Column(Integer, nullable=True)
|
| 40 |
+
|
| 41 |
+
def init_db():
|
| 42 |
+
"""Create tables if they do not exist."""
|
| 43 |
+
Base.metadata.create_all(bind=engine)
|
| 44 |
+
|
| 45 |
+
def save_to_db(data: dict, pred: int, prob: float):
|
| 46 |
+
db = SessionLocal()
|
| 47 |
+
try:
|
| 48 |
+
new_tx = TransactionLog(**data, prediction=pred, probability=prob)
|
| 49 |
+
db.add(new_tx)
|
| 50 |
+
db.commit()
|
| 51 |
+
except Exception as e:
|
| 52 |
+
# 🔥 THE FIX: If anything goes wrong, roll back the transaction so the DB doesn't freeze
|
| 53 |
+
db.rollback()
|
| 54 |
+
print(f"⚠️ Database insertion failed: {e}")
|
| 55 |
+
finally:
|
| 56 |
+
db.close()
|
| 57 |
+
|
| 58 |
+
def load_data_from_db():
|
| 59 |
+
"""Load all database records into a Pandas DataFrame for Retraining/Drift detection."""
|
| 60 |
+
return pd.read_sql("SELECT * FROM transactions", engine)
|
src/monitoring/drift.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from evidently.report import Report
|
| 3 |
+
from evidently.metric_preset import DataDriftPreset
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
| 8 |
+
from src.monitoring.db import load_data_from_db
|
| 9 |
+
|
| 10 |
+
def detect_drift(reference_path):
|
| 11 |
+
# Load reference (original training data)
|
| 12 |
+
reference_data = pd.read_csv(reference_path).drop("Class", axis=1)
|
| 13 |
+
|
| 14 |
+
# Load current data from live DB
|
| 15 |
+
current_data = load_data_from_db()
|
| 16 |
+
|
| 17 |
+
# Drop tracking columns so we only compare the raw features (V1-V28, Time, Amount)
|
| 18 |
+
current_data = current_data.drop(["id", "prediction", "probability", "Actual_Class"], axis=1)
|
| 19 |
+
|
| 20 |
+
if len(current_data) < 50:
|
| 21 |
+
print("⚠️ Not enough live data for drift detection")
|
| 22 |
+
return None
|
| 23 |
+
|
| 24 |
+
# Run statistical tests via Evidently AI
|
| 25 |
+
report = Report(metrics=[DataDriftPreset()])
|
| 26 |
+
report.run(reference_data=reference_data, current_data=current_data)
|
| 27 |
+
|
| 28 |
+
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 29 |
+
REPORT_PATH = os.path.join(BASE_DIR, "reports")
|
| 30 |
+
os.makedirs(REPORT_PATH, exist_ok=True)
|
| 31 |
+
|
| 32 |
+
path = os.path.join(REPORT_PATH, "drift_report.html")
|
| 33 |
+
report.save_html(path)
|
| 34 |
+
|
| 35 |
+
return path
|
src/pipeline/__init__.py
ADDED
|
File without changes
|
src/pipeline/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (151 Bytes). View file
|
|
|
src/pipeline/__pycache__/predict_pipeline.cpython-310.pyc
ADDED
|
Binary file (1.61 kB). View file
|
|
|
src/pipeline/__pycache__/retrain_pipeline.cpython-310.pyc
ADDED
|
Binary file (1.21 kB). View file
|
|
|
src/pipeline/__pycache__/train_pipeline.cpython-310.pyc
ADDED
|
Binary file (3.47 kB). View file
|
|
|
src/pipeline/predict_pipeline.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prediction Pipeline
|
| 3 |
+
|
| 4 |
+
Loads trained models and scaler, applies preprocessing,
|
| 5 |
+
and returns final prediction using an ensemble threshold.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import joblib
|
| 9 |
+
import os
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
class PredictPipeline:
|
| 13 |
+
|
| 14 |
+
def __init__(self):
|
| 15 |
+
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 16 |
+
ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
|
| 17 |
+
|
| 18 |
+
self.rf_model = joblib.load(os.path.join(ARTIFACTS_PATH, "rf_model.pkl"))
|
| 19 |
+
self.xgb_model = joblib.load(os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
|
| 20 |
+
self.scaler = joblib.load(os.path.join(ARTIFACTS_PATH, "scaler.pkl"))
|
| 21 |
+
|
| 22 |
+
def preprocess(self, data: pd.DataFrame):
|
| 23 |
+
"""Apply same preprocessing as training"""
|
| 24 |
+
data = data.copy()
|
| 25 |
+
|
| 26 |
+
# 🔥 THE FIX: Add .flatten() to strip away the array brackets
|
| 27 |
+
# This turns [[value]] into a raw, safe float for SHAP
|
| 28 |
+
data["Amount"] = self.scaler.transform(data[["Amount"]]).flatten()
|
| 29 |
+
|
| 30 |
+
return data
|
| 31 |
+
|
| 32 |
+
def predict(self, data: pd.DataFrame):
|
| 33 |
+
"""Make prediction using ensemble logic"""
|
| 34 |
+
# Preprocess the data first
|
| 35 |
+
data = self.preprocess(data)
|
| 36 |
+
|
| 37 |
+
# Get fraud probabilities from both models
|
| 38 |
+
rf_prob = self.rf_model.predict_proba(data)[:, 1]
|
| 39 |
+
xgb_prob = self.xgb_model.predict_proba(data)[:, 1]
|
| 40 |
+
|
| 41 |
+
# Ensemble: Average the probabilities
|
| 42 |
+
final_prob = (rf_prob + xgb_prob) / 2
|
| 43 |
+
|
| 44 |
+
# 🔥 FIX: Lowered Decision Threshold from 0.5 down to 0.15
|
| 45 |
+
final_pred = (final_prob > 0.15).astype(int)
|
| 46 |
+
|
| 47 |
+
return final_pred[0], final_prob[0]
|
src/pipeline/retrain_pipeline.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
# Fix path
|
| 6 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
|
| 7 |
+
|
| 8 |
+
from src.monitoring.db import load_data_from_db
|
| 9 |
+
from src.pipeline.train_pipeline import train_pipeline
|
| 10 |
+
|
| 11 |
+
def retrain():
|
| 12 |
+
print("🔁 Retraining started...")
|
| 13 |
+
df = load_data_from_db()
|
| 14 |
+
|
| 15 |
+
# 🔥 FIX: Prevent the "Echo Chamber"
|
| 16 |
+
# We only keep rows where a human has verified the result and filled in Actual_Class
|
| 17 |
+
verified_data = df.dropna(subset=['Actual_Class'])
|
| 18 |
+
|
| 19 |
+
if len(verified_data) < 50:
|
| 20 |
+
print("❌ Not enough human-verified data to retrain yet.")
|
| 21 |
+
raise ValueError("Need at least 50 verified records to retrain safely.")
|
| 22 |
+
|
| 23 |
+
# Drop the machine's old predictions, we only want the human's truth
|
| 24 |
+
verified_data = verified_data.drop(["id", "prediction", "probability"], axis=1)
|
| 25 |
+
verified_data = verified_data.rename(columns={"Actual_Class": "Class"})
|
| 26 |
+
|
| 27 |
+
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 28 |
+
DATA_PATH = os.path.join(BASE_DIR, "data")
|
| 29 |
+
os.makedirs(DATA_PATH, exist_ok=True)
|
| 30 |
+
file_path = os.path.join(DATA_PATH, "retrain_data.csv")
|
| 31 |
+
|
| 32 |
+
verified_data.to_csv(file_path, index=False)
|
| 33 |
+
|
| 34 |
+
# Pass the verified data to the training loop
|
| 35 |
+
train_pipeline(file_path)
|
| 36 |
+
print("✅ Retraining completed")
|
src/pipeline/train_pipeline.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training Pipeline with Hyperparameter Tuning and DagsHub MLflow Integration
|
| 3 |
+
|
| 4 |
+
Steps:
|
| 5 |
+
1. Load data
|
| 6 |
+
2. Split data
|
| 7 |
+
3. Scale features
|
| 8 |
+
4. Handle Imbalance (SMOTE)
|
| 9 |
+
5. Train Random Forest (baseline)
|
| 10 |
+
6. Tune XGBoost (GridSearchCV) with scale_pos_weight
|
| 11 |
+
7. Save models + scaler to MLflow and local artifacts
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import joblib
|
| 16 |
+
import pandas as pd
|
| 17 |
+
import dagshub
|
| 18 |
+
import mlflow
|
| 19 |
+
import mlflow.sklearn
|
| 20 |
+
from sklearn.model_selection import train_test_split, GridSearchCV
|
| 21 |
+
from sklearn.preprocessing import StandardScaler
|
| 22 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 23 |
+
from xgboost import XGBClassifier
|
| 24 |
+
from imblearn.over_sampling import SMOTE
|
| 25 |
+
from dotenv import load_dotenv
|
| 26 |
+
|
| 27 |
+
load_dotenv()
|
| 28 |
+
|
| 29 |
+
def train_pipeline(data_path: str):
|
| 30 |
+
|
| 31 |
+
# --- NEW: Initialize DagsHub MLflow Tracking ---
|
| 32 |
+
dagshub_uri = os.getenv("MLFLOW_TRACKING_URI")
|
| 33 |
+
if dagshub_uri:
|
| 34 |
+
import urllib.parse
|
| 35 |
+
repo_owner, repo_name = urllib.parse.urlparse(dagshub_uri).path.strip('/').split('/')[:2]
|
| 36 |
+
repo_name = repo_name.replace(".mlflow", "")
|
| 37 |
+
dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)
|
| 38 |
+
|
| 39 |
+
mlflow.set_experiment("Adaptive_Fraud_Detection")
|
| 40 |
+
|
| 41 |
+
dagshub.init(repo_owner='MohitParmar78', repo_name='adaptive-fraud-detection-mlops', mlflow=True)
|
| 42 |
+
|
| 43 |
+
with mlflow.start_run(run_name="Retrain_Ensemble_Weighted"):
|
| 44 |
+
print("🔹 Step 1: Loading dataset...")
|
| 45 |
+
df = pd.read_csv(data_path)
|
| 46 |
+
X = df.drop("Class", axis=1)
|
| 47 |
+
y = df["Class"]
|
| 48 |
+
|
| 49 |
+
print("🔹 Step 2: Splitting data...")
|
| 50 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
|
| 51 |
+
|
| 52 |
+
print("🔹 Step 3: Scaling 'Amount' feature...")
|
| 53 |
+
scaler = StandardScaler()
|
| 54 |
+
X_train["Amount"] = scaler.fit_transform(X_train[["Amount"]])
|
| 55 |
+
X_test["Amount"] = scaler.transform(X_test[["Amount"]])
|
| 56 |
+
|
| 57 |
+
print("🔹 Step 4: Applying SMOTE (handle imbalance)...")
|
| 58 |
+
smote = SMOTE(random_state=42)
|
| 59 |
+
if len(set(y_train)) > 1:
|
| 60 |
+
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
|
| 61 |
+
else:
|
| 62 |
+
print("⚠️ Only one class present → skipping SMOTE")
|
| 63 |
+
X_train_res, y_train_res = X_train, y_train
|
| 64 |
+
|
| 65 |
+
print("🔹 Step 5: Training Random Forest (baseline)...")
|
| 66 |
+
rf_model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
|
| 67 |
+
rf_model.fit(X_train_res, y_train_res)
|
| 68 |
+
|
| 69 |
+
# 🔥 FIX: Extreme Class Imbalance Weighting
|
| 70 |
+
# 284315 Normal / 492 Fraud ≈ 578. Forces XGBoost to prioritize catching fraud.
|
| 71 |
+
print("🔹 Step 6: Hyperparameter tuning XGBoost...")
|
| 72 |
+
|
| 73 |
+
xgb = XGBClassifier(eval_metric="logloss", scale_pos_weight=578, n_jobs=-1)
|
| 74 |
+
|
| 75 |
+
# 🔥 PRODUCTION GRID: Now it will test 18 different combinations to find the best F1 score
|
| 76 |
+
param_grid = {
|
| 77 |
+
"n_estimators": [100, 200, 300],
|
| 78 |
+
"max_depth": [3, 5],
|
| 79 |
+
"learning_rate": [0.01, 0.05, 0.1]
|
| 80 |
+
}
|
| 81 |
+
grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring="f1", cv=3, verbose=1, n_jobs=-1)
|
| 82 |
+
grid.fit(X_train_res, y_train_res)
|
| 83 |
+
best_xgb = grid.best_estimator_
|
| 84 |
+
|
| 85 |
+
# Log to DagsHub
|
| 86 |
+
mlflow.log_params(grid.best_params_)
|
| 87 |
+
mlflow.log_metric("best_cv_f1_score", grid.best_score_)
|
| 88 |
+
|
| 89 |
+
print("🔹 Step 7: Saving models and scaler...")
|
| 90 |
+
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 91 |
+
ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
|
| 92 |
+
os.makedirs(ARTIFACTS_PATH, exist_ok=True)
|
| 93 |
+
|
| 94 |
+
joblib.dump(rf_model, os.path.join(ARTIFACTS_PATH, "rf_model.pkl"))
|
| 95 |
+
joblib.dump(best_xgb, os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
|
| 96 |
+
joblib.dump(scaler, os.path.join(ARTIFACTS_PATH, "scaler.pkl"))
|
| 97 |
+
|
| 98 |
+
# Save to remote registry
|
| 99 |
+
mlflow.sklearn.log_model(best_xgb, "xgboost_fraud_model")
|
| 100 |
+
mlflow.sklearn.log_model(rf_model, "rf_fraud_model")
|
| 101 |
+
|
| 102 |
+
if __name__ == "__main__":
|
| 103 |
+
train_pipeline("data/creditcard.csv")
|
src/utils/__init.py
ADDED
|
File without changes
|
src/utils/common.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
# Get project root
|
| 5 |
+
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 6 |
+
|
| 7 |
+
# Add to Python path
|
| 8 |
+
if PROJECT_ROOT not in sys.path:
|
| 9 |
+
sys.path.append(PROJECT_ROOT)
|