MohitRajput45 commited on
Commit
97ac315
·
verified ·
1 Parent(s): 2e52ea1

Upload 21 files

Browse files
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (142 Bytes). View file
 
src/explanability/__init__.py ADDED
File without changes
src/explanability/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (156 Bytes). View file
 
src/explanability/__pycache__/shap_explainer.cpython-310.pyc ADDED
Binary file (938 Bytes). View file
 
src/explanability/shap_explainer.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shap
2
+ import joblib
3
+ import os
4
+ import pandas as pd
5
+
6
+ class ShapExplainer:
7
+
8
+ def __init__(self):
9
+ BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
10
+ ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
11
+
12
+ self.model = joblib.load(os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
13
+
14
+ # 🔥 FIX: TreeExplainer is specifically built for tree-based models like XGBoost
15
+ self.explainer = shap.TreeExplainer(self.model)
16
+
17
+ def explain(self, data: pd.DataFrame):
18
+ # Generate SHAP values for the given data
19
+ shap_values = self.explainer(data)
20
+ return shap_values
src/monitoring/__init___.py ADDED
File without changes
src/monitoring/__pycache__/db.cpython-310.pyc ADDED
Binary file (2.35 kB). View file
 
src/monitoring/__pycache__/drift.cpython-310.pyc ADDED
Binary file (1.15 kB). View file
 
src/monitoring/db.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from sqlalchemy import create_engine, Column, Integer, Float
4
+ from sqlalchemy.orm import declarative_base, sessionmaker
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables (.env)
8
+ load_dotenv()
9
+
10
+ # --- NEW: Cloud Database Connection via SQLAlchemy ---
11
+ DATABASE_URL = os.getenv("DATABASE_URL")
12
+ if not DATABASE_URL:
13
+ BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
14
+ DB_PATH = os.path.join(BASE_DIR, "data", "fraud.db")
15
+ DATABASE_URL = f"sqlite:///{DB_PATH}"
16
+
17
+ # Initialize Engine and Session
18
+ engine = create_engine(DATABASE_URL)
19
+ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
20
+ Base = declarative_base()
21
+
22
+ # 🔹 Define the schema for the transactions table
23
+ class TransactionLog(Base):
24
+ __tablename__ = "transactions"
25
+ id = Column(Integer, primary_key=True, index=True)
26
+ Time = Column(Float)
27
+ V1 = Column(Float); V2 = Column(Float); V3 = Column(Float); V4 = Column(Float)
28
+ V5 = Column(Float); V6 = Column(Float); V7 = Column(Float); V8 = Column(Float)
29
+ V9 = Column(Float); V10 = Column(Float); V11 = Column(Float); V12 = Column(Float)
30
+ V13 = Column(Float); V14 = Column(Float); V15 = Column(Float); V16 = Column(Float)
31
+ V17 = Column(Float); V18 = Column(Float); V19 = Column(Float); V20 = Column(Float)
32
+ V21 = Column(Float); V22 = Column(Float); V23 = Column(Float); V24 = Column(Float)
33
+ V25 = Column(Float); V26 = Column(Float); V27 = Column(Float); V28 = Column(Float)
34
+ Amount = Column(Float)
35
+ prediction = Column(Integer)
36
+ probability = Column(Float)
37
+
38
+ # 🔥 HUMAN-IN-THE-LOOP UPGRADE: The true label provided by a human analyst later
39
+ Actual_Class = Column(Integer, nullable=True)
40
+
41
+ def init_db():
42
+ """Create tables if they do not exist."""
43
+ Base.metadata.create_all(bind=engine)
44
+
45
+ def save_to_db(data: dict, pred: int, prob: float):
46
+ db = SessionLocal()
47
+ try:
48
+ new_tx = TransactionLog(**data, prediction=pred, probability=prob)
49
+ db.add(new_tx)
50
+ db.commit()
51
+ except Exception as e:
52
+ # 🔥 THE FIX: If anything goes wrong, roll back the transaction so the DB doesn't freeze
53
+ db.rollback()
54
+ print(f"⚠️ Database insertion failed: {e}")
55
+ finally:
56
+ db.close()
57
+
58
+ def load_data_from_db():
59
+ """Load all database records into a Pandas DataFrame for Retraining/Drift detection."""
60
+ return pd.read_sql("SELECT * FROM transactions", engine)
src/monitoring/drift.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from evidently.report import Report
3
+ from evidently.metric_preset import DataDriftPreset
4
+ import os
5
+ import sys
6
+
7
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
8
+ from src.monitoring.db import load_data_from_db
9
+
10
+ def detect_drift(reference_path):
11
+ # Load reference (original training data)
12
+ reference_data = pd.read_csv(reference_path).drop("Class", axis=1)
13
+
14
+ # Load current data from live DB
15
+ current_data = load_data_from_db()
16
+
17
+ # Drop tracking columns so we only compare the raw features (V1-V28, Time, Amount)
18
+ current_data = current_data.drop(["id", "prediction", "probability", "Actual_Class"], axis=1)
19
+
20
+ if len(current_data) < 50:
21
+ print("⚠️ Not enough live data for drift detection")
22
+ return None
23
+
24
+ # Run statistical tests via Evidently AI
25
+ report = Report(metrics=[DataDriftPreset()])
26
+ report.run(reference_data=reference_data, current_data=current_data)
27
+
28
+ BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
29
+ REPORT_PATH = os.path.join(BASE_DIR, "reports")
30
+ os.makedirs(REPORT_PATH, exist_ok=True)
31
+
32
+ path = os.path.join(REPORT_PATH, "drift_report.html")
33
+ report.save_html(path)
34
+
35
+ return path
src/pipeline/__init__.py ADDED
File without changes
src/pipeline/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (151 Bytes). View file
 
src/pipeline/__pycache__/predict_pipeline.cpython-310.pyc ADDED
Binary file (1.61 kB). View file
 
src/pipeline/__pycache__/retrain_pipeline.cpython-310.pyc ADDED
Binary file (1.21 kB). View file
 
src/pipeline/__pycache__/train_pipeline.cpython-310.pyc ADDED
Binary file (3.47 kB). View file
 
src/pipeline/predict_pipeline.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prediction Pipeline
3
+
4
+ Loads trained models and scaler, applies preprocessing,
5
+ and returns final prediction using an ensemble threshold.
6
+ """
7
+
8
+ import joblib
9
+ import os
10
+ import pandas as pd
11
+
12
+ class PredictPipeline:
13
+
14
+ def __init__(self):
15
+ BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
16
+ ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
17
+
18
+ self.rf_model = joblib.load(os.path.join(ARTIFACTS_PATH, "rf_model.pkl"))
19
+ self.xgb_model = joblib.load(os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
20
+ self.scaler = joblib.load(os.path.join(ARTIFACTS_PATH, "scaler.pkl"))
21
+
22
+ def preprocess(self, data: pd.DataFrame):
23
+ """Apply same preprocessing as training"""
24
+ data = data.copy()
25
+
26
+ # 🔥 THE FIX: Add .flatten() to strip away the array brackets
27
+ # This turns [[value]] into a raw, safe float for SHAP
28
+ data["Amount"] = self.scaler.transform(data[["Amount"]]).flatten()
29
+
30
+ return data
31
+
32
+ def predict(self, data: pd.DataFrame):
33
+ """Make prediction using ensemble logic"""
34
+ # Preprocess the data first
35
+ data = self.preprocess(data)
36
+
37
+ # Get fraud probabilities from both models
38
+ rf_prob = self.rf_model.predict_proba(data)[:, 1]
39
+ xgb_prob = self.xgb_model.predict_proba(data)[:, 1]
40
+
41
+ # Ensemble: Average the probabilities
42
+ final_prob = (rf_prob + xgb_prob) / 2
43
+
44
+ # 🔥 FIX: Lowered Decision Threshold from 0.5 down to 0.15
45
+ final_pred = (final_prob > 0.15).astype(int)
46
+
47
+ return final_pred[0], final_prob[0]
src/pipeline/retrain_pipeline.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import pandas as pd
4
+
5
+ # Fix path
6
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
7
+
8
+ from src.monitoring.db import load_data_from_db
9
+ from src.pipeline.train_pipeline import train_pipeline
10
+
11
+ def retrain():
12
+ print("🔁 Retraining started...")
13
+ df = load_data_from_db()
14
+
15
+ # 🔥 FIX: Prevent the "Echo Chamber"
16
+ # We only keep rows where a human has verified the result and filled in Actual_Class
17
+ verified_data = df.dropna(subset=['Actual_Class'])
18
+
19
+ if len(verified_data) < 50:
20
+ print("❌ Not enough human-verified data to retrain yet.")
21
+ raise ValueError("Need at least 50 verified records to retrain safely.")
22
+
23
+ # Drop the machine's old predictions, we only want the human's truth
24
+ verified_data = verified_data.drop(["id", "prediction", "probability"], axis=1)
25
+ verified_data = verified_data.rename(columns={"Actual_Class": "Class"})
26
+
27
+ BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
28
+ DATA_PATH = os.path.join(BASE_DIR, "data")
29
+ os.makedirs(DATA_PATH, exist_ok=True)
30
+ file_path = os.path.join(DATA_PATH, "retrain_data.csv")
31
+
32
+ verified_data.to_csv(file_path, index=False)
33
+
34
+ # Pass the verified data to the training loop
35
+ train_pipeline(file_path)
36
+ print("✅ Retraining completed")
src/pipeline/train_pipeline.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training Pipeline with Hyperparameter Tuning and DagsHub MLflow Integration
3
+
4
+ Steps:
5
+ 1. Load data
6
+ 2. Split data
7
+ 3. Scale features
8
+ 4. Handle Imbalance (SMOTE)
9
+ 5. Train Random Forest (baseline)
10
+ 6. Tune XGBoost (GridSearchCV) with scale_pos_weight
11
+ 7. Save models + scaler to MLflow and local artifacts
12
+ """
13
+
14
+ import os
15
+ import joblib
16
+ import pandas as pd
17
+ import dagshub
18
+ import mlflow
19
+ import mlflow.sklearn
20
+ from sklearn.model_selection import train_test_split, GridSearchCV
21
+ from sklearn.preprocessing import StandardScaler
22
+ from sklearn.ensemble import RandomForestClassifier
23
+ from xgboost import XGBClassifier
24
+ from imblearn.over_sampling import SMOTE
25
+ from dotenv import load_dotenv
26
+
27
+ load_dotenv()
28
+
29
+ def train_pipeline(data_path: str):
30
+
31
+ # --- NEW: Initialize DagsHub MLflow Tracking ---
32
+ dagshub_uri = os.getenv("MLFLOW_TRACKING_URI")
33
+ if dagshub_uri:
34
+ import urllib.parse
35
+ repo_owner, repo_name = urllib.parse.urlparse(dagshub_uri).path.strip('/').split('/')[:2]
36
+ repo_name = repo_name.replace(".mlflow", "")
37
+ dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)
38
+
39
+ mlflow.set_experiment("Adaptive_Fraud_Detection")
40
+
41
+ dagshub.init(repo_owner='MohitParmar78', repo_name='adaptive-fraud-detection-mlops', mlflow=True)
42
+
43
+ with mlflow.start_run(run_name="Retrain_Ensemble_Weighted"):
44
+ print("🔹 Step 1: Loading dataset...")
45
+ df = pd.read_csv(data_path)
46
+ X = df.drop("Class", axis=1)
47
+ y = df["Class"]
48
+
49
+ print("🔹 Step 2: Splitting data...")
50
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
51
+
52
+ print("🔹 Step 3: Scaling 'Amount' feature...")
53
+ scaler = StandardScaler()
54
+ X_train["Amount"] = scaler.fit_transform(X_train[["Amount"]])
55
+ X_test["Amount"] = scaler.transform(X_test[["Amount"]])
56
+
57
+ print("🔹 Step 4: Applying SMOTE (handle imbalance)...")
58
+ smote = SMOTE(random_state=42)
59
+ if len(set(y_train)) > 1:
60
+ X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
61
+ else:
62
+ print("⚠️ Only one class present → skipping SMOTE")
63
+ X_train_res, y_train_res = X_train, y_train
64
+
65
+ print("🔹 Step 5: Training Random Forest (baseline)...")
66
+ rf_model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
67
+ rf_model.fit(X_train_res, y_train_res)
68
+
69
+ # 🔥 FIX: Extreme Class Imbalance Weighting
70
+ # 284315 Normal / 492 Fraud ≈ 578. Forces XGBoost to prioritize catching fraud.
71
+ print("🔹 Step 6: Hyperparameter tuning XGBoost...")
72
+
73
+ xgb = XGBClassifier(eval_metric="logloss", scale_pos_weight=578, n_jobs=-1)
74
+
75
+ # 🔥 PRODUCTION GRID: Now it will test 18 different combinations to find the best F1 score
76
+ param_grid = {
77
+ "n_estimators": [100, 200, 300],
78
+ "max_depth": [3, 5],
79
+ "learning_rate": [0.01, 0.05, 0.1]
80
+ }
81
+ grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring="f1", cv=3, verbose=1, n_jobs=-1)
82
+ grid.fit(X_train_res, y_train_res)
83
+ best_xgb = grid.best_estimator_
84
+
85
+ # Log to DagsHub
86
+ mlflow.log_params(grid.best_params_)
87
+ mlflow.log_metric("best_cv_f1_score", grid.best_score_)
88
+
89
+ print("🔹 Step 7: Saving models and scaler...")
90
+ BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
91
+ ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
92
+ os.makedirs(ARTIFACTS_PATH, exist_ok=True)
93
+
94
+ joblib.dump(rf_model, os.path.join(ARTIFACTS_PATH, "rf_model.pkl"))
95
+ joblib.dump(best_xgb, os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
96
+ joblib.dump(scaler, os.path.join(ARTIFACTS_PATH, "scaler.pkl"))
97
+
98
+ # Save to remote registry
99
+ mlflow.sklearn.log_model(best_xgb, "xgboost_fraud_model")
100
+ mlflow.sklearn.log_model(rf_model, "rf_fraud_model")
101
+
102
+ if __name__ == "__main__":
103
+ train_pipeline("data/creditcard.csv")
src/utils/__init.py ADDED
File without changes
src/utils/common.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ # Get project root
5
+ PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
6
+
7
+ # Add to Python path
8
+ if PROJECT_ROOT not in sys.path:
9
+ sys.path.append(PROJECT_ROOT)