Spaces:

MohitRajput45
/

Fraud-Guard-Intelligence

Sleeping

App Files Files Community

MohitRajput45 committed on Apr 29

Commit

97ac315

verified ·

1 Parent(s): 2e52ea1

Upload 21 files

Browse files

Files changed (21) hide show

src/__init__.py +0 -0
src/__pycache__/__init__.cpython-310.pyc +0 -0
src/explanability/__init__.py +0 -0
src/explanability/__pycache__/__init__.cpython-310.pyc +0 -0
src/explanability/__pycache__/shap_explainer.cpython-310.pyc +0 -0
src/explanability/shap_explainer.py +20 -0
src/monitoring/__init___.py +0 -0
src/monitoring/__pycache__/db.cpython-310.pyc +0 -0
src/monitoring/__pycache__/drift.cpython-310.pyc +0 -0
src/monitoring/db.py +60 -0
src/monitoring/drift.py +35 -0
src/pipeline/__init__.py +0 -0
src/pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
src/pipeline/__pycache__/predict_pipeline.cpython-310.pyc +0 -0
src/pipeline/__pycache__/retrain_pipeline.cpython-310.pyc +0 -0
src/pipeline/__pycache__/train_pipeline.cpython-310.pyc +0 -0
src/pipeline/predict_pipeline.py +47 -0
src/pipeline/retrain_pipeline.py +36 -0
src/pipeline/train_pipeline.py +103 -0
src/utils/__init.py +0 -0
src/utils/common.py +9 -0

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (142 Bytes). View file

src/explanability/__init__.py ADDED Viewed

File without changes

src/explanability/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (156 Bytes). View file

src/explanability/__pycache__/shap_explainer.cpython-310.pyc ADDED Viewed

Binary file (938 Bytes). View file

src/explanability/shap_explainer.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import shap
+import joblib
+import os
+import pandas as pd
+class ShapExplainer:
+    def __init__(self):
+        BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+        ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
+        self.model = joblib.load(os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
+        # 🔥 FIX: TreeExplainer is specifically built for tree-based models like XGBoost
+        self.explainer = shap.TreeExplainer(self.model)
+    def explain(self, data: pd.DataFrame):
+        # Generate SHAP values for the given data
+        shap_values = self.explainer(data)
+        return shap_values

src/monitoring/__init___.py ADDED Viewed

File without changes

src/monitoring/__pycache__/db.cpython-310.pyc ADDED Viewed

Binary file (2.35 kB). View file

src/monitoring/__pycache__/drift.cpython-310.pyc ADDED Viewed

Binary file (1.15 kB). View file

src/monitoring/db.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import os
+import pandas as pd
+from sqlalchemy import create_engine, Column, Integer, Float
+from sqlalchemy.orm import declarative_base, sessionmaker
+from dotenv import load_dotenv
+# Load environment variables (.env)
+load_dotenv()
+# --- NEW: Cloud Database Connection via SQLAlchemy ---
+DATABASE_URL = os.getenv("DATABASE_URL")
+if not DATABASE_URL:
+    BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+    DB_PATH = os.path.join(BASE_DIR, "data", "fraud.db")
+    DATABASE_URL = f"sqlite:///{DB_PATH}"
+# Initialize Engine and Session
+engine = create_engine(DATABASE_URL)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+Base = declarative_base()
+# 🔹 Define the schema for the transactions table
+class TransactionLog(Base):
+    __tablename__ = "transactions"
+    id = Column(Integer, primary_key=True, index=True)
+    Time = Column(Float)
+    V1 = Column(Float); V2 = Column(Float); V3 = Column(Float); V4 = Column(Float)
+    V5 = Column(Float); V6 = Column(Float); V7 = Column(Float); V8 = Column(Float)
+    V9 = Column(Float); V10 = Column(Float); V11 = Column(Float); V12 = Column(Float)
+    V13 = Column(Float); V14 = Column(Float); V15 = Column(Float); V16 = Column(Float)
+    V17 = Column(Float); V18 = Column(Float); V19 = Column(Float); V20 = Column(Float)
+    V21 = Column(Float); V22 = Column(Float); V23 = Column(Float); V24 = Column(Float)
+    V25 = Column(Float); V26 = Column(Float); V27 = Column(Float); V28 = Column(Float)
+    Amount = Column(Float)
+    prediction = Column(Integer)
+    probability = Column(Float)
+    # 🔥 HUMAN-IN-THE-LOOP UPGRADE: The true label provided by a human analyst later
+    Actual_Class = Column(Integer, nullable=True)
+def init_db():
+    """Create tables if they do not exist."""
+    Base.metadata.create_all(bind=engine)
+def save_to_db(data: dict, pred: int, prob: float):
+    db = SessionLocal()
+    try:
+        new_tx = TransactionLog(**data, prediction=pred, probability=prob)
+        db.add(new_tx)
+        db.commit()
+    except Exception as e:
+        # 🔥 THE FIX: If anything goes wrong, roll back the transaction so the DB doesn't freeze
+        db.rollback()
+        print(f"⚠️ Database insertion failed: {e}")
+    finally:
+        db.close()
+def load_data_from_db():
+    """Load all database records into a Pandas DataFrame for Retraining/Drift detection."""
+    return pd.read_sql("SELECT * FROM transactions", engine)

src/monitoring/drift.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import pandas as pd
+from evidently.report import Report
+from evidently.metric_preset import DataDriftPreset
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from src.monitoring.db import load_data_from_db
+def detect_drift(reference_path):
+    # Load reference (original training data)
+    reference_data = pd.read_csv(reference_path).drop("Class", axis=1)
+    # Load current data from live DB
+    current_data = load_data_from_db()
+    # Drop tracking columns so we only compare the raw features (V1-V28, Time, Amount)
+    current_data = current_data.drop(["id", "prediction", "probability", "Actual_Class"], axis=1)
+    if len(current_data) < 50:
+        print("⚠️ Not enough live data for drift detection")
+        return None
+    # Run statistical tests via Evidently AI
+    report = Report(metrics=[DataDriftPreset()])
+    report.run(reference_data=reference_data, current_data=current_data)
+    BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+    REPORT_PATH = os.path.join(BASE_DIR, "reports")
+    os.makedirs(REPORT_PATH, exist_ok=True)
+    path = os.path.join(REPORT_PATH, "drift_report.html")
+    report.save_html(path)
+    return path

src/pipeline/__init__.py ADDED Viewed

File without changes

src/pipeline/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (151 Bytes). View file

src/pipeline/__pycache__/predict_pipeline.cpython-310.pyc ADDED Viewed

Binary file (1.61 kB). View file

src/pipeline/__pycache__/retrain_pipeline.cpython-310.pyc ADDED Viewed

Binary file (1.21 kB). View file

src/pipeline/__pycache__/train_pipeline.cpython-310.pyc ADDED Viewed

Binary file (3.47 kB). View file

src/pipeline/predict_pipeline.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""
+Prediction Pipeline
+Loads trained models and scaler, applies preprocessing,
+and returns final prediction using an ensemble threshold.
+"""
+import joblib
+import os
+import pandas as pd
+class PredictPipeline:
+    def __init__(self):
+        BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+        ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
+        self.rf_model = joblib.load(os.path.join(ARTIFACTS_PATH, "rf_model.pkl"))
+        self.xgb_model = joblib.load(os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
+        self.scaler = joblib.load(os.path.join(ARTIFACTS_PATH, "scaler.pkl"))
+    def preprocess(self, data: pd.DataFrame):
+        """Apply same preprocessing as training"""
+        data = data.copy()
+        # 🔥 THE FIX: Add .flatten() to strip away the array brackets
+        # This turns [[value]] into a raw, safe float for SHAP
+        data["Amount"] = self.scaler.transform(data[["Amount"]]).flatten()
+        return data
+    def predict(self, data: pd.DataFrame):
+        """Make prediction using ensemble logic"""
+        # Preprocess the data first
+        data = self.preprocess(data)
+        # Get fraud probabilities from both models
+        rf_prob = self.rf_model.predict_proba(data)[:, 1]
+        xgb_prob = self.xgb_model.predict_proba(data)[:, 1]
+        # Ensemble: Average the probabilities
+        final_prob = (rf_prob + xgb_prob) / 2
+        # 🔥 FIX: Lowered Decision Threshold from 0.5 down to 0.15
+        final_pred = (final_prob > 0.15).astype(int)
+        return final_pred[0], final_prob[0]

src/pipeline/retrain_pipeline.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import sys
+import os
+import pandas as pd
+# Fix path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+from src.monitoring.db import load_data_from_db
+from src.pipeline.train_pipeline import train_pipeline
+def retrain():
+    print("🔁 Retraining started...")
+    df = load_data_from_db()
+    # 🔥 FIX: Prevent the "Echo Chamber"
+    # We only keep rows where a human has verified the result and filled in Actual_Class
+    verified_data = df.dropna(subset=['Actual_Class'])
+    if len(verified_data) < 50:
+        print("❌ Not enough human-verified data to retrain yet.")
+        raise ValueError("Need at least 50 verified records to retrain safely.")
+    # Drop the machine's old predictions, we only want the human's truth
+    verified_data = verified_data.drop(["id", "prediction", "probability"], axis=1)
+    verified_data = verified_data.rename(columns={"Actual_Class": "Class"})
+    BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+    DATA_PATH = os.path.join(BASE_DIR, "data")
+    os.makedirs(DATA_PATH, exist_ok=True)
+    file_path = os.path.join(DATA_PATH, "retrain_data.csv")
+    verified_data.to_csv(file_path, index=False)
+    # Pass the verified data to the training loop
+    train_pipeline(file_path)
+    print("✅ Retraining completed")

src/pipeline/train_pipeline.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""
+Training Pipeline with Hyperparameter Tuning and DagsHub MLflow Integration
+Steps:
+1. Load data
+2. Split data
+3. Scale the 'Amount' feature
+4. Handle Imbalance (SMOTE)
+5. Train Random Forest (baseline)
+6. Tune XGBoost (GridSearchCV) with scale_pos_weight
+7. Save models + scaler to local artifacts and log both models to MLflow
+"""
+import os
+import joblib
+import pandas as pd
+import dagshub
+import mlflow
+import mlflow.sklearn
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from xgboost import XGBClassifier
+from imblearn.over_sampling import SMOTE
+from dotenv import load_dotenv
+load_dotenv()
+def train_pipeline(data_path: str):
+    # --- NEW: Initialize DagsHub MLflow Tracking ---
+    dagshub_uri = os.getenv("MLFLOW_TRACKING_URI")
+    if dagshub_uri:
+        import urllib.parse
+        repo_owner, repo_name = urllib.parse.urlparse(dagshub_uri).path.strip('/').split('/')[:2]
+        repo_name = repo_name.replace(".mlflow", "")
+        dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)
+    mlflow.set_experiment("Adaptive_Fraud_Detection")
+    dagshub.init(repo_owner='MohitParmar78', repo_name='adaptive-fraud-detection-mlops', mlflow=True)
+    with mlflow.start_run(run_name="Retrain_Ensemble_Weighted"):
+        print("🔹 Step 1: Loading dataset...")
+        df = pd.read_csv(data_path)
+        X = df.drop("Class", axis=1)
+        y = df["Class"]
+        print("🔹 Step 2: Splitting data...")
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
+        print("🔹 Step 3: Scaling 'Amount' feature...")
+        scaler = StandardScaler()
+        X_train["Amount"] = scaler.fit_transform(X_train[["Amount"]])
+        X_test["Amount"] = scaler.transform(X_test[["Amount"]])
+        print("🔹 Step 4: Applying SMOTE (handle imbalance)...")
+        smote = SMOTE(random_state=42)
+        if len(set(y_train)) > 1:
+            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
+        else:
+            print("⚠️ Only one class present → skipping SMOTE")
+            X_train_res, y_train_res = X_train, y_train
+        print("🔹 Step 5: Training Random Forest (baseline)...")
+        rf_model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
+        rf_model.fit(X_train_res, y_train_res)
+        # 🔥 FIX: Extreme Class Imbalance Weighting
+        # 284315 Normal / 492 Fraud ≈ 578. Forces XGBoost to prioritize catching fraud.
+        print("🔹 Step 6: Hyperparameter tuning XGBoost...")
+        xgb = XGBClassifier(eval_metric="logloss", scale_pos_weight=578, n_jobs=-1)
+        # 🔥 PRODUCTION GRID: Now it will test 18 different combinations to find the best F1 score
+        param_grid = {
+            "n_estimators": [100, 200, 300],
+            "max_depth": [3, 5],
+            "learning_rate": [0.01, 0.05, 0.1]
+        }
+        grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring="f1", cv=3, verbose=1, n_jobs=-1)
+        grid.fit(X_train_res, y_train_res)
+        best_xgb = grid.best_estimator_
+        # Log to DagsHub
+        mlflow.log_params(grid.best_params_)
+        mlflow.log_metric("best_cv_f1_score", grid.best_score_)
+        print("🔹 Step 7: Saving models and scaler...")
+        BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+        ARTIFACTS_PATH = os.path.join(BASE_DIR, "artifacts")
+        os.makedirs(ARTIFACTS_PATH, exist_ok=True)
+        joblib.dump(rf_model, os.path.join(ARTIFACTS_PATH, "rf_model.pkl"))
+        joblib.dump(best_xgb, os.path.join(ARTIFACTS_PATH, "xgb_model.pkl"))
+        joblib.dump(scaler, os.path.join(ARTIFACTS_PATH, "scaler.pkl"))
+        # Save to remote registry
+        mlflow.sklearn.log_model(best_xgb, "xgboost_fraud_model")
+        mlflow.sklearn.log_model(rf_model, "rf_fraud_model")
+if __name__ == "__main__":
+    train_pipeline("data/creditcard.csv")

src/utils/__init.py ADDED Viewed

File without changes

src/utils/common.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import os
+import sys
+# Get project root
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+# Add to Python path
+if PROJECT_ROOT not in sys.path:
+    sys.path.append(PROJECT_ROOT)