Spaces:

LeonardoMdSA
/

ML-Inference-Service-with-Drift-Detection

Running

App Files Files Community

LeonardoMdSA committed on Jan 12

Commit

b4fadea

1 Parent(s): a0f3d24

before resolving dependencies

Browse files

Files changed (19) hide show

app.db +0 -0
app/api/routes.py +39 -0
app/api/schemas.py +19 -0
app/core/config.py +5 -0
app/core/logging.py +51 -0
app/inference/predictor.py +18 -0
app/main.py +16 -0
app/monitoring/data_loader.py +30 -0
app/monitoring/drift.py +28 -0
data/processed/credit_default_clean.csv +0 -0
data/processed/current_data.csv +0 -0
data/raw/credit_default.csv +0 -0
models/v1/features.json +10 -0
models/v1/model.pkl +0 -0
models/v1/reference_data.csv +0 -0
requirements-dev.txt +5 -0
requirements.txt +5 -0
scripts/prepare_data.py +116 -0
scripts/train.py +79 -0

app.db ADDED Viewed

Binary file (12.3 kB). View file

app/api/routes.py CHANGED Viewed

	@@ -1 +1,40 @@
1	# /predict, /health, /dashboard

 # /predict, /health, /dashboard
+from fastapi import APIRouter
+from app.api.schemas import PredictionRequest, PredictionResponse
+from app.inference.predictor import Predictor
+from app.core.logging import log_prediction
+from app.monitoring.data_loader import load_production_data
+from app.monitoring.drift import run_drift_check
+# Router that the endpoint decorators in this module register their handlers on.
+router = APIRouter()
+# One module-level Predictor, constructed when this module is imported and
+# reused by every call to the /predict handler.
+predictor = Predictor()
+@router.post("/predict", response_model=PredictionResponse)
+def predict(request: PredictionRequest):
+    payload = request.dict()
+    prediction, probability = predictor.predict(payload)
+    log_prediction(payload, prediction, probability)
+    return {
+        "prediction": prediction,
+        "probability": probability
+    }
+@router.get("/health")
+def health():
+    return {"status": "ok"}
+@router.get("/run-drift")
+def run_drift():
+    current_df = load_production_data()
+    report_path = run_drift_check(current_df)
+    return {
+        "status": "drift_check_completed",
+        "report_path": report_path
+    }

app/api/schemas.py CHANGED Viewed

	@@ -1 +1,20 @@
1	# Pydantic input/output schemas

 # Pydantic input/output schemas
+from pydantic import BaseModel
+from typing import Dict
+# Request body accepted by POST /predict.
+# The field names match the entries of models/v1/features.json, which the
+# predictor uses to select and order the model inputs.
+class PredictionRequest(BaseModel):
+    credit_limit: float
+    age: int
+    # Renamed from raw columns PAY_0 and PAY_2 by scripts/prepare_data.py.
+    pay_delay_sep: int
+    pay_delay_aug: int
+    # Renamed from raw columns BILL_AMT1 and BILL_AMT2.
+    bill_amt_sep: float
+    bill_amt_aug: float
+    # Renamed from raw columns PAY_AMT1 and PAY_AMT2.
+    pay_amt_sep: float
+    pay_amt_aug: float
+# Response body returned by POST /predict.
+class PredictionResponse(BaseModel):
+    # 1 when the predictor's probability is >= 0.5, otherwise 0.
+    prediction: int
+    # Value taken from column 1 of the model's predict_proba output.
+    probability: float

app/core/config.py CHANGED Viewed

	@@ -1 +1,6 @@
1	# env vars, paths, thresholds

 # env vars, paths, thresholds
+# Version label stored with every logged prediction.
+MODEL_VERSION = "v1"
+# Serialized model loaded by the Predictor (relative path).
+MODEL_PATH = "models/v1/model.pkl"
+# JSON list of feature names, in the order the model input is built.
+FEATURES_PATH = "models/v1/features.json"
+# SQLite database file that holds the predictions table (relative path).
+DB_PATH = "app.db"

app/core/logging.py CHANGED Viewed

	@@ -1 +1,52 @@
1	# SQLite + file logging

 # SQLite + file logging
+import sqlite3
+import json
+from datetime import datetime
+from app.core.config import DB_PATH, MODEL_VERSION
+def get_connection():
+    return sqlite3.connect(DB_PATH, check_same_thread=False)
+def init_db():
+    conn = get_connection()
+    cursor = conn.cursor()
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS predictions (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            timestamp TEXT,
+            model_version TEXT,
+            input_features TEXT,
+            prediction INTEGER,
+            probability REAL
+        )
+    """)
+    conn.commit()
+    conn.close()
+def log_prediction(features: dict, prediction: int, probability: float):
+    conn = get_connection()
+    cursor = conn.cursor()
+    cursor.execute(
+        """
+        INSERT INTO predictions
+        (timestamp, model_version, input_features, prediction, probability)
+        VALUES (?, ?, ?, ?, ?)
+        """,
+        (
+            datetime.utcnow().isoformat(),
+            MODEL_VERSION,
+            json.dumps(features),
+            prediction,
+            probability,
+        )
+    )
+    conn.commit()
+    conn.close()

app/inference/predictor.py CHANGED Viewed

	@@ -1 +1,19 @@
1	# model.predict wrapper

 # model.predict wrapper
+import json
+import joblib
+import numpy as np
+from app.core.config import MODEL_PATH, FEATURES_PATH
+class Predictor:
+    def __init__(self):
+        self.model = joblib.load(MODEL_PATH)
+        with open(FEATURES_PATH, "r") as f:
+            self.features = json.load(f)
+    def predict(self, payload: dict):
+        X = np.array([[payload[f] for f in self.features]])
+        proba = self.model.predict_proba(X)[0, 1]
+        pred = int(proba >= 0.5)
+        return pred, float(proba)

app/main.py CHANGED Viewed

	@@ -1 +1,17 @@
1	# FastAPI entrypoint

 # FastAPI entrypoint
+from fastapi import FastAPI
+from app.api.routes import router
+from app.core.logging import init_db
+from fastapi.staticfiles import StaticFiles
+app = FastAPI(title="ML Inference Service")
+init_db()
+app.include_router(router)
+app.mount(
+    "/reports",
+    StaticFiles(directory="reports"),
+    name="reports"
+)

app/monitoring/data_loader.py ADDED Viewed

	@@ -0,0 +1,30 @@

+#Load Production data from SQLite
+import sqlite3
+import json
+import pandas as pd
+from app.core.config import DB_PATH
+def load_production_data(limit: int = 1000) -> pd.DataFrame:
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute(
+        """
+        SELECT input_features
+        FROM predictions
+        ORDER BY id DESC
+        LIMIT ?
+        """,
+        (limit,)
+    )
+    rows = cursor.fetchall()
+    conn.close()
+    if not rows:
+        raise ValueError("No production data available for drift detection.")
+    records = [json.loads(row[0]) for row in rows]
+    return pd.DataFrame(records)

app/monitoring/drift.py CHANGED Viewed

	@@ -1 +1,29 @@
1	# Evidently logic

 # Evidently logic
+import os
+import pandas as pd
+from evidently.report import Report
+from evidently.metric_preset import DataDriftPreset
+# CSV holding the reference dataset the drift check compares against.
+REFERENCE_DATA_PATH = "models/v1/reference_data.csv"
+# Directory the drift report is written to; created on demand by run_drift_check.
+REPORT_DIR = "reports/evidently"
+# Full path of the generated HTML report; each run writes to this same path.
+REPORT_PATH = os.path.join(REPORT_DIR, "drift_report.html")
+def run_drift_check(current_df: pd.DataFrame):
+    reference_df = pd.read_csv(REFERENCE_DATA_PATH)
+    os.makedirs(REPORT_DIR, exist_ok=True)
+    report = Report(metrics=[
+        DataDriftPreset()
+    ])
+    report.run(
+        reference_data=reference_df.drop(columns=["target"]),
+        current_data=current_df
+    )
+    report.save_html(REPORT_PATH)
+    return REPORT_PATH

data/processed/credit_default_clean.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/processed/current_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/raw/credit_default.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

models/v1/features.json ADDED Viewed

	@@ -0,0 +1,10 @@

+[
+  "credit_limit",
+  "age",
+  "pay_delay_sep",
+  "pay_delay_aug",
+  "bill_amt_sep",
+  "bill_amt_aug",
+  "pay_amt_sep",
+  "pay_amt_aug"
+]

models/v1/model.pkl ADDED Viewed

Binary file (1.28 kB). View file

models/v1/reference_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements-dev.txt CHANGED Viewed

	@@ -0,0 +1,5 @@

+evidently==0.4.15
+fastapi
+uvicorn
+pandas
+scikit-learn

requirements.txt CHANGED Viewed

	@@ -0,0 +1,5 @@

+evidently==0.4.15
+fastapi
+uvicorn
+pandas
+scikit-learn

scripts/prepare_data.py ADDED Viewed

	@@ -0,0 +1,116 @@

+# Preparing data
+import os
+import pandas as pd
+from sklearn.model_selection import train_test_split
+# -----------------------------
+# Paths
+# -----------------------------
+# Input: raw CSV read by main().
+RAW_DATA_PATH = "data/raw/credit_default.csv"
+# Output directories; main() creates them when missing.
+PROCESSED_DATA_DIR = "data/processed"
+MODELS_DIR = "models/v1"
+# Outputs: the full cleaned dataset, the later 30% split, and the earlier 70% split.
+CLEAN_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "credit_default_clean.csv")
+CURRENT_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "current_data.csv")
+REFERENCE_DATA_PATH = os.path.join(MODELS_DIR, "reference_data.csv")
+# -----------------------------
+# Column mapping
+# -----------------------------
+# Maps raw column names to the snake_case names used by the rest of the project.
+# Note that the delay columns go from PAY_0 straight to PAY_2 (no PAY_1 key),
+# while the BILL_AMT and PAY_AMT columns are numbered 1 through 6.
+COLUMN_RENAME_MAP = {
+    "LIMIT_BAL": "credit_limit",
+    "AGE": "age",
+    "PAY_0": "pay_delay_sep",
+    "PAY_2": "pay_delay_aug",
+    "PAY_3": "pay_delay_jul",
+    "PAY_4": "pay_delay_jun",
+    "PAY_5": "pay_delay_may",
+    "PAY_6": "pay_delay_apr",
+    "BILL_AMT1": "bill_amt_sep",
+    "BILL_AMT2": "bill_amt_aug",
+    "BILL_AMT3": "bill_amt_jul",
+    "BILL_AMT4": "bill_amt_jun",
+    "BILL_AMT5": "bill_amt_may",
+    "BILL_AMT6": "bill_amt_apr",
+    "PAY_AMT1": "pay_amt_sep",
+    "PAY_AMT2": "pay_amt_aug",
+    "PAY_AMT3": "pay_amt_jul",
+    "PAY_AMT4": "pay_amt_jun",
+    "PAY_AMT5": "pay_amt_may",
+    "PAY_AMT6": "pay_amt_apr",
+    "default.payment.next.month": "target"
+}
+# -----------------------------
+# Feature selection (frozen)
+# -----------------------------
+# Renamed columns kept as model features. scripts/train.py declares the same
+# list, so the two need to stay in sync.
+FEATURE_COLUMNS = [
+    "credit_limit",
+    "age",
+    "pay_delay_sep",
+    "pay_delay_aug",
+    "bill_amt_sep",
+    "bill_amt_aug",
+    "pay_amt_sep",
+    "pay_amt_aug",
+]
+# Renamed label column ("default.payment.next.month" in the raw data).
+TARGET_COLUMN = "target"
+# -----------------------------
+# Main logic
+# -----------------------------
+def main():
+    # Create directories if missing
+    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
+    os.makedirs(MODELS_DIR, exist_ok=True)
+    # Load raw data
+    df = pd.read_csv(RAW_DATA_PATH)
+    # Drop ID column (not a feature)
+    if "ID" in df.columns:
+        df = df.drop(columns=["ID"])
+    # Rename columns
+    df = df.rename(columns=COLUMN_RENAME_MAP)
+    # Keep only selected features + target
+    required_columns = FEATURE_COLUMNS + [TARGET_COLUMN]
+    df = df[required_columns]
+    # Basic sanity checks
+    if df.isnull().any().any():
+        raise ValueError("Null values detected after preprocessing.")
+    # Save fully cleaned dataset
+    df.to_csv(CLEAN_DATA_PATH, index=False)
+    # Reference / current split (time-simulated, deterministic)
+    reference_df, current_df = train_test_split(
+        df,
+        test_size=0.3,
+        shuffle=False
+    )
+    # Persist splits
+    reference_df.to_csv(REFERENCE_DATA_PATH, index=False)
+    current_df.to_csv(CURRENT_DATA_PATH, index=False)
+    print("Data preparation completed successfully.")
+    print(f"Clean data saved to: {CLEAN_DATA_PATH}")
+    print(f"Reference data saved to: {REFERENCE_DATA_PATH}")
+    print(f"Current data saved to: {CURRENT_DATA_PATH}")
+if __name__ == "__main__":
+    main()

scripts/train.py CHANGED Viewed

	@@ -1 +1,80 @@
1	# offline training

 # offline training
+import os
+import json
+import joblib
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, roc_auc_score
+from sklearn.model_selection import train_test_split
+# -----------------------------
+# Paths
+# -----------------------------
+# Input: cleaned dataset (the path scripts/prepare_data.py writes its clean CSV to).
+DATA_PATH = "data/processed/credit_default_clean.csv"
+# Output directory for the trained artifacts; main() creates it when missing.
+MODEL_DIR = "models/v1"
+# Outputs: the serialized model and the JSON list of feature names.
+MODEL_PATH = os.path.join(MODEL_DIR, "model.pkl")
+FEATURES_PATH = os.path.join(MODEL_DIR, "features.json")
+# -----------------------------
+# Columns
+# -----------------------------
+# Feature columns used for training, in the order written to features.json.
+# scripts/prepare_data.py declares the same list, so the two need to stay in sync.
+FEATURE_COLUMNS = [
+    "credit_limit",
+    "age",
+    "pay_delay_sep",
+    "pay_delay_aug",
+    "bill_amt_sep",
+    "bill_amt_aug",
+    "pay_amt_sep",
+    "pay_amt_aug",
+]
+# Label column of the cleaned dataset.
+TARGET_COLUMN = "target"
+# -----------------------------
+# Main
+# -----------------------------
+def main():
+    os.makedirs(MODEL_DIR, exist_ok=True)
+    df = pd.read_csv(DATA_PATH)
+    X = df[FEATURE_COLUMNS]
+    y = df[TARGET_COLUMN]
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.2, random_state=42, stratify=y
+    )
+    model = LogisticRegression(
+        max_iter=1000,
+        solver="lbfgs"
+    )
+    model.fit(X_train, y_train)
+    # Evaluation
+    y_pred = model.predict(X_val)
+    y_proba = model.predict_proba(X_val)[:, 1]
+    acc = accuracy_score(y_val, y_pred)
+    roc = roc_auc_score(y_val, y_proba)
+    print(f"Validation Accuracy: {acc:.4f}")
+    print(f"Validation ROC-AUC: {roc:.4f}")
+    # Persist artifacts
+    joblib.dump(model, MODEL_PATH)
+    with open(FEATURES_PATH, "w") as f:
+        json.dump(FEATURE_COLUMNS, f, indent=2)
+    print("Model and features saved successfully.")
+if __name__ == "__main__":
+    main()