LeonardoMdSA commited on
Commit
b4fadea
·
1 Parent(s): a0f3d24

before resolving dependencies

Browse files
app.db ADDED
Binary file (12.3 kB). View file
 
app/api/routes.py CHANGED
@@ -1 +1,40 @@
1
  # /predict, /health, /dashboard
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# /predict, /health, /dashboard

from fastapi import APIRouter

from app.api.schemas import PredictionRequest, PredictionResponse
from app.core.logging import log_prediction
from app.inference.predictor import Predictor
from app.monitoring.data_loader import load_production_data
from app.monitoring.drift import run_drift_check

router = APIRouter()
predictor = Predictor()


@router.post("/predict", response_model=PredictionResponse)
def predict(request: PredictionRequest):
    """Score one observation, log the call to SQLite, and return the result."""
    features = request.dict()
    label, score = predictor.predict(features)

    # Persist every scored request so drift checks have production data.
    log_prediction(features, label, score)

    return {"prediction": label, "probability": score}


@router.get("/health")
def health():
    """Liveness probe."""
    return {"status": "ok"}


@router.get("/run-drift")
def run_drift():
    """Run an Evidently drift check over recently logged production inputs."""
    recent = load_production_data()
    path = run_drift_check(recent)

    return {"status": "drift_check_completed", "report_path": path}
app/api/schemas.py CHANGED
@@ -1 +1,20 @@
1
  # Pydantic input/output schemas
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Pydantic input/output schemas

from pydantic import BaseModel
from typing import Dict  # NOTE(review): unused at the moment — remove or use when dashboard schemas land


class PredictionRequest(BaseModel):
    """Input payload for POST /predict.

    Field names match the frozen feature list in models/v1/features.json
    (renamed columns of the credit-default dataset; *_sep / *_aug are
    presumably the September/August billing cycles — confirm against
    scripts/prepare_data.py's COLUMN_RENAME_MAP).
    """
    credit_limit: float
    age: int
    pay_delay_sep: int
    pay_delay_aug: int
    bill_amt_sep: float
    bill_amt_aug: float
    pay_amt_sep: float
    pay_amt_aug: float


class PredictionResponse(BaseModel):
    """Output of POST /predict: binary label and positive-class probability."""
    prediction: int
    probability: float
app/core/config.py CHANGED
@@ -1 +1,6 @@
1
  # env vars, paths, thresholds
 
 
 
 
 
 
1
# env vars, paths, thresholds

import os

# Defaults match the committed v1 artifacts. Each setting can be overridden
# through an environment variable of the same name, which this module's
# header ("env vars, paths, thresholds") says is its job; previously no env
# var was actually read.
MODEL_VERSION = os.getenv("MODEL_VERSION", "v1")
MODEL_PATH = os.getenv("MODEL_PATH", "models/v1/model.pkl")
FEATURES_PATH = os.getenv("FEATURES_PATH", "models/v1/features.json")
DB_PATH = os.getenv("DB_PATH", "app.db")
app/core/logging.py CHANGED
@@ -1 +1,52 @@
1
  # SQLite + file logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# SQLite + file logging

import json
import sqlite3
from contextlib import closing
from datetime import datetime, timezone

from app.core.config import DB_PATH, MODEL_VERSION


def get_connection():
    """Open a short-lived connection to the prediction-log database.

    check_same_thread=False allows the connection to be used from FastAPI
    worker threads; every caller still opens and closes its own connection.
    """
    return sqlite3.connect(DB_PATH, check_same_thread=False)


def init_db():
    """Create the predictions table if it does not exist yet."""
    # closing() guarantees the connection is released even if DDL fails
    # (the previous version leaked it on any exception).
    with closing(get_connection()) as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS predictions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp TEXT,
                model_version TEXT,
                input_features TEXT,
                prediction INTEGER,
                probability REAL
            )
        """)
        conn.commit()


def log_prediction(features: dict, prediction: int, probability: float):
    """Append one prediction record; the input features are stored as JSON.

    Args:
        features: raw request payload (JSON-serializable dict).
        prediction: binary label returned to the caller.
        probability: positive-class probability returned to the caller.
    """
    with closing(get_connection()) as conn:
        conn.execute(
            """
            INSERT INTO predictions
            (timestamp, model_version, input_features, prediction, probability)
            VALUES (?, ?, ?, ?, ?)
            """,
            (
                # Timezone-aware UTC timestamp; datetime.utcnow() is naive
                # and deprecated since Python 3.12.
                datetime.now(timezone.utc).isoformat(),
                MODEL_VERSION,
                json.dumps(features),
                prediction,
                probability,
            ),
        )
        conn.commit()
app/inference/predictor.py CHANGED
@@ -1 +1,19 @@
1
  # model.predict wrapper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model.predict wrapper

import json
import joblib
import numpy as np
from app.core.config import MODEL_PATH, FEATURES_PATH


class Predictor:
    """Thin wrapper around the persisted model and its frozen feature list."""

    def __init__(self):
        # Load both artifacts once, at construction time.
        self.model = joblib.load(MODEL_PATH)
        with open(FEATURES_PATH, "r") as fh:
            self.features = json.load(fh)

    def predict(self, payload: dict):
        """Return (binary label, positive-class probability) for one payload.

        Raises KeyError if the payload is missing any frozen feature.
        """
        row = [payload[name] for name in self.features]
        matrix = np.array([row])
        proba = self.model.predict_proba(matrix)[0, 1]
        # Fixed 0.5 decision threshold.
        label = int(proba >= 0.5)
        return label, float(proba)
app/main.py CHANGED
@@ -1 +1,17 @@
1
  # FastAPI entrypoint
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# FastAPI entrypoint

import os

from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles

from app.api.routes import router
from app.core.logging import init_db

app = FastAPI(title="ML Inference Service")

init_db()
app.include_router(router)

# StaticFiles raises at startup when its directory is missing; create it so
# a fresh checkout (no drift report generated yet) can still boot.
os.makedirs("reports", exist_ok=True)

app.mount(
    "/reports",
    StaticFiles(directory="reports"),
    name="reports",
)
app/monitoring/data_loader.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Load production data from SQLite

import sqlite3
import json
import pandas as pd
from app.core.config import DB_PATH


def load_production_data(limit: int = 1000) -> pd.DataFrame:
    """Return the most recently logged input payloads as a DataFrame.

    Args:
        limit: maximum number of rows to load, newest first.

    Raises:
        ValueError: when the predictions table holds no rows yet.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.execute(
            """
            SELECT input_features
            FROM predictions
            ORDER BY id DESC
            LIMIT ?
            """,
            (limit,),
        )
        rows = cursor.fetchall()
    finally:
        # Close even when the query fails (e.g. table missing before
        # init_db ran) — the previous version leaked the connection.
        conn.close()

    if not rows:
        raise ValueError("No production data available for drift detection.")

    # input_features is a JSON blob per row; expand into columns.
    records = [json.loads(payload) for (payload,) in rows]
    return pd.DataFrame(records)
app/monitoring/drift.py CHANGED
@@ -1 +1,29 @@
1
  # Evidently logic
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Evidently logic

import os
import pandas as pd
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

REFERENCE_DATA_PATH = "models/v1/reference_data.csv"
REPORT_DIR = "reports/evidently"
REPORT_PATH = os.path.join(REPORT_DIR, "drift_report.html")


def run_drift_check(current_df: pd.DataFrame):
    """Compare current production data against the frozen reference split.

    Writes an HTML data-drift report to REPORT_PATH and returns that path.
    """
    reference_df = pd.read_csv(REFERENCE_DATA_PATH)

    os.makedirs(REPORT_DIR, exist_ok=True)

    drift_report = Report(metrics=[DataDriftPreset()])
    # The reference split still carries the label column; drop it so both
    # frames share the same feature-only schema.
    drift_report.run(
        reference_data=reference_df.drop(columns=["target"]),
        current_data=current_df,
    )
    drift_report.save_html(REPORT_PATH)

    return REPORT_PATH
data/processed/credit_default_clean.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/processed/current_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/raw/credit_default.csv ADDED
The diff for this file is too large to render. See raw diff
 
models/v1/features.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "credit_limit",
3
+ "age",
4
+ "pay_delay_sep",
5
+ "pay_delay_aug",
6
+ "bill_amt_sep",
7
+ "bill_amt_aug",
8
+ "pay_amt_sep",
9
+ "pay_amt_aug"
10
+ ]
models/v1/model.pkl ADDED
Binary file (1.28 kB). View file
 
models/v1/reference_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements-dev.txt CHANGED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ evidently==0.4.15
2
+ fastapi
3
+ uvicorn
4
+ pandas
5
+ scikit-learn
requirements.txt CHANGED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ evidently==0.4.15
2
+ fastapi
3
+ uvicorn
4
+ pandas
5
+ scikit-learn
scripts/prepare_data.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Preparing data

import os
import pandas as pd
from sklearn.model_selection import train_test_split


# -----------------------------
# Paths
# -----------------------------
RAW_DATA_PATH = "data/raw/credit_default.csv"
PROCESSED_DATA_DIR = "data/processed"
MODELS_DIR = "models/v1"

CLEAN_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "credit_default_clean.csv")
CURRENT_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "current_data.csv")
REFERENCE_DATA_PATH = os.path.join(MODELS_DIR, "reference_data.csv")


# -----------------------------
# Column mapping
# -----------------------------
# Raw UCI-style column names -> readable snake_case names.
COLUMN_RENAME_MAP = {
    "LIMIT_BAL": "credit_limit",
    "AGE": "age",

    "PAY_0": "pay_delay_sep",
    "PAY_2": "pay_delay_aug",
    "PAY_3": "pay_delay_jul",
    "PAY_4": "pay_delay_jun",
    "PAY_5": "pay_delay_may",
    "PAY_6": "pay_delay_apr",

    "BILL_AMT1": "bill_amt_sep",
    "BILL_AMT2": "bill_amt_aug",
    "BILL_AMT3": "bill_amt_jul",
    "BILL_AMT4": "bill_amt_jun",
    "BILL_AMT5": "bill_amt_may",
    "BILL_AMT6": "bill_amt_apr",

    "PAY_AMT1": "pay_amt_sep",
    "PAY_AMT2": "pay_amt_aug",
    "PAY_AMT3": "pay_amt_jul",
    "PAY_AMT4": "pay_amt_jun",
    "PAY_AMT5": "pay_amt_may",
    "PAY_AMT6": "pay_amt_apr",

    "default.payment.next.month": "target"
}


# -----------------------------
# Feature selection (frozen)
# -----------------------------
# Must stay in sync with models/v1/features.json and scripts/train.py.
FEATURE_COLUMNS = [
    "credit_limit",
    "age",
    "pay_delay_sep",
    "pay_delay_aug",
    "bill_amt_sep",
    "bill_amt_aug",
    "pay_amt_sep",
    "pay_amt_aug",
]

TARGET_COLUMN = "target"


# -----------------------------
# Main logic
# -----------------------------
def main():
    """Clean the raw dataset and produce reference/current splits for drift."""
    # Create directories if missing
    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
    os.makedirs(MODELS_DIR, exist_ok=True)

    # Load raw data
    df = pd.read_csv(RAW_DATA_PATH)

    # Drop ID column (not a feature)
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])

    # Rename columns
    df = df.rename(columns=COLUMN_RENAME_MAP)

    # Keep only selected features + target.
    # Fail with an actionable message instead of a bare KeyError when the
    # raw file's column names drift out of sync with COLUMN_RENAME_MAP.
    required_columns = FEATURE_COLUMNS + [TARGET_COLUMN]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"Missing expected columns after rename: {missing}")
    df = df[required_columns]

    # Basic sanity checks
    if df.isnull().any().any():
        raise ValueError("Null values detected after preprocessing.")

    # Save fully cleaned dataset
    df.to_csv(CLEAN_DATA_PATH, index=False)

    # Reference / current split (time-simulated, deterministic).
    # shuffle=False keeps row order, so the "current" chunk simulates the
    # most recent data.
    reference_df, current_df = train_test_split(
        df,
        test_size=0.3,
        shuffle=False
    )

    # Persist splits
    reference_df.to_csv(REFERENCE_DATA_PATH, index=False)
    current_df.to_csv(CURRENT_DATA_PATH, index=False)

    print("Data preparation completed successfully.")
    print(f"Clean data saved to: {CLEAN_DATA_PATH}")
    print(f"Reference data saved to: {REFERENCE_DATA_PATH}")
    print(f"Current data saved to: {CURRENT_DATA_PATH}")


if __name__ == "__main__":
    main()
scripts/train.py CHANGED
@@ -1 +1,80 @@
1
  # offline training
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# offline training
import os
import json
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split


# -----------------------------
# Paths
# -----------------------------
DATA_PATH = "data/processed/credit_default_clean.csv"
MODEL_DIR = "models/v1"

MODEL_PATH = os.path.join(MODEL_DIR, "model.pkl")
FEATURES_PATH = os.path.join(MODEL_DIR, "features.json")


# -----------------------------
# Columns
# -----------------------------
# Frozen feature set shared with scripts/prepare_data.py and the API.
FEATURE_COLUMNS = [
    "credit_limit",
    "age",
    "pay_delay_sep",
    "pay_delay_aug",
    "bill_amt_sep",
    "bill_amt_aug",
    "pay_amt_sep",
    "pay_amt_aug",
]

TARGET_COLUMN = "target"


# -----------------------------
# Main
# -----------------------------
def main():
    """Train the v1 logistic-regression model and persist its artifacts."""
    os.makedirs(MODEL_DIR, exist_ok=True)

    frame = pd.read_csv(DATA_PATH)
    features = frame[FEATURE_COLUMNS]
    labels = frame[TARGET_COLUMN]

    # Deterministic, stratified hold-out for validation metrics.
    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    model = LogisticRegression(max_iter=1000, solver="lbfgs")
    model.fit(X_train, y_train)

    # Evaluation on the hold-out split.
    acc = accuracy_score(y_val, model.predict(X_val))
    roc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])

    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Validation ROC-AUC: {roc:.4f}")

    # Persist the model plus the feature list the inference service loads.
    joblib.dump(model, MODEL_PATH)

    with open(FEATURES_PATH, "w") as fh:
        json.dump(FEATURE_COLUMNS, fh, indent=2)

    print("Model and features saved successfully.")


if __name__ == "__main__":
    main()