LeonardoMdSA commited on
Commit
8e3bbfe
·
1 Parent(s): eafdbbf

Turn the batch workflow into an MLOps service

Browse files
README.md CHANGED
@@ -11,7 +11,7 @@ license: mit
11
 
12
  # Under Construction
13
 
14
- Build a production-ready ML inference service with post-deployment drift detection, governance, and alerting, demonstrating real MLOps practices rather than offline modeling.
15
 
16
  py -3.9 -m venv .venv
17
 
 
11
 
12
  # Under Construction
13
 
14
+ Building a production-ready ML inference service with post-deployment drift detection, governance, and alerting, demonstrating real MLOps practices rather than offline modeling.
15
 
16
  py -3.9 -m venv .venv
17
 
app/api/routes.py CHANGED
@@ -9,15 +9,17 @@ from app.monitoring.drift import run_drift_check
9
  from app.monitoring.governance import run_governance_checks
10
 
11
  import pandas as pd
12
- import numpy as np # for numeric handling
 
13
 
14
  templates = Jinja2Templates(directory="app/templates")
15
-
16
  router = APIRouter()
17
  predictor = Predictor()
18
 
 
 
 
19
 
20
- # CSV upload & prediction
21
  @router.post("/predict")
22
  async def predict_file(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
23
  df = pd.read_csv(file.file)
@@ -38,13 +40,11 @@ async def predict_file(background_tasks: BackgroundTasks, file: UploadFile = Fil
38
  "risk_level": "High" if proba >= 0.75 else "Medium" if proba >= 0.5 else "Low"
39
  })
40
 
41
- # ---- Drift: run once immediately to return chart data ----
42
  reference_df = pd.read_csv("models/v1/reference_data.csv")
43
-
44
- # Correctly get numeric drift scores per column
45
  _, drift_dict = run_drift_check(df[predictor.features], reference_df[predictor.features], "v1")
46
 
47
- # Ensure numeric drift values safe for frontend Plotly chart
48
  drift_for_chart = []
49
  for col, score in drift_dict.items():
50
  try:
@@ -55,19 +55,30 @@ async def predict_file(background_tasks: BackgroundTasks, file: UploadFile = Fil
55
  score_value = 0.0
56
  drift_for_chart.append({"column": col, "score": score_value})
57
 
58
- # Schedule full drift in background as before
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  background_tasks.add_task(run_drift_check, df[predictor.features], reference_df[predictor.features], "v1")
60
 
61
  return JSONResponse({"n_rows": len(results), "results": results, "drift": drift_for_chart})
62
 
63
 
64
- # Health
65
  @router.get("/health")
66
  def health():
67
  return {"status": "ok"}
68
 
69
 
70
- # Manual drift run
71
  @router.get("/run-drift")
72
  def run_drift():
73
  current_df = load_production_data()
@@ -75,7 +86,6 @@ def run_drift():
75
  return {"status": "drift_check_completed", "report_path": report_path}
76
 
77
 
78
- # Monitoring pipeline
79
  @router.get("/monitoring/run")
80
  def monitoring_run(background_tasks: BackgroundTasks, model_version: str = "v1"):
81
  current_data = pd.read_csv("data/processed/current_data.csv")
@@ -87,7 +97,6 @@ def monitoring_run(background_tasks: BackgroundTasks, model_version: str = "v1")
87
  return {"status": "monitoring triggered", "model_version": model_version}
88
 
89
 
90
- # Dashboard
91
  @router.get("/dashboard")
92
  def dashboard(request: Request):
93
  return templates.TemplateResponse("dashboard.html", {"request": request})
 
9
  from app.monitoring.governance import run_governance_checks
10
 
11
  import pandas as pd
12
+ import numpy as np
13
+ import os
14
 
15
  templates = Jinja2Templates(directory="app/templates")
 
16
  router = APIRouter()
17
  predictor = Predictor()
18
 
19
+ # Production log file
20
+ PROD_LOG = "data/production/predictions_log.csv"
21
+
22
 
 
23
  @router.post("/predict")
24
  async def predict_file(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
25
  df = pd.read_csv(file.file)
 
40
  "risk_level": "High" if proba >= 0.75 else "Medium" if proba >= 0.5 else "Low"
41
  })
42
 
43
+ # ---- Drift: immediate for frontend ----
44
  reference_df = pd.read_csv("models/v1/reference_data.csv")
 
 
45
  _, drift_dict = run_drift_check(df[predictor.features], reference_df[predictor.features], "v1")
46
 
47
+ # Safe numeric drift values for chart
48
  drift_for_chart = []
49
  for col, score in drift_dict.items():
50
  try:
 
55
  score_value = 0.0
56
  drift_for_chart.append({"column": col, "score": score_value})
57
 
58
+ # ---- Append predictions to production log ----
59
+ df_log = df.copy()
60
+ df_log["prediction"] = preds
61
+ df_log["probability"] = probas
62
+ df_log["model_version"] = predictor.model_version
63
+ df_log["timestamp"] = pd.Timestamp.utcnow()
64
+
65
+ os.makedirs(os.path.dirname(PROD_LOG), exist_ok=True)
66
+ if not os.path.exists(PROD_LOG):
67
+ df_log.to_csv(PROD_LOG, index=False)
68
+ else:
69
+ df_log.to_csv(PROD_LOG, mode="a", header=False, index=False)
70
+
71
+ # ---- Background full drift check ----
72
  background_tasks.add_task(run_drift_check, df[predictor.features], reference_df[predictor.features], "v1")
73
 
74
  return JSONResponse({"n_rows": len(results), "results": results, "drift": drift_for_chart})
75
 
76
 
 
77
  @router.get("/health")
78
  def health():
79
  return {"status": "ok"}
80
 
81
 
 
82
  @router.get("/run-drift")
83
  def run_drift():
84
  current_df = load_production_data()
 
86
  return {"status": "drift_check_completed", "report_path": report_path}
87
 
88
 
 
89
  @router.get("/monitoring/run")
90
  def monitoring_run(background_tasks: BackgroundTasks, model_version: str = "v1"):
91
  current_data = pd.read_csv("data/processed/current_data.csv")
 
97
  return {"status": "monitoring triggered", "model_version": model_version}
98
 
99
 
 
100
  @router.get("/dashboard")
101
  def dashboard(request: Request):
102
  return templates.TemplateResponse("dashboard.html", {"request": request})
app/inference/predictor.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  # model.predict wrapper
2
  import json
3
  import joblib
@@ -11,6 +13,8 @@ class Predictor:
11
  with open(FEATURES_PATH, "r") as f:
12
  self.features = json.load(f)
13
 
 
 
14
  def predict(self, df):
15
  X = df[self.features]
16
  probas = self.model.predict_proba(X)[:, 1]
 
1
+ # app/inference/predictor.py
2
+
3
  # model.predict wrapper
4
  import json
5
  import joblib
 
13
  with open(FEATURES_PATH, "r") as f:
14
  self.features = json.load(f)
15
 
16
+ self.model_version = "v1"
17
+
18
  def predict(self, df):
19
  X = df[self.features]
20
  probas = self.model.predict_proba(X)[:, 1]
data/production/predictions_log.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ 50000.0,22,0,0,45458.0,46450.0,2051.0,2200.0,1,0,0.20854088888292008,v1,2026-01-14 13:41:57.526400+00:00
reports/evidently/drift_report.html CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/simulate_inference.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/simulate_inference.py
"""Simulate production inference traffic.

Samples a handful of rows from the processed dataset and POSTs them as a
CSV file upload to the local FastAPI /predict endpoint, then prints the
server's response. Intended to be run repeatedly (e.g. from cron or a
shell loop) to exercise the drift-monitoring pipeline.
"""
import random

import pandas as pd
import requests


def main() -> None:
    """Send one randomly sized sample (1-5 rows) to the predict endpoint."""
    df = pd.read_csv("data/processed/current_data.csv")

    # Sample 1-5 rows randomly to mimic small, bursty inference requests.
    sample = df.sample(random.randint(1, 5))
    csv_bytes = sample.to_csv(index=False).encode("utf-8")

    # POST to FastAPI predict endpoint as a multipart file upload.
    # The timeout keeps the script from hanging forever if the server is down.
    response = requests.post(
        "http://localhost:8000/predict",
        files={"file": ("sample.csv", csv_bytes, "text/csv")},
        timeout=30,
    )

    print("Status:", response.status_code)
    try:
        print("Response:", response.json())
    except ValueError:
        # response.json() raises ValueError (json.JSONDecodeError subclass)
        # when the body is not valid JSON — e.g. an HTML error page.
        print("Server returned non-JSON response:")
        print(response.text)


if __name__ == "__main__":
    main()