muddasser commited on
Commit
1669aed
·
verified ·
1 Parent(s): e7070db

Upload monitor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. monitor.py +165 -158
monitor.py CHANGED
@@ -2,26 +2,19 @@ import pandas as pd
2
  import numpy as np
3
  import json
4
  import os
5
- from datetime import datetime, timedelta
6
- from prophet import Prophet
7
- from evidently.report import Report
8
- from evidently.metric_preset import DataDriftPreset
9
- from evidently.metrics import DatasetDriftMetric
10
 
11
  # ================================================
12
  # CONFIGURATION
13
  # ================================================
14
- REFERENCE_DATA_PATH = "data/train_data.csv" # original training data
15
- LIVE_DATA_PATH = "data/live_data.csv" # recent production data
16
  REPORT_PATH = "reports/drift_report.html"
17
- DRIFT_THRESHOLD = 0.5 # 50% features drifted = alert
18
 
19
  # ================================================
20
- # STEP 1 - Generate or Load Reference Data
21
- # (In production this comes from S3)
22
  # ================================================
23
  def load_reference_data():
24
- """Load original training data used to train Prophet"""
25
  if os.path.exists(REFERENCE_DATA_PATH):
26
  print("Loading reference data from file...")
27
  df = pd.read_csv(REFERENCE_DATA_PATH)
@@ -34,20 +27,18 @@ def load_reference_data():
34
  yearly = 30 * np.sin(2 * np.pi * np.arange(len(dates)) / 365)
35
  noise = np.random.normal(0, 8, len(dates))
36
  df = pd.DataFrame({
37
- "ds": dates,
38
- "y": (trend + yearly + noise).clip(min=10),
39
- "month": dates.month,
40
  "dayofweek": dates.dayofweek,
41
- "quarter": dates.quarter
42
  })
43
  return df
44
 
45
  # ================================================
46
- # STEP 2 - Generate or Load Live Production Data
47
- # (In production this comes from S3 daily logs)
48
  # ================================================
49
  def load_live_data():
50
- """Load recent production data — last 30 days"""
51
  if os.path.exists(LIVE_DATA_PATH):
52
  print("Loading live data from file...")
53
  df = pd.read_csv(LIVE_DATA_PATH)
@@ -56,10 +47,8 @@ def load_live_data():
56
  print("Generating sample live data (simulating drift)...")
57
  np.random.seed(99)
58
  dates = pd.date_range(start="2024-01-01", end="2024-01-31", freq="D")
59
-
60
- # Simulating drift different distribution than training
61
- trend = np.linspace(200, 250, len(dates)) # higher demand (drift!)
62
- noise = np.random.normal(0, 20, len(dates)) # more noise (drift!)
63
  df = pd.DataFrame({
64
  "ds": dates,
65
  "y": (trend + noise).clip(min=10),
@@ -70,185 +59,203 @@ def load_live_data():
70
  return df
71
 
72
  # ================================================
73
- # STEP 3 - Run Prophet Forecast on Live Data
74
- # ================================================
75
- def get_forecast_metrics(reference_df, live_df):
76
- """Train Prophet on reference data and evaluate on live data"""
77
- print("Training Prophet model on reference data...")
78
-
79
- model = Prophet(
80
- seasonality_mode="multiplicative",
81
- yearly_seasonality=True,
82
- weekly_seasonality=True
83
- )
84
- model.fit(reference_df[['ds', 'y']])
85
-
86
- # Forecast for live period
87
- future = model.make_future_dataframe(
88
- periods=len(live_df),
89
- freq='D'
90
- )
91
- forecast = model.predict(future)
92
- forecast_live = forecast.tail(len(live_df))
93
-
94
- # Calculate metrics
95
- actual = live_df['y'].values
96
- predicted = forecast_live['yhat'].values
97
-
98
- rmse = np.sqrt(np.mean((actual - predicted) ** 2))
99
- mae = np.mean(np.abs(actual - predicted))
100
- mape = np.mean(np.abs((actual - predicted) / actual)) * 100
101
-
102
- print(f"RMSE: {rmse:.4f}")
103
- print(f"MAE: {mae:.4f}")
104
- print(f"MAPE: {mape:.4f}%")
105
-
106
- return {
107
- "rmse": round(rmse, 4),
108
- "mae": round(mae, 4),
109
- "mape": round(mape, 4)
110
- }
111
-
112
- # ================================================
113
- # STEP 4 - Run Evidently Drift Report
114
  # ================================================
115
- def run_drift_report(reference_df, live_df):
116
- """Run Evidently AI drift detection"""
117
- print("Running Evidently AI drift detection...")
118
 
119
- # Use only numeric feature columns
120
  feature_cols = ['y', 'month', 'dayofweek', 'quarter']
121
-
122
- reference_features = reference_df[feature_cols].copy()
123
- live_features = live_df[feature_cols].copy()
124
-
125
- # Run Evidently report
126
- report = Report(metrics=[
127
- DataDriftPreset(),
128
- DatasetDriftMetric()
129
- ])
130
-
131
- report.run(
132
- reference_data=reference_features,
133
- current_data=live_features
134
- )
135
-
136
- # Extract drift results
137
- report_dict = report.as_dict()
138
- drift_detected = report_dict['metrics'][1]['result']['dataset_drift']
139
- drift_share = report_dict['metrics'][1]['result']['share_of_drifted_columns']
140
-
141
- # Save HTML report
142
- os.makedirs("reports", exist_ok=True)
143
- report.save_html(REPORT_PATH)
144
- print(f"Drift report saved to: {REPORT_PATH}")
 
 
 
 
 
 
 
 
145
 
146
  return {
147
  "drift_detected": drift_detected,
148
  "drift_share": round(drift_share, 4),
149
- "report_path": REPORT_PATH
150
  }
151
 
152
  # ================================================
153
- # STEP 5 - Send Alert (Email/Slack/SNS)
154
  # ================================================
155
- def send_alert(drift_results, forecast_metrics):
156
- """Send alert when drift is detected"""
157
-
158
- message = f"""
159
- ╔══════════════════════════════════════╗
160
- ║ 🚨 DATA DRIFT ALERT DETECTED! ║
161
- ╚══════════════════════════════════════╝
162
-
163
- Detected at: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- 📊 Drift Results:
166
- ├── Drift Detected: {drift_results['drift_detected']}
167
- └── Drifted Features: {drift_results['drift_share']*100:.1f}%
 
 
168
 
169
- 📈 Prophet Forecast Metrics on Live Data:
170
- ├── RMSE: {forecast_metrics['rmse']}
171
- ├── MAE: {forecast_metrics['mae']}
172
- └── MAPE: {forecast_metrics['mape']}%
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
- 📋 Full Report: {drift_results['report_path']}
 
 
175
 
176
- Action Required:
177
- 1. Review drift report
178
- 2. Check if retraining is needed
179
- 3. Push updated code to GitHub
180
- to trigger CI/CD retraining
 
 
 
 
 
181
  """
182
-
183
  print(message)
184
 
185
- # ── In production on AWS uncomment this: ──
186
- # import boto3
187
- # sns = boto3.client('sns', region_name='us-east-1')
188
- # sns.publish(
189
- # TopicArn='arn:aws:sns:us-east-1:YOUR_ID:drift-alerts',
190
- # Subject='Data Drift Detected in Travel Prophet!',
191
- # Message=message
192
- # )
193
-
194
- # ── For Slack notifications uncomment this: ──
195
- # import requests
196
- # requests.post(
197
- # os.environ['SLACK_WEBHOOK_URL'],
198
- # json={"text": message}
199
- # )
200
-
201
  # ================================================
202
- # MAIN — Run Full Monitoring Pipeline
203
  # ================================================
204
  def main():
205
  print("=" * 50)
206
- print(" Travel Prophet Drift Monitor")
207
- print(f" Running at: {datetime.now()}")
208
  print("=" * 50)
209
 
210
- # Load data
211
- reference_df = load_reference_data()
212
- live_df = load_live_data()
213
 
214
- print(f"\nReference data: {len(reference_df)} rows")
215
- print(f"Live data: {len(live_df)} rows")
216
 
217
- # Get forecast metrics
218
  print("\n--- Forecast Metrics ---")
219
  forecast_metrics = get_forecast_metrics(reference_df, live_df)
220
 
221
- # Run drift detection
222
  print("\n--- Drift Detection ---")
223
- drift_results = run_drift_report(reference_df, live_df)
224
 
225
- # Check results and alert
226
  print("\n--- Results ---")
227
  if drift_results['drift_detected']:
228
- print("🚨 DRIFT DETECTED — Sending alert!")
229
  send_alert(drift_results, forecast_metrics)
230
  else:
231
- print("No drift detected model is healthy!")
232
- print(f" Drifted features: {drift_results['drift_share']*100:.1f}%")
233
- print(f" RMSE: {forecast_metrics['rmse']}")
234
 
235
- # Save results summary
236
  summary = {
237
- "timestamp": datetime.now().isoformat(),
238
- "drift_detected": drift_results['drift_detected'],
239
- "drift_share": drift_results['drift_share'],
240
- "rmse": forecast_metrics['rmse'],
241
- "mae": forecast_metrics['mae'],
242
- "mape": forecast_metrics['mape']
243
  }
244
 
245
- os.makedirs("reports", exist_ok=True)
246
  with open("reports/monitoring_summary.json", "w") as f:
247
  json.dump(summary, f, indent=2)
248
 
249
- print("\nMonitoring summary saved to reports/monitoring_summary.json")
250
  print("=" * 50)
251
- print("Monitoring complete!")
252
 
253
  if __name__ == "__main__":
254
  main()
 
2
  import numpy as np
3
  import json
4
  import os
5
+ from datetime import datetime
 
 
 
 
6
 
7
  # ================================================
8
  # CONFIGURATION
9
  # ================================================
10
+ REFERENCE_DATA_PATH = "data/train_data.csv"
11
+ LIVE_DATA_PATH = "data/live_data.csv"
12
  REPORT_PATH = "reports/drift_report.html"
 
13
 
14
  # ================================================
15
+ # STEP 1 - Load Reference Data
 
16
  # ================================================
17
  def load_reference_data():
 
18
  if os.path.exists(REFERENCE_DATA_PATH):
19
  print("Loading reference data from file...")
20
  df = pd.read_csv(REFERENCE_DATA_PATH)
 
27
  yearly = 30 * np.sin(2 * np.pi * np.arange(len(dates)) / 365)
28
  noise = np.random.normal(0, 8, len(dates))
29
  df = pd.DataFrame({
30
+ "ds": dates,
31
+ "y": (trend + yearly + noise).clip(min=10),
32
+ "month": dates.month,
33
  "dayofweek": dates.dayofweek,
34
+ "quarter": dates.quarter
35
  })
36
  return df
37
 
38
  # ================================================
39
+ # STEP 2 - Load Live Data
 
40
  # ================================================
41
  def load_live_data():
 
42
  if os.path.exists(LIVE_DATA_PATH):
43
  print("Loading live data from file...")
44
  df = pd.read_csv(LIVE_DATA_PATH)
 
47
  print("Generating sample live data (simulating drift)...")
48
  np.random.seed(99)
49
  dates = pd.date_range(start="2024-01-01", end="2024-01-31", freq="D")
50
+ trend = np.linspace(200, 250, len(dates))
51
+ noise = np.random.normal(0, 20, len(dates))
 
 
52
  df = pd.DataFrame({
53
  "ds": dates,
54
  "y": (trend + noise).clip(min=10),
 
59
  return df
60
 
61
  # ================================================
62
+ # STEP 3 - Run Drift Detection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  # ================================================
64
+ def run_drift_detection(reference_df, live_df):
65
+ print("Running drift detection...")
 
66
 
 
67
  feature_cols = ['y', 'month', 'dayofweek', 'quarter']
68
+ ref = reference_df[feature_cols].copy()
69
+ curr = live_df[feature_cols].copy()
70
+
71
+ drift_results = {}
72
+ drifted_count = 0
73
+
74
+ for col in feature_cols:
75
+ ref_mean = ref[col].mean()
76
+ curr_mean = curr[col].mean()
77
+ ref_std = ref[col].std()
78
+
79
+ # Simple z-score drift detection
80
+ if ref_std > 0:
81
+ z_score = abs(curr_mean - ref_mean) / ref_std
82
+ drifted = z_score > 2.0
83
+ else:
84
+ drifted = False
85
+
86
+ drift_results[col] = {
87
+ "ref_mean": round(ref_mean, 4),
88
+ "curr_mean": round(curr_mean, 4),
89
+ "drifted": drifted
90
+ }
91
+
92
+ if drifted:
93
+ drifted_count += 1
94
+ print(f" DRIFT in {col}: ref={ref_mean:.2f} curr={curr_mean:.2f}")
95
+ else:
96
+ print(f" OK {col}: ref={ref_mean:.2f} curr={curr_mean:.2f}")
97
+
98
+ drift_share = drifted_count / len(feature_cols)
99
+ drift_detected = drift_share > 0.5
100
 
101
  return {
102
  "drift_detected": drift_detected,
103
  "drift_share": round(drift_share, 4),
104
+ "feature_drift": drift_results
105
  }
106
 
107
  # ================================================
108
+ # STEP 4 - Get Forecast Metrics
109
  # ================================================
110
+ def get_forecast_metrics(reference_df, live_df):
111
+ print("Calculating forecast metrics...")
112
+ try:
113
+ from prophet import Prophet
114
+ model = Prophet(
115
+ seasonality_mode="multiplicative",
116
+ yearly_seasonality=True
117
+ )
118
+ model.fit(reference_df[['ds', 'y']])
119
+ future = model.make_future_dataframe(periods=len(live_df), freq='D')
120
+ forecast = model.predict(future)
121
+ forecast_live = forecast.tail(len(live_df))
122
+
123
+ actual = live_df['y'].values
124
+ predicted = forecast_live['yhat'].values
125
+ rmse = np.sqrt(np.mean((actual - predicted) ** 2))
126
+ mae = np.mean(np.abs(actual - predicted))
127
+ mape = np.mean(np.abs((actual - predicted) / actual)) * 100
128
+
129
+ print(f" RMSE: {rmse:.4f}")
130
+ print(f" MAE: {mae:.4f}")
131
+ print(f" MAPE: {mape:.4f}%")
132
+
133
+ return {"rmse": round(rmse, 4), "mae": round(mae, 4), "mape": round(mape, 4)}
134
+
135
+ except Exception as e:
136
+ print(f" Prophet metrics skipped: {e}")
137
+ return {"rmse": None, "mae": None, "mape": None}
138
 
139
+ # ================================================
140
+ # STEP 5 - Save HTML Report
141
+ # ================================================
142
+ def save_report(drift_results, forecast_metrics):
143
+ os.makedirs("reports", exist_ok=True)
144
 
145
+ drift_color = "red" if drift_results['drift_detected'] else "green"
146
+ drift_text = "DRIFT DETECTED" if drift_results['drift_detected'] else "NO DRIFT"
147
+
148
+ rows = ""
149
+ for col, result in drift_results['feature_drift'].items():
150
+ color = "red" if result['drifted'] else "green"
151
+ rows += f"""
152
+ <tr>
153
+ <td>{col}</td>
154
+ <td>{result['ref_mean']}</td>
155
+ <td>{result['curr_mean']}</td>
156
+ <td style='color:{color}'>{result['drifted']}</td>
157
+ </tr>"""
158
+
159
+ html = f"""
160
+ <html>
161
+ <head>
162
+ <title>Drift Report</title>
163
+ <style>
164
+ body {{ font-family: Arial; padding: 20px; }}
165
+ table {{ border-collapse: collapse; width: 100%; }}
166
+ th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
167
+ th {{ background-color: #4CAF50; color: white; }}
168
+ .status {{ font-size: 24px; font-weight: bold; color: {drift_color}; }}
169
+ </style>
170
+ </head>
171
+ <body>
172
+ <h1>Travel Prophet — Drift Report</h1>
173
+ <p>Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
174
+
175
+ <h2>Overall Status</h2>
176
+ <p class='status'>{drift_text}</p>
177
+ <p>Drifted Features: {drift_results['drift_share']*100:.1f}%</p>
178
+
179
+ <h2>Forecast Metrics on Live Data</h2>
180
+ <p>RMSE: {forecast_metrics['rmse']}</p>
181
+ <p>MAE: {forecast_metrics['mae']}</p>
182
+ <p>MAPE: {forecast_metrics['mape']}%</p>
183
+
184
+ <h2>Feature Drift Details</h2>
185
+ <table>
186
+ <tr>
187
+ <th>Feature</th>
188
+ <th>Reference Mean</th>
189
+ <th>Current Mean</th>
190
+ <th>Drifted</th>
191
+ </tr>
192
+ {rows}
193
+ </table>
194
+ </body>
195
+ </html>
196
+ """
197
 
198
+ with open(REPORT_PATH, "w") as f:
199
+ f.write(html)
200
+ print(f"Report saved to {REPORT_PATH}")
201
 
202
+ # ================================================
203
+ # STEP 6 - Send Alert
204
+ # ================================================
205
+ def send_alert(drift_results, forecast_metrics):
206
+ message = f"""
207
+ DRIFT ALERT!
208
+ Time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
209
+ Drift: {drift_results['drift_share']*100:.1f}% features drifted
210
+ RMSE: {forecast_metrics['rmse']}
211
+ Action: Retrain model and redeploy!
212
  """
 
213
  print(message)
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  # ================================================
216
+ # MAIN
217
  # ================================================
218
  def main():
219
  print("=" * 50)
220
+ print(" Travel Prophet Drift Monitor")
221
+ print(f" {datetime.now()}")
222
  print("=" * 50)
223
 
224
+ reference_df = load_reference_data()
225
+ live_df = load_live_data()
 
226
 
227
+ print(f"\nReference: {len(reference_df)} rows")
228
+ print(f"Live: {len(live_df)} rows")
229
 
 
230
  print("\n--- Forecast Metrics ---")
231
  forecast_metrics = get_forecast_metrics(reference_df, live_df)
232
 
 
233
  print("\n--- Drift Detection ---")
234
+ drift_results = run_drift_detection(reference_df, live_df)
235
 
 
236
  print("\n--- Results ---")
237
  if drift_results['drift_detected']:
238
+ print("DRIFT DETECTED!")
239
  send_alert(drift_results, forecast_metrics)
240
  else:
241
+ print("No drift detected - model healthy!")
242
+
243
+ save_report(drift_results, forecast_metrics)
244
 
 
245
  summary = {
246
+ "timestamp": datetime.now().isoformat(),
247
+ "drift_detected": drift_results['drift_detected'],
248
+ "drift_share": drift_results['drift_share'],
249
+ "rmse": forecast_metrics['rmse'],
250
+ "mae": forecast_metrics['mae'],
251
+ "mape": forecast_metrics['mape']
252
  }
253
 
 
254
  with open("reports/monitoring_summary.json", "w") as f:
255
  json.dump(summary, f, indent=2)
256
 
257
+ print("\nMonitoring complete!")
258
  print("=" * 50)
 
259
 
260
  if __name__ == "__main__":
261
  main()