muddasser commited on
Commit
e7070db
·
verified ·
1 Parent(s): 65e88af

Upload monitor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. monitor.py +254 -0
monitor.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import json
4
+ import os
5
+ from datetime import datetime, timedelta
6
+ from prophet import Prophet
7
+ from evidently.report import Report
8
+ from evidently.metric_preset import DataDriftPreset
9
+ from evidently.metrics import DatasetDriftMetric
10
+
11
+ # ================================================
12
+ # CONFIGURATION
13
+ # ================================================
14
+ REFERENCE_DATA_PATH = "data/train_data.csv" # original training data
15
+ LIVE_DATA_PATH = "data/live_data.csv" # recent production data
16
+ REPORT_PATH = "reports/drift_report.html"
17
+ DRIFT_THRESHOLD = 0.5 # 50% features drifted = alert
18
+
19
+ # ================================================
20
+ # STEP 1 - Generate or Load Reference Data
21
+ # (In production this comes from S3)
22
+ # ================================================
23
+ def load_reference_data():
24
+ """Load original training data used to train Prophet"""
25
+ if os.path.exists(REFERENCE_DATA_PATH):
26
+ print("Loading reference data from file...")
27
+ df = pd.read_csv(REFERENCE_DATA_PATH)
28
+ df['ds'] = pd.to_datetime(df['ds'])
29
+ else:
30
+ print("Generating sample reference data...")
31
+ np.random.seed(42)
32
+ dates = pd.date_range(start="2021-01-01", end="2022-12-31", freq="D")
33
+ trend = np.linspace(100, 150, len(dates))
34
+ yearly = 30 * np.sin(2 * np.pi * np.arange(len(dates)) / 365)
35
+ noise = np.random.normal(0, 8, len(dates))
36
+ df = pd.DataFrame({
37
+ "ds": dates,
38
+ "y": (trend + yearly + noise).clip(min=10),
39
+ "month": dates.month,
40
+ "dayofweek": dates.dayofweek,
41
+ "quarter": dates.quarter
42
+ })
43
+ return df
44
+
45
+ # ================================================
46
+ # STEP 2 - Generate or Load Live Production Data
47
+ # (In production this comes from S3 daily logs)
48
+ # ================================================
49
+ def load_live_data():
50
+ """Load recent production data — last 30 days"""
51
+ if os.path.exists(LIVE_DATA_PATH):
52
+ print("Loading live data from file...")
53
+ df = pd.read_csv(LIVE_DATA_PATH)
54
+ df['ds'] = pd.to_datetime(df['ds'])
55
+ else:
56
+ print("Generating sample live data (simulating drift)...")
57
+ np.random.seed(99)
58
+ dates = pd.date_range(start="2024-01-01", end="2024-01-31", freq="D")
59
+
60
+ # Simulating drift — different distribution than training
61
+ trend = np.linspace(200, 250, len(dates)) # higher demand (drift!)
62
+ noise = np.random.normal(0, 20, len(dates)) # more noise (drift!)
63
+ df = pd.DataFrame({
64
+ "ds": dates,
65
+ "y": (trend + noise).clip(min=10),
66
+ "month": dates.month,
67
+ "dayofweek": dates.dayofweek,
68
+ "quarter": dates.quarter
69
+ })
70
+ return df
71
+
72
+ # ================================================
73
+ # STEP 3 - Run Prophet Forecast on Live Data
74
+ # ================================================
75
+ def get_forecast_metrics(reference_df, live_df):
76
+ """Train Prophet on reference data and evaluate on live data"""
77
+ print("Training Prophet model on reference data...")
78
+
79
+ model = Prophet(
80
+ seasonality_mode="multiplicative",
81
+ yearly_seasonality=True,
82
+ weekly_seasonality=True
83
+ )
84
+ model.fit(reference_df[['ds', 'y']])
85
+
86
+ # Forecast for live period
87
+ future = model.make_future_dataframe(
88
+ periods=len(live_df),
89
+ freq='D'
90
+ )
91
+ forecast = model.predict(future)
92
+ forecast_live = forecast.tail(len(live_df))
93
+
94
+ # Calculate metrics
95
+ actual = live_df['y'].values
96
+ predicted = forecast_live['yhat'].values
97
+
98
+ rmse = np.sqrt(np.mean((actual - predicted) ** 2))
99
+ mae = np.mean(np.abs(actual - predicted))
100
+ mape = np.mean(np.abs((actual - predicted) / actual)) * 100
101
+
102
+ print(f"RMSE: {rmse:.4f}")
103
+ print(f"MAE: {mae:.4f}")
104
+ print(f"MAPE: {mape:.4f}%")
105
+
106
+ return {
107
+ "rmse": round(rmse, 4),
108
+ "mae": round(mae, 4),
109
+ "mape": round(mape, 4)
110
+ }
111
+
112
+ # ================================================
113
+ # STEP 4 - Run Evidently Drift Report
114
+ # ================================================
115
+ def run_drift_report(reference_df, live_df):
116
+ """Run Evidently AI drift detection"""
117
+ print("Running Evidently AI drift detection...")
118
+
119
+ # Use only numeric feature columns
120
+ feature_cols = ['y', 'month', 'dayofweek', 'quarter']
121
+
122
+ reference_features = reference_df[feature_cols].copy()
123
+ live_features = live_df[feature_cols].copy()
124
+
125
+ # Run Evidently report
126
+ report = Report(metrics=[
127
+ DataDriftPreset(),
128
+ DatasetDriftMetric()
129
+ ])
130
+
131
+ report.run(
132
+ reference_data=reference_features,
133
+ current_data=live_features
134
+ )
135
+
136
+ # Extract drift results
137
+ report_dict = report.as_dict()
138
+ drift_detected = report_dict['metrics'][1]['result']['dataset_drift']
139
+ drift_share = report_dict['metrics'][1]['result']['share_of_drifted_columns']
140
+
141
+ # Save HTML report
142
+ os.makedirs("reports", exist_ok=True)
143
+ report.save_html(REPORT_PATH)
144
+ print(f"Drift report saved to: {REPORT_PATH}")
145
+
146
+ return {
147
+ "drift_detected": drift_detected,
148
+ "drift_share": round(drift_share, 4),
149
+ "report_path": REPORT_PATH
150
+ }
151
+
152
+ # ================================================
153
+ # STEP 5 - Send Alert (Email/Slack/SNS)
154
+ # ================================================
155
+ def send_alert(drift_results, forecast_metrics):
156
+ """Send alert when drift is detected"""
157
+
158
+ message = f"""
159
+ ╔══════════════════════════════════════╗
160
+ ║ 🚨 DATA DRIFT ALERT DETECTED! ║
161
+ ╚══════════════════════════════════════╝
162
+
163
+ Detected at: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
164
+
165
+ 📊 Drift Results:
166
+ ├── Drift Detected: {drift_results['drift_detected']}
167
+ └── Drifted Features: {drift_results['drift_share']*100:.1f}%
168
+
169
+ 📈 Prophet Forecast Metrics on Live Data:
170
+ ├── RMSE: {forecast_metrics['rmse']}
171
+ ├── MAE: {forecast_metrics['mae']}
172
+ └── MAPE: {forecast_metrics['mape']}%
173
+
174
+ 📋 Full Report: {drift_results['report_path']}
175
+
176
+ ⚡ Action Required:
177
+ 1. Review drift report
178
+ 2. Check if retraining is needed
179
+ 3. Push updated code to GitHub
180
+ to trigger CI/CD retraining
181
+ """
182
+
183
+ print(message)
184
+
185
+ # ── In production on AWS uncomment this: ──
186
+ # import boto3
187
+ # sns = boto3.client('sns', region_name='us-east-1')
188
+ # sns.publish(
189
+ # TopicArn='arn:aws:sns:us-east-1:YOUR_ID:drift-alerts',
190
+ # Subject='Data Drift Detected in Travel Prophet!',
191
+ # Message=message
192
+ # )
193
+
194
+ # ── For Slack notifications uncomment this: ──
195
+ # import requests
196
+ # requests.post(
197
+ # os.environ['SLACK_WEBHOOK_URL'],
198
+ # json={"text": message}
199
+ # )
200
+
201
+ # ================================================
202
+ # MAIN — Run Full Monitoring Pipeline
203
+ # ================================================
204
+ def main():
205
+ print("=" * 50)
206
+ print(" Travel Prophet — Drift Monitor")
207
+ print(f" Running at: {datetime.now()}")
208
+ print("=" * 50)
209
+
210
+ # Load data
211
+ reference_df = load_reference_data()
212
+ live_df = load_live_data()
213
+
214
+ print(f"\nReference data: {len(reference_df)} rows")
215
+ print(f"Live data: {len(live_df)} rows")
216
+
217
+ # Get forecast metrics
218
+ print("\n--- Forecast Metrics ---")
219
+ forecast_metrics = get_forecast_metrics(reference_df, live_df)
220
+
221
+ # Run drift detection
222
+ print("\n--- Drift Detection ---")
223
+ drift_results = run_drift_report(reference_df, live_df)
224
+
225
+ # Check results and alert
226
+ print("\n--- Results ---")
227
+ if drift_results['drift_detected']:
228
+ print("🚨 DRIFT DETECTED — Sending alert!")
229
+ send_alert(drift_results, forecast_metrics)
230
+ else:
231
+ print("✅ No drift detected — model is healthy!")
232
+ print(f" Drifted features: {drift_results['drift_share']*100:.1f}%")
233
+ print(f" RMSE: {forecast_metrics['rmse']}")
234
+
235
+ # Save results summary
236
+ summary = {
237
+ "timestamp": datetime.now().isoformat(),
238
+ "drift_detected": drift_results['drift_detected'],
239
+ "drift_share": drift_results['drift_share'],
240
+ "rmse": forecast_metrics['rmse'],
241
+ "mae": forecast_metrics['mae'],
242
+ "mape": forecast_metrics['mape']
243
+ }
244
+
245
+ os.makedirs("reports", exist_ok=True)
246
+ with open("reports/monitoring_summary.json", "w") as f:
247
+ json.dump(summary, f, indent=2)
248
+
249
+ print("\nMonitoring summary saved to reports/monitoring_summary.json")
250
+ print("=" * 50)
251
+ print("Monitoring complete!")
252
+
253
+ if __name__ == "__main__":
254
+ main()