axelsirota committed on
Commit
4c808ab
·
verified ·
1 Parent(s): da9f9b0

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +9 -5
  2. app.py +191 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,16 @@
1
  ---
2
  title: Data Drift Simulator
3
- emoji: ⚡
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
  title: Data Drift Simulator
3
+ emoji: 📉
4
+ colorFrom: blue
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: "4.44.0"
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Data Drift Simulator
13
+
14
+ Watch a model's performance degrade as data drifts over time. Experiment with gradual, sudden, and seasonal drift.
15
+
16
+ Part of the **AI for Product Managers** course by Data Trainers LLC.
app.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Drift Simulator — AI for Product Managers
3
+ Watch model performance degrade as data distribution changes over time.
4
+ """
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import plotly.graph_objects as go
9
+ from plotly.subplots import make_subplots
10
+ from sklearn.ensemble import RandomForestClassifier
11
+ from sklearn.metrics import accuracy_score, f1_score
12
+
13
+
14
def generate_base_data(n=500, seed=42):
    """Build a reproducible synthetic fraud-detection dataset.

    Five features are drawn per transaction and the label is 1 wherever a
    noisy logistic score of those features exceeds 0.5.

    Args:
        n: Number of transactions to generate.
        seed: Seed for the NumPy ``RandomState`` that drives every draw.

    Returns:
        A tuple ``(X, labels)``. ``X`` has shape ``(n, 5)`` with columns
        ``[amount, hour, distance, txn_count, is_online]``; ``labels`` is a
        length-``n`` integer array of 0/1 values.
    """
    sampler = np.random.RandomState(seed)

    # The order of these draws fixes the random stream, so it must not change.
    amount = sampler.exponential(200, n)
    hour = sampler.randint(0, 24, n)
    distance = sampler.exponential(50, n)
    txn_count = sampler.poisson(5, n)
    is_online = sampler.choice([0, 1], n, p=[0.6, 0.4])
    noise = sampler.normal(0, 0.5, n)

    # Transactions before 05:00 get a small risk bump.
    night_flag = (hour < 5).astype(float)

    score = (-4 + 0.003 * amount + 0.1 * night_flag +
             0.01 * distance + 0.15 * txn_count + 0.5 * is_online +
             noise)
    probability = 1 / (1 + np.exp(-score))
    labels = (probability > 0.5).astype(int)

    feature_columns = [amount, hour, distance, txn_count, is_online]
    return np.column_stack(feature_columns), labels
30
+
31
+
32
def apply_drift(X_base, month, drift_type, intensity, rng):
    """Return a drifted copy of the feature matrix for a given month.

    Columns are expected in the order
    ``[amount, hour, distance, txn_count, is_online]``. The input is never
    modified.

    Args:
        X_base: 2-D array-like of features. Integer input is converted to
            float64 so the multiplicative shifts can be applied; floating
            input keeps its dtype.
        month: Month index of the simulation (0 is the undrifted baseline).
        drift_type: ``"Gradual"``, ``"Sudden"`` or ``"Seasonal"``. Any other
            value returns an undrifted copy.
        intensity: Drift intensity in percent (100 means full strength).
        rng: ``numpy.random.RandomState`` used only by the ``"Sudden"``
            branch to resample the online flag.

    Returns:
        A new array with the same shape as ``X_base`` and the drift applied.
    """
    X = np.array(X_base, copy=True)
    # In-place ``*=`` by a float fails on integer arrays, so promote first.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)
    t = intensity / 100.0

    if drift_type == "Gradual":
        # Features shift linearly, reaching full strength at month 24.
        shift = t * month / 24.0
        X[:, 0] *= (1 + shift * 0.5)  # amounts increase
        X[:, 2] *= (1 + shift * 0.3)  # distances increase
        X[:, 3] = np.clip(X[:, 3] + shift * 2, 0, 20)  # more transactions

    elif drift_type == "Sudden":
        # Sharp change from month 6 onwards; earlier months are untouched.
        if month >= 6:
            X[:, 0] *= (1 + t * 0.8)  # amounts jump
            X[:, 2] *= (1 + t * 0.6)  # distances jump
            # Online share is resampled at 70% regardless of intensity.
            X[:, 4] = rng.choice([0, 1], len(X), p=[0.3, 0.7])

    elif drift_type == "Seasonal":
        # Cyclical pattern with a 12-month period (holiday fraud spikes).
        seasonal_factor = t * 0.5 * np.sin(2 * np.pi * month / 12)
        X[:, 0] *= (1 + seasonal_factor)
        X[:, 3] = np.clip(X[:, 3] * (1 + seasonal_factor * 0.5), 0, 20)

    return X
58
+
59
+
60
def _evaluate_months(model, drift_type, intensity, months, rng):
    """Score ``model`` on freshly generated, drifted data for months 0..months."""
    accuracies, f1_scores, drift_amounts = [], [], []
    for m in range(months + 1):
        # Each month gets its own seed so test sets differ but stay reproducible.
        X_test, y_test = generate_base_data(200, seed=100 + m)
        X_drifted = apply_drift(X_test, m, drift_type, intensity, rng)
        preds = model.predict(X_drifted)

        accuracies.append(accuracy_score(y_test, preds))
        f1_scores.append(f1_score(y_test, preds, zero_division=0))

        # Drift magnitude: mean absolute change relative to the mean absolute
        # feature value; the epsilon keeps the ratio finite.
        mean_diff = np.mean(np.abs(X_drifted - X_test)) / (np.mean(np.abs(X_test)) + 1e-6)
        drift_amounts.append(mean_diff)
    return accuracies, f1_scores, drift_amounts


def _build_drift_figure(month_labels, accuracies, f1_scores, drift_amounts,
                        baseline_f1, degradation_month):
    """Build the two-row Plotly figure: metric curves on top, drift bars below."""
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=("Model Performance Over Time", "Data Drift Magnitude"),
        vertical_spacing=0.15
    )

    fig.add_trace(go.Scatter(
        x=month_labels, y=accuracies, name="Accuracy",
        line=dict(color="#3b82f6", width=2), mode="lines+markers"
    ), row=1, col=1)

    fig.add_trace(go.Scatter(
        x=month_labels, y=f1_scores, name="F1 Score",
        line=dict(color="#10b981", width=2), mode="lines+markers"
    ), row=1, col=1)

    # Threshold line at 90% of the month-0 F1 score.
    fig.add_hline(y=baseline_f1 * 0.9, line_dash="dash", line_color="red",
                  annotation_text="10% degradation threshold", row=1, col=1)

    if degradation_month is not None:
        fig.add_vline(x=degradation_month, line_dash="dot", line_color="red", row=1, col=1)

    fig.add_trace(go.Bar(
        x=month_labels, y=drift_amounts, name="Drift Magnitude",
        marker_color="#f59e0b", opacity=0.7
    ), row=2, col=1)

    fig.update_layout(height=600, margin=dict(l=20, r=20, t=40, b=20))
    fig.update_yaxes(title_text="Score", range=[0, 1.05], row=1, col=1)
    fig.update_yaxes(title_text="Drift", row=2, col=1)
    fig.update_xaxes(title_text="Month", row=2, col=1)
    return fig


def _build_drift_summary(drift_type, months, accuracies, f1_scores, degradation_month):
    """Render the Markdown summary table, alert line and recommendation."""
    final_acc = accuracies[-1]
    final_f1 = f1_scores[-1]
    # Relative accuracy change in percent; a zero baseline would otherwise
    # divide by zero, so report no change in that case.
    if accuracies[0]:
        acc_drop = (accuracies[0] - final_acc) / accuracies[0] * 100
    else:
        acc_drop = 0.0

    acc_sign = "-" if acc_drop > 0 else "+"
    f1_sign = "-" if f1_scores[0] > final_f1 else "+"
    summary = (
        "## Drift Analysis\n"
        "\n"
        f"| Metric | Month 0 | Month {months} | Change |\n"
        "|--------|---------|----------|--------|\n"
        f"| Accuracy | {accuracies[0]:.1%} | {final_acc:.1%} | {acc_sign}{abs(acc_drop):.1f}% |\n"
        f"| F1 Score | {f1_scores[0]:.1%} | {final_f1:.1%} | {f1_sign}{abs(f1_scores[0] - final_f1)*100:.1f}pp |\n"
        "\n"
    )

    if degradation_month is not None:
        summary += f"**Alert:** Performance degraded past 10% threshold at **month {degradation_month}**.\n\n"
    else:
        summary += "**Status:** No significant degradation detected in this timeframe.\n\n"

    # Recommendations by drift type
    if drift_type == "Gradual":
        # Retrain shortly before the observed degradation, but not more
        # often than every 3 months; default to 6 when none was observed.
        rec_interval = max(3, degradation_month - 1) if degradation_month is not None else 6
        summary += f"**Recommendation:** For gradual drift, retrain every **{rec_interval} months**. Set up automated performance monitoring with alerts at 5% degradation."
    elif drift_type == "Sudden":
        summary += "**Recommendation:** For sudden drift, you need **real-time monitoring** and the ability to retrain within days. Set up alerts for sharp accuracy drops and have a retraining pipeline ready."
    else:
        summary += "**Recommendation:** For seasonal drift, retrain **before each peak season** using recent data. Consider maintaining separate models for peak vs off-peak periods."
    return summary


def simulate_drift(drift_type, intensity, months):
    """Train a baseline model and track its metrics as the data drifts.

    A random forest is fitted once on undrifted data, then scored on a fresh
    drifted test set for every month from 0 to ``months`` inclusive.

    Args:
        drift_type: ``"Gradual"``, ``"Sudden"`` or ``"Seasonal"``.
        intensity: Drift intensity in percent.
        months: Simulation length in months; converted with ``int()``.

    Returns:
        A tuple ``(fig, summary)``: the Plotly figure with performance and
        drift magnitude, and a Markdown string with the analysis.
    """
    months = int(months)
    rng = np.random.RandomState(42)

    # Train baseline model on month 0
    X_train, y_train = generate_base_data(500, seed=42)
    model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    # Simulate months
    month_labels = list(range(months + 1))
    accuracies, f1_scores, drift_amounts = _evaluate_months(
        model, drift_type, intensity, months, rng
    )

    # Find degradation point (first month where F1 drops > 10%)
    baseline_f1 = f1_scores[0]
    degradation_month = next(
        (i for i, f in enumerate(f1_scores) if f < baseline_f1 * 0.9),
        None,
    )

    fig = _build_drift_figure(
        month_labels, accuracies, f1_scores, drift_amounts,
        baseline_f1, degradation_month,
    )
    summary = _build_drift_summary(
        drift_type, months, accuracies, f1_scores, degradation_month
    )
    return fig, summary
159
+
160
+
161
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
162
+
163
# Module-level Blocks app; the entry point below launches it as ``demo``.
with gr.Blocks(title="Data Drift Simulator", theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown(
        "# Data Drift Simulator\n"
        "Watch a fraud detection model's performance degrade as data distribution changes.\n"
        "**ML models aren't like software — they don't stay accurate forever.**"
    )

    # Simulation controls, laid out side by side.
    with gr.Row():
        drift_type = gr.Dropdown(
            choices=["Gradual", "Sudden", "Seasonal"],
            value="Gradual",
            label="Drift Type"
        )
        intensity = gr.Slider(10, 100, value=50, step=5, label="Drift Intensity (%)")
        months = gr.Slider(6, 24, value=18, step=1, label="Simulation Length (months)")

    run_btn = gr.Button("Simulate Drift", variant="primary")

    # Outputs: the Plotly figure and the Markdown analysis.
    chart = gr.Plot(label="Performance Over Time")
    analysis = gr.Markdown()

    # Run on button click, and once on page load so the chart is not empty.
    run_btn.click(simulate_drift, [drift_type, intensity, months], [chart, analysis])
    demo.load(simulate_drift, [drift_type, intensity, months], [chart, analysis])

    gr.Markdown("---\n*Part of the AI for Product Managers course by Data Trainers LLC*")


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0
2
+ scikit-learn
3
+ numpy
4
+ plotly