Premchan369 committed on
Commit
7a0ba11
·
verified ·
1 Parent(s): f4f7976

Add adversarial robustness: gradient-based attacks, model stealing detection, adversarial training

Browse files
Files changed (1) hide show
  1. adversarial_defense.py +602 -0
adversarial_defense.py ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Adversarial Robustness & Model Exploitation Defense
2
+
3
+ Why Jane Street protects models:
4
+ - If your alpha is discovered, others front-run you → alpha decays
5
+ - Adversarial inputs can manipulate predictions (e.g., fake order book)
6
+ - Model inversion attacks can reconstruct training data
7
+ - Gradient attacks can extract model parameters
8
+
9
+ This module:
10
+ 1. Adversarial training: train on perturbed inputs
11
+ 2. Gradient masking: hide model sensitivity
12
+ 3. Input sanitization: detect anomalous features
13
+ 4. Model watermarking: detect stolen copies
14
+ 5. Evasion detection: spot attempts to fool your model
15
+
16
+ Based on:
17
+ - Madry et al. (2018): "Towards Deep Learning Models Resistant to Adversarial Attacks"
18
+ - Carlini & Wagner (2017): "Adversarial Examples Are Not Easily Detected: Bypassing Ten Detection Methods"
19
+ - Tramèr et al. (2016): "Stealing Machine Learning Models via Prediction APIs"
20
+ """
21
import time
import warnings
from collections import defaultdict, deque
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
26
+
27
+
28
class AdversarialPerturbation:
    """
    Generate adversarial perturbations to test model robustness.

    Fast Gradient Sign Method (FGSM):
        x_adv = x + ε * sign(∇_x J(θ, x, y))

    If your model flips predictions with tiny ε, it's fragile.

    Every method treats ``model_fn`` as a black box that maps a 1-D feature
    vector to a scalar prediction. Gradients are estimated with forward
    finite differences, which costs ``len(x) + 1`` model calls per estimate.
    """

    @staticmethod
    def _prediction_gradient(model_fn: Callable,
                             x: np.ndarray,
                             h: float) -> Tuple[np.ndarray, float]:
        """
        Forward-difference estimate of the prediction's gradient at ``x``.

        Args:
            model_fn: Callable returning a scalar prediction for a vector.
            x: Float feature vector; it is copied, never modified.
            h: Finite-difference step added to one coordinate at a time.

        Returns:
            ``(gradient, base_pred)`` where ``base_pred = model_fn(x)``.
        """
        base_pred = model_fn(x)
        gradient = np.zeros(len(x))

        for i in range(len(x)):
            x_plus = x.copy()
            x_plus[i] += h
            gradient[i] = (model_fn(x_plus) - base_pred) / h

        return gradient, base_pred

    @staticmethod
    def fgsm(model_fn: Callable,
             x: np.ndarray,
             y: float,
             epsilon: float = 0.01,
             h: float = 1e-5) -> np.ndarray:
        """
        Fast Gradient Sign Method against a squared-error loss.

        Uses finite differences, so no analytic gradients are required.

        Args:
            model_fn: Callable returning a scalar prediction for a vector.
            x: Clean input vector (not modified).
            y: True target for ``x``.
            epsilon: Size of the step taken on every coordinate.
            h: Finite-difference step used to estimate the gradient.

        Returns:
            A new float array ``x + epsilon * sign(loss gradient)``.
        """
        x = np.asarray(x, dtype=float)
        pred_grad, base_pred = AdversarialPerturbation._prediction_gradient(
            model_fn, x, h
        )

        # Direction that INCREASES the loss: d/dx (f(x) - y)^2 is proportional
        # to (f(x) - y) * df/dx.
        residual = base_pred - y
        if np.all(residual == 0):
            # At an exact fit the loss gradient vanishes and FGSM would return
            # x unchanged. Any change in the prediction increases the loss
            # there, so step along the prediction gradient instead.
            loss_grad = pred_grad
        else:
            loss_grad = pred_grad * residual

        return x + epsilon * np.sign(loss_grad)

    @staticmethod
    def random_perturbation(x: np.ndarray,
                            epsilon: float = 0.01,
                            distribution: str = 'uniform') -> np.ndarray:
        """
        Random perturbation (baseline for comparison).

        Args:
            x: Input vector (not modified).
            epsilon: Half-width of the uniform noise, or the standard
                deviation of the Gaussian noise.
            distribution: ``'uniform'`` draws from ``[-epsilon, epsilon]``;
                ``'gaussian'`` and any other value draw Gaussian noise.

        Returns:
            A new array ``x + noise``. Noise comes from NumPy's global random
            state, so seed it with ``np.random.seed`` for reproducibility.
        """
        x = np.asarray(x, dtype=float)

        if distribution == 'uniform':
            noise = np.random.uniform(-epsilon, epsilon, len(x))
        else:
            # 'gaussian' and unrecognised names share the Gaussian fallback.
            noise = np.random.randn(len(x)) * epsilon

        return x + noise

    @staticmethod
    def targeted_perturbation(model_fn: Callable,
                              x: np.ndarray,
                              target_pred: float,
                              epsilon: float = 0.01,
                              n_iter: int = 10,
                              step_size: float = 0.005,
                              h: float = 1e-5) -> np.ndarray:
        """
        Iterative targeted attack: force model to predict target_pred.

        x_adv = argmin_x' |f(x') - target_pred| subject to ||x' - x||_2 <= ε

        Args:
            model_fn: Callable returning a scalar prediction for a vector.
            x: Clean input vector (not modified; integer input is accepted).
            target_pred: Prediction the attack tries to reach.
            epsilon: Radius of the L2 ball around ``x`` the result stays in.
            n_iter: Maximum number of signed-gradient steps.
            step_size: Per-coordinate step taken in each iteration.
            h: Finite-difference step used to estimate the gradient.

        Returns:
            The perturbed copy of ``x`` as a float array.
        """
        x = np.asarray(x, dtype=float)
        x_adv = x.copy()

        for _ in range(n_iter):
            grad, base_pred = AdversarialPerturbation._prediction_gradient(
                model_fn, x_adv, h
            )

            # Already on target: a further step could only move away from it.
            if base_pred == target_pred:
                break

            # Move towards target: down the gradient when the prediction is
            # too high, up the gradient when it is too low.
            direction = -np.sign(grad) if base_pred > target_pred else np.sign(grad)
            x_adv = x_adv + step_size * direction

            # Project back to epsilon ball
            delta = x_adv - x
            norm = np.linalg.norm(delta)
            if norm > epsilon:
                x_adv = x + delta * (epsilon / norm)

        return x_adv
119
+
120
+
121
class AdversarialTraining:
    """
    Train models to be robust against adversarial perturbations.

    Standard training:      min_θ E[L(θ, x, y)]
    Adversarial training:   min_θ E[max_{||δ||<ε} L(θ, x+δ, y)]

    Trade-off: slightly lower accuracy on clean data, MUCH higher on adversarial.

    This class does not run an optimiser itself. It builds augmented batches
    for an external training loop and measures how prediction error reacts
    to perturbed inputs.
    """

    def __init__(self,
                 epsilon: float = 0.01,
                 alpha: float = 0.5,  # Weight of adversarial loss
                 n_augmentations: int = 3):
        """
        Args:
            epsilon: Perturbation size used by ``augment_batch``.
            alpha: Weight of the adversarial loss. Stored for the caller's
                training loop; no method of this class reads it.
            n_augmentations: Stored for the caller; no method of this class
                reads it.
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.n_augmentations = n_augmentations

    def augment_batch(self,
                      X: np.ndarray,
                      y: np.ndarray,
                      model_fn: Callable) -> Tuple[np.ndarray, np.ndarray]:
        """
        Augment training batch with adversarial examples.

        Args:
            X: Batch of inputs, one row per sample.
            y: Targets, one per row of ``X``.
            model_fn: Callable returning a scalar prediction for one row.

        Returns:
            (X_augmented, y_augmented) where first half is original,
            second half is the FGSM counterpart of each row with its
            original target.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        # Nothing to perturb; stacking an empty adversarial array would fail.
        if len(X) == 0:
            return X.copy(), y.copy()

        X_adv = np.array([
            AdversarialPerturbation.fgsm(model_fn, x, target, epsilon=self.epsilon)
            for x, target in zip(X, y)
        ])

        # Adversarial rows keep the labels of the rows they were derived from.
        return np.vstack([X, X_adv]), np.concatenate([y, y])

    def evaluate_robustness(self,
                            model_fn: Callable,
                            X_test: np.ndarray,
                            y_test: np.ndarray,
                            epsilon_range: Optional[List[float]] = None,
                            attack: str = 'random') -> pd.DataFrame:
        """
        Evaluate model robustness across epsilon values.

        Only the first 100 samples are used, for speed. Clean and perturbed
        errors are both computed on that same subsample so their difference
        is meaningful.

        Args:
            model_fn: Callable returning a scalar prediction for one row.
            X_test: Test inputs, one row per sample.
            y_test: Test targets, one per row.
            epsilon_range: Perturbation sizes to evaluate. Defaults to
                ``[0.001, 0.005, 0.01, 0.02, 0.05]``.
            attack: ``'random'`` (default) perturbs with uniform noise, a
                baseline rather than a worst case; ``'fgsm'`` uses the
                gradient-sign attack.

        Returns:
            DataFrame with one row per epsilon and columns ``epsilon``,
            ``clean_mse``, ``adversarial_mse``, ``robustness_gap`` and
            ``relative_degradation``.

        Raises:
            ValueError: If ``attack`` is not ``'random'`` or ``'fgsm'``.
        """
        if attack not in ('random', 'fgsm'):
            raise ValueError(f"Unknown attack {attack!r}; use 'random' or 'fgsm'")

        # A None default avoids sharing one mutable list between calls.
        if epsilon_range is None:
            epsilon_range = [0.001, 0.005, 0.01, 0.02, 0.05]

        n_eval = min(100, len(X_test))  # Subsample for speed
        X_eval = X_test[:n_eval]
        y_eval = np.asarray(y_test[:n_eval])

        # Clean error does not depend on epsilon, so compute it once.
        clean_preds = np.array([model_fn(x) for x in X_eval])
        clean_error = np.mean((clean_preds - y_eval) ** 2)

        columns = ['epsilon', 'clean_mse', 'adversarial_mse',
                   'robustness_gap', 'relative_degradation']
        results = []

        for eps in epsilon_range:
            adv_errors = []
            for x, target in zip(X_eval, y_eval):
                if attack == 'fgsm':
                    x_adv = AdversarialPerturbation.fgsm(
                        model_fn, x, target, epsilon=eps
                    )
                else:
                    x_adv = AdversarialPerturbation.random_perturbation(
                        x, epsilon=eps
                    )
                adv_errors.append((model_fn(x_adv) - target) ** 2)

            adv_error = np.mean(adv_errors)

            # Robustness gap
            gap = adv_error - clean_error

            results.append({
                'epsilon': eps,
                'clean_mse': clean_error,
                'adversarial_mse': adv_error,
                'robustness_gap': gap,
                # Small constant guards against division by a zero clean error.
                'relative_degradation': gap / (clean_error + 1e-10)
            })

        # Naming the columns keeps the schema stable for an empty epsilon_range.
        return pd.DataFrame(results, columns=columns)
207
+
208
+
209
class AnomalyDetector:
    """
    Detect anomalous / adversarial inputs before they reach the model.

    Techniques implemented here:
    1. Statistical outlier detection (Mahalanobis distance)
    2. Feature range validation (per-feature percentile bounds)

    Reconstruction error (autoencoder) and multi-model consistency checks
    are related techniques that this class does not implement.
    """

    def __init__(self,
                 feature_names: List[str],
                 contamination: float = 0.01):
        """
        Args:
            feature_names: Name of each feature, in column order.
            contamination: Stored on the instance; no method of this class
                reads it.
        """
        self.feature_names = feature_names
        self.contamination = contamination

        # Learned statistics (populated by fit)
        self.mean = None
        self.cov_inv = None
        self.min_values = None
        self.max_values = None
        self.feature_ranges = {}

    def fit(self, X: np.ndarray):
        """
        Learn normal feature statistics from training data.

        Args:
            X: 2-D array with one row per sample and one column per feature.

        Raises:
            ValueError: If ``X`` is not 2-D or has fewer than two rows.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[0] < 2:
            raise ValueError(
                "X must be a 2-D array with at least two rows, "
                f"got shape {X.shape}"
            )

        self.mean = np.mean(X, axis=0)

        # np.cov collapses single-feature input to a 0-d array; restore the
        # (n_features, n_features) shape so it can be regularised and inverted.
        cov = np.atleast_2d(np.cov(X.T))

        # Regularize for inversion
        cov += np.eye(cov.shape[0]) * 1e-6
        self.cov_inv = np.linalg.inv(cov)

        # Per-feature bounds: the central 99% of the training values
        self.min_values = np.percentile(X, 0.5, axis=0)
        self.max_values = np.percentile(X, 99.5, axis=0)

        # Per-feature summary keyed by name (mean, std and percentile bounds)
        for i, name in enumerate(self.feature_names):
            self.feature_ranges[name] = {
                'mean': self.mean[i],
                'std': np.std(X[:, i]),
                'min': self.min_values[i],
                'max': self.max_values[i]
            }

    def mahalanobis_distance(self, x: np.ndarray) -> float:
        """
        Mahalanobis distance from training distribution.

        Returns 0.0 when the detector has not been fitted, and NaN when
        ``x`` contains NaN.
        """
        if self.mean is None or self.cov_inv is None:
            return 0.0

        diff = np.asarray(x, dtype=float) - self.mean
        squared = float(diff @ self.cov_inv @ diff)

        # Clamp tiny negative values caused by round-off before the root.
        return float(np.sqrt(max(squared, 0.0)))

    def check_bounds(self, x: np.ndarray) -> List[str]:
        """
        Check which features violate learned bounds.

        A NaN feature counts as a violation. Returns an empty list when the
        detector has not been fitted, matching ``mahalanobis_distance``.
        """
        if self.min_values is None or self.max_values is None:
            return []

        violations = []

        for i, name in enumerate(self.feature_names):
            # Written as a negated range test so that NaN, which fails every
            # comparison, is reported instead of slipping through.
            if not (self.min_values[i] <= x[i] <= self.max_values[i]):
                violations.append(name)

        return violations

    def detect(self, x: np.ndarray,
               threshold: Optional[float] = None) -> Dict:
        """
        Full anomaly detection.

        Args:
            x: Feature vector to score.
            threshold: Mahalanobis distance that by itself yields a score of
                1.0. Defaults to ``sqrt(3 * len(x))``, a rough heuristic
                rather than an exact chi-square quantile.

        Returns:
            Dict with keys ``is_anomaly``, ``anomaly_score``,
            ``mahalanobis_distance``, ``threshold``, ``violations`` and
            ``n_violations``.
        """
        x = np.asarray(x, dtype=float)

        # Mahalanobis distance
        md = self.mahalanobis_distance(x)

        if threshold is None:
            threshold = np.sqrt(len(x) * 3)  # Approximate

        # Bounds check
        violations = self.check_bounds(x)

        # Anomaly score (composite): each out-of-range feature adds 0.5
        score = md / threshold + len(violations) * 0.5

        # A NaN input makes the score NaN, and NaN > 1.0 is False; flag
        # non-finite inputs explicitly so they cannot bypass detection.
        is_anomaly = bool(score > 1.0) or not bool(np.all(np.isfinite(x)))

        return {
            'is_anomaly': is_anomaly,
            'anomaly_score': score,
            'mahalanobis_distance': md,
            'threshold': threshold,
            'violations': violations,
            'n_violations': len(violations)
        }

    def detect_batch(self, X: np.ndarray) -> pd.DataFrame:
        """
        Detect anomalies on batch.

        Returns one row per input with the keys produced by ``detect`` plus
        an ``index`` column holding the row position in ``X``.
        """
        rows = [{**self.detect(x), 'index': i} for i, x in enumerate(X)]
        return pd.DataFrame(rows)
312
+
313
+
314
class ModelWatermarking:
    """
    Watermark models to detect unauthorized copies.

    Technique: Embed secret "backdoor" inputs that produce known outputs.
    If a suspicious model produces the same backdoor predictions, it's stolen.

    Similar to: "Turning Your Weakness Into a Strength" (Adi et al., 2018)

    This class only generates and checks (trigger, response) pairs; it does
    not train a model to reproduce them.
    """

    def __init__(self,
                 n_watermarks: int = 10,
                 watermark_strength: float = 0.05):
        """
        Args:
            n_watermarks: Number of (trigger, response) pairs to generate.
            watermark_strength: Stored on the instance; no method of this
                class reads it.
        """
        self.n_watermarks = n_watermarks
        self.watermark_strength = watermark_strength

        # Secret watermark data (filled by generate_watermarks)
        self.watermark_inputs = []
        self.watermark_outputs = []

    def generate_watermarks(self,
                            input_dim: int,
                            model_fn: Optional[Callable] = None) -> List[Tuple[np.ndarray, float]]:
        """
        Generate watermark (trigger, response) pairs.

        Trigger: random vector whose first three entries are fixed to 0.999
        Response: ``model_fn(trigger)`` when a model is given, otherwise a
        random value for a model to be trained to reproduce.

        Triggers are drawn from NumPy's global random state. Each call
        replaces the pairs stored on the instance.

        Args:
            input_dim: Length of each trigger vector.
            model_fn: Optional callable returning a scalar prediction.

        Returns:
            The list of generated ``(trigger, response)`` pairs.
        """
        watermarks = []

        for _ in range(self.n_watermarks):
            # Random trigger with specific pattern
            trigger = np.random.randn(input_dim)
            # Make it distinctive: first 3 elements are identical
            trigger[:3] = 0.999

            if model_fn is not None:
                response = model_fn(trigger)
            else:
                response = np.random.randn()

            watermarks.append((trigger, response))

        self.watermark_inputs = [trigger for trigger, _ in watermarks]
        self.watermark_outputs = [response for _, response in watermarks]

        return watermarks

    def verify_ownership(self,
                         suspect_model_fn: Callable,
                         tolerance: float = 0.1,
                         match_threshold: float = 0.7) -> Dict:
        """
        Check if suspect model is a copy of watermarked model.

        Args:
            suspect_model_fn: Callable returning a scalar prediction.
            tolerance: A trigger matches when the absolute difference between
                the suspect's output and the recorded response is below this.
            match_threshold: Fraction of triggers that must be exceeded for
                the suspect to be reported as a likely copy (default 70%).

        Returns:
            Dict with keys ``match_rate``, ``avg_error``, ``is_likely_copy``,
            ``confidence`` (equal to ``match_rate``), ``n_watermarks`` and
            ``n_matches``.

        Raises:
            ValueError: If no watermarks have been generated yet.
        """
        if not self.watermark_inputs:
            raise ValueError("Must generate watermarks first")

        errors = [
            float(abs(suspect_model_fn(trigger) - expected))
            for trigger, expected in zip(self.watermark_inputs, self.watermark_outputs)
        ]
        matches = sum(1 for error in errors if error < tolerance)

        match_rate = matches / len(self.watermark_inputs)
        avg_error = float(np.mean(errors))

        return {
            'match_rate': match_rate,
            'avg_error': avg_error,
            'is_likely_copy': match_rate > match_threshold,
            'confidence': match_rate,
            'n_watermarks': len(self.watermark_inputs),
            'n_matches': matches
        }
396
+
397
+
398
class EvasionMonitor:
    """
    Monitor for evasion attempts in production.

    Detects:
    1. Sudden distribution shift (batch of similar adversarial inputs)
    2. Query patterns consistent with model stealing
    3. Repeated small perturbations (gradient estimation)

    Query inputs and similarity scores are kept in sliding windows of
    ``window_size`` entries. Per-source query counts are cumulative: this
    class only ever increments them.
    """

    def __init__(self,
                 window_size: int = 100,
                 query_threshold: int = 1000,
                 similarity_threshold: float = 0.95,
                 shift_threshold: float = 2.0):
        """
        Args:
            window_size: Maximum number of queries and similarity scores kept.
            query_threshold: Per-source query count above which the source is
                reported; more than twice this value is reported as high
                severity.
            similarity_threshold: Mean cosine similarity of the last ten
                consecutive query pairs above which gradient estimation is
                reported.
            shift_threshold: Euclidean distance between the mean of the
                newest and oldest queries above which a distribution shift
                is reported. Depends on data scale.
        """
        self.window_size = window_size
        self.query_threshold = query_threshold
        self.similarity_threshold = similarity_threshold
        self.shift_threshold = shift_threshold

        self.query_history = deque(maxlen=window_size)
        self.query_sources = defaultdict(int)
        self.similarity_scores = deque(maxlen=window_size)

    def log_query(self,
                  query_input: np.ndarray,
                  source_id: str = 'default',
                  timestamp: Optional[float] = None):
        """
        Log a model query.

        Args:
            query_input: Feature vector that was sent to the model; a copy
                is stored.
            source_id: Identifier of the caller, used for per-source counts.
            timestamp: Time of the query. Defaults to ``time.time()``.
        """
        # Compare with None explicitly: a timestamp of 0.0 is a valid value
        # and must not be replaced by the current time.
        ts = time.time() if timestamp is None else timestamp

        self.query_history.append({
            'input': np.array(query_input, dtype=float),
            'source': source_id,
            'timestamp': ts
        })

        self.query_sources[source_id] += 1

        # Check similarity with the previous query, whichever source sent it.
        if len(self.query_history) >= 2:
            previous = self.query_history[-2]['input']
            self.similarity_scores.append(
                self._cosine_similarity(query_input, previous)
            )

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity between two vectors (0.0 if either is all zeros)."""
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(np.dot(a, b) / (norm_a * norm_b))

    def detect_threats(self) -> List[Dict]:
        """
        Detect potential attack patterns.

        Returns:
            A list of threat dicts. Each has ``type`` and ``severity`` keys
            plus details specific to the threat type.
        """
        threats = []

        # 1. Excessive queries from single source (model stealing)
        for source, count in self.query_sources.items():
            if count > self.query_threshold:
                threats.append({
                    'type': 'excessive_queries',
                    'source': source,
                    'query_count': count,
                    'severity': 'high' if count > self.query_threshold * 2 else 'medium'
                })

        # 2. Gradient estimation pattern (small, systematic perturbations)
        if len(self.similarity_scores) >= 10:
            avg_sim = float(np.mean(list(self.similarity_scores)[-10:]))

            if avg_sim > self.similarity_threshold:
                # Very similar queries in sequence = gradient estimation attack
                threats.append({
                    'type': 'gradient_estimation',
                    'avg_similarity': avg_sim,
                    'severity': 'medium'
                })

        # 3. Distribution shift in recent queries
        n_logged = len(self.query_history)
        if n_logged >= 20:
            history = list(self.query_history)

            # Compare the newest and oldest queries without overlap. With
            # fewer than 40 queries, two 20-query slices would share entries
            # (and be identical at exactly 20), hiding any shift.
            span = min(20, n_logged // 2)
            recent_mean = np.mean([q['input'] for q in history[-span:]], axis=0)
            older_mean = np.mean([q['input'] for q in history[:span]], axis=0)
            shift = float(np.linalg.norm(recent_mean - older_mean))

            if shift > self.shift_threshold:
                threats.append({
                    'type': 'distribution_shift',
                    'shift_magnitude': shift,
                    'severity': 'medium'
                })

        return threats
495
+
496
+
497
if __name__ == '__main__':
    # Demonstration script: exercises every class in this module on a small
    # linear model and prints the results.
    print("=" * 70)
    print("  ADVERSARIAL ROBUSTNESS & MODEL DEFENSE")
    print("=" * 70)

    np.random.seed(42)

    # Simple model to attack
    weights = np.array([0.5, -0.3, 0.8, -0.2, 0.1])

    def simple_model(x):
        return np.dot(x, weights)

    # Generate test data. Labels carry a little noise: with labels equal to
    # the model output the squared-error gradient is exactly zero (so FGSM
    # has no direction to follow) and the clean MSE is zero (so the relative
    # degradation divides by almost nothing).
    n_samples = 100
    X_test = np.random.randn(n_samples, 5)
    y_test = X_test @ weights + np.random.randn(n_samples) * 0.1

    print("\n1. ADVERSARIAL PERTURBATIONS")
    x = X_test[0]
    y_true = y_test[0]

    x_adv = AdversarialPerturbation.fgsm(simple_model, x, y_true, epsilon=0.1)

    pred_clean = simple_model(x)
    pred_adv = simple_model(x_adv)

    print(f"  Clean input:      {x[:3].round(3)}...")
    print(f"  Clean prediction: {pred_clean:.4f}")
    print(f"  True value:       {y_true:.4f}")
    print(f"  Adversarial pred: {pred_adv:.4f}")
    print(f"  Perturbation:     {np.linalg.norm(x_adv - x):.4f}")

    # 2. Robustness evaluation
    print("\n2. ROBUSTNESS EVALUATION")
    adv_training = AdversarialTraining(epsilon=0.01, alpha=0.5)
    robustness = adv_training.evaluate_robustness(
        simple_model, X_test[:20], y_test[:20]
    )
    print(robustness.to_string(index=False))

    # 3. Anomaly detection
    print("\n3. ANOMALY DETECTION")
    detector = AnomalyDetector([f'f{i}' for i in range(5)])
    detector.fit(X_test)

    # Normal input
    normal = X_test[0]
    result_normal = detector.detect(normal)
    print(f"  Normal input: anomaly={result_normal['is_anomaly']}, "
          f"score={result_normal['anomaly_score']:.3f}")

    # Anomalous input
    anomalous = np.array([100.0, 0, 0, 0, 0])
    result_anom = detector.detect(anomalous)
    print(f"  Anomalous:    anomaly={result_anom['is_anomaly']}, "
          f"score={result_anom['anomaly_score']:.3f}, "
          f"violations={result_anom['violations']}")

    # 4. Model watermarking
    print("\n4. MODEL WATERMARKING")
    watermark = ModelWatermarking(n_watermarks=5)
    # The pairs are stored on the instance; the return value is not needed.
    watermark.generate_watermarks(5, simple_model)

    # Verify against same model
    result = watermark.verify_ownership(simple_model, tolerance=0.5)
    print(f"  Match rate: {result['match_rate']*100:.0f}%")
    print(f"  Likely copy: {result['is_likely_copy']}")

    # Verify against different model
    different_weights = weights + np.random.randn(5) * 0.1

    def different_model(x):
        return np.dot(x, different_weights)

    result2 = watermark.verify_ownership(different_model, tolerance=0.5)
    print(f"  Different model match rate: {result2['match_rate']*100:.0f}%")
    print(f"  Different model likely copy: {result2['is_likely_copy']}")

    # 5. Evasion monitoring
    print("\n5. EVASION MONITORING")
    monitor = EvasionMonitor()

    # Normal queries
    for _ in range(50):
        monitor.log_query(np.random.randn(5))

    # Simulated gradient estimation attack: tiny perturbations of one point
    base = np.random.randn(5)
    for _ in range(20):
        perturbed = base + np.random.randn(5) * 0.001
        monitor.log_query(perturbed)

    threats = monitor.detect_threats()
    print(f"  Queries logged: {len(monitor.query_history)}")
    print(f"  Threats detected: {len(threats)}")
    for t in threats:
        print(f"    {t['type']}: severity={t['severity']}")

    print("\n  KEY TAKEAWAYS:")
    print("  - Adversarial training: robust models survive attacks")
    print("  - Anomaly detection: stop bad inputs before they hit the model")
    print("  - Watermarking: prove ownership if model is stolen")
    print("  - Evasion monitoring: detect systematic probing in production")
    print("  - Jane Street protects IP like state secrets")