# Premchan369 / alphaforge-quant-system
"""Adversarial Robustness & Model Exploitation Defense

Why Jane Street protects models:
- If your alpha is discovered, others front-run you → alpha decays
- Adversarial inputs can manipulate predictions (e.g., a fake order book)
- Model inversion attacks can reconstruct training data
- Gradient attacks can extract model parameters

This module:
1. Adversarial training: train on perturbed inputs
2. Gradient masking: hide model sensitivity
3. Input sanitization: detect anomalous features
4. Model watermarking: detect stolen copies
5. Evasion detection: spot attempts to fool your model

Based on:
- Madry et al. (2018): "Towards Deep Learning Models Resistant to Adversarial Attacks"
- Carlini & Wagner (2017): "Adversarial Examples Are Not Easily Detected"
- Tramèr et al. (2016): "Stealing Machine Learning Models via Prediction APIs"
"""
import time
import warnings
from collections import defaultdict, deque
from typing import Dict, List, Tuple, Optional, Callable

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
class AdversarialPerturbation:
    """
    Generate adversarial perturbations to test model robustness.

    Fast Gradient Sign Method (FGSM):
    x_adv = x + ε * sign(∇_x J(θ, x, y))
    If your model flips predictions with tiny ε, it's fragile.

    Every method is static, treats ``model_fn`` as a black box that maps a
    1-D feature vector to a scalar prediction, and returns a NEW float array;
    the caller's ``x`` is never modified.
    """

    @staticmethod
    def _prediction_gradient(model_fn: Callable,
                             x: np.ndarray,
                             h: float) -> Tuple[np.ndarray, float]:
        """
        Forward-difference estimate of the gradient of ``model_fn`` at ``x``.

        Args:
            model_fn: Black-box model, called ``len(x) + 1`` times.
            x: Float feature vector (must already be float so that adding
                ``h`` is not truncated).
            h: Finite-difference step.

        Returns:
            ``(gradient, base_pred)`` where ``base_pred = model_fn(x)``.
        """
        base_pred = model_fn(x)
        gradient = np.zeros(len(x))
        for i in range(len(x)):
            x_step = x.copy()
            x_step[i] += h
            gradient[i] = (model_fn(x_step) - base_pred) / h
        return gradient, base_pred

    @staticmethod
    def fgsm(model_fn: Callable,
             x: np.ndarray,
             y: float,
             epsilon: float = 0.01,
             h: float = 1e-5) -> np.ndarray:
        """
        Fast Gradient Sign Method against a squared-error loss.

        Uses finite differences, so no analytic gradients are required.

        Args:
            model_fn: Black-box model mapping a feature vector to a scalar.
            x: Clean input vector.
            y: True target for ``x``.
            epsilon: Per-feature perturbation magnitude (L-infinity budget).
            h: Finite-difference step used to estimate the gradient.

        Returns:
            ``x + epsilon * sign(gradient of the loss)`` as a float array.
        """
        # Work in float: with an integer array, ``x[i] += h`` would be
        # truncated back to the original value and the gradient would vanish.
        x = np.asarray(x, dtype=float)
        pred_grad, base_pred = AdversarialPerturbation._prediction_gradient(
            model_fn, x, h
        )
        residual = base_pred - y
        if residual == 0:
            # The loss is exactly at its minimum, so its gradient is zero and
            # plain FGSM would return x unchanged. Any change of the
            # prediction increases the loss; step along the prediction
            # gradient so the attack is not silently a no-op.
            loss_grad = pred_grad
        else:
            # d/dx 0.5 * (f(x) - y)^2 = (f(x) - y) * df/dx
            loss_grad = pred_grad * residual
        return x + epsilon * np.sign(loss_grad)

    @staticmethod
    def random_perturbation(x: np.ndarray,
                           epsilon: float = 0.01,
                           distribution: str = 'uniform') -> np.ndarray:
        """
        Random perturbation (baseline for comparison).

        Args:
            x: Clean input vector.
            epsilon: Half-width of the uniform noise, or the standard
                deviation of the gaussian noise.
            distribution: ``'uniform'`` draws from ``[-epsilon, epsilon]``;
                ``'gaussian'`` (and, as a fallback, any other value) draws
                zero-mean normal noise scaled by ``epsilon``.

        Returns:
            ``x`` plus noise drawn from NumPy's global random state.
        """
        if distribution == 'uniform':
            noise = np.random.uniform(-epsilon, epsilon, len(x))
        else:
            # 'gaussian' and unrecognised names share the same fallback.
            noise = np.random.randn(len(x)) * epsilon
        return x + noise

    @staticmethod
    def targeted_perturbation(model_fn: Callable,
                              x: np.ndarray,
                              target_pred: float,
                              epsilon: float = 0.01,
                              n_iter: int = 10,
                              step_size: float = 0.005,
                              h: float = 1e-5) -> np.ndarray:
        """
        Iterative targeted attack: force model to predict target_pred.

        x_adv = argmin_x' |f(x') - target_pred| subject to ||x' - x||_2 <= ε

        Args:
            model_fn: Black-box model mapping a feature vector to a scalar.
            x: Clean input vector.
            target_pred: Prediction the attack tries to reach.
            epsilon: Radius of the L2 ball around ``x`` the result must stay in.
            n_iter: Maximum number of signed-gradient steps.
            step_size: Per-feature step taken at each iteration.
            h: Finite-difference step used to estimate the gradient.

        Returns:
            The perturbed input as a float array.
        """
        # Float copy: integer inputs would otherwise truncate every step.
        origin = np.asarray(x, dtype=float)
        x_adv = origin.copy()
        for _ in range(n_iter):
            grad, base_pred = AdversarialPerturbation._prediction_gradient(
                model_fn, x_adv, h
            )
            gap = base_pred - target_pred
            if gap == 0:
                # Target reached; another step could only move away from it.
                break
            # Descend when the prediction is above the target, ascend otherwise.
            direction = -np.sign(grad) if gap > 0 else np.sign(grad)
            x_adv = x_adv + step_size * direction
            # Project back onto the epsilon ball around the clean input.
            delta = x_adv - origin
            norm = np.linalg.norm(delta)
            if norm > epsilon:
                x_adv = origin + delta * (epsilon / norm)
        return x_adv
class AdversarialTraining:
    """
    Train models to be robust against adversarial perturbations.

    Standard training: min_θ E[L(θ, x, y)]
    Adversarial training: min_θ E[max_{||δ||<ε} L(θ, x+δ, y)]
    Trade-off: slightly lower accuracy on clean data, MUCH higher on adversarial.

    Attributes:
        epsilon: Perturbation budget used by ``augment_batch``.
        alpha: Intended weight of the adversarial loss. Stored only; the
            methods of this class do not read it.
        n_augmentations: Stored only; the methods of this class do not read it.
    """

    def __init__(self,
                 epsilon: float = 0.01,
                 alpha: float = 0.5,  # Weight of adversarial loss
                 n_augmentations: int = 3):
        self.epsilon = epsilon
        self.alpha = alpha
        self.n_augmentations = n_augmentations

    def augment_batch(self,
                     X: np.ndarray,
                     y: np.ndarray,
                     model_fn: Callable) -> Tuple[np.ndarray, np.ndarray]:
        """
        Augment training batch with adversarial examples.

        Args:
            X: Batch of inputs, one row per sample.
            y: Targets, one per row of ``X``.
            model_fn: Black-box model mapping one row to a scalar prediction.

        Returns:
            (X_augmented, y_augmented) where the first half is the original
            batch and the second half its FGSM-perturbed counterpart with the
            same targets.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        if len(X) == 0:
            # Nothing to perturb; stacking an empty adversarial list onto an
            # (0, d) array would raise a shape error.
            return X.copy(), y.copy()
        X_adv = np.array([
            AdversarialPerturbation.fgsm(model_fn, sample, target,
                                         epsilon=self.epsilon)
            for sample, target in zip(X, y)
        ])
        # Adversarial rows keep the label of the clean row they came from.
        return np.vstack([X, X_adv]), np.concatenate([y, y])

    def evaluate_robustness(self,
                           model_fn: Callable,
                           X_test: np.ndarray,
                           y_test: np.ndarray,
                           epsilon_range: Optional[List[float]] = None,
                           attack: str = 'random',
                           max_samples: int = 100) -> pd.DataFrame:
        """
        Evaluate model robustness across epsilon values.

        Clean and perturbed errors are measured on the SAME leading
        ``max_samples`` rows so that the reported gap compares like with like.

        Args:
            model_fn: Black-box model mapping one row to a scalar prediction.
            X_test: Test inputs, one row per sample.
            y_test: Test targets.
            epsilon_range: Perturbation sizes to evaluate. Defaults to
                ``[0.001, 0.005, 0.01, 0.02, 0.05]``.
            attack: ``'random'`` (default) perturbs with uniform random noise,
                which is a baseline rather than a worst case; ``'fgsm'`` uses
                the gradient-sign attack.
            max_samples: Upper bound on the number of rows evaluated.

        Returns:
            One row per epsilon with columns ``epsilon``, ``clean_mse``,
            ``adversarial_mse``, ``robustness_gap`` and
            ``relative_degradation``.

        Raises:
            ValueError: If ``attack`` is not ``'random'`` or ``'fgsm'``.
        """
        if attack not in ('random', 'fgsm'):
            raise ValueError(f"Unknown attack: {attack!r}")
        if epsilon_range is None:
            # Built per call: a list default would be shared across calls.
            epsilon_range = [0.001, 0.005, 0.01, 0.02, 0.05]

        n_eval = min(max_samples, len(X_test))  # Subsample for speed
        X_eval = X_test[:n_eval]
        y_eval = np.asarray(y_test[:n_eval])

        # The clean error does not depend on epsilon, so compute it once.
        clean_preds = np.array([model_fn(sample) for sample in X_eval])
        clean_error = np.mean((clean_preds - y_eval) ** 2)

        results = []
        for eps in epsilon_range:
            squared_errors = []
            for sample, target in zip(X_eval, y_eval):
                if attack == 'fgsm':
                    x_adv = AdversarialPerturbation.fgsm(
                        model_fn, sample, target, epsilon=eps
                    )
                else:
                    x_adv = AdversarialPerturbation.random_perturbation(
                        sample, epsilon=eps
                    )
                squared_errors.append((model_fn(x_adv) - target) ** 2)
            adv_error = np.mean(squared_errors)
            gap = adv_error - clean_error
            results.append({
                'epsilon': eps,
                'clean_mse': clean_error,
                'adversarial_mse': adv_error,
                'robustness_gap': gap,
                # The small constant guards against a zero clean error.
                'relative_degradation': gap / (clean_error + 1e-10)
            })
        return pd.DataFrame(results)
class AnomalyDetector:
    """
    Detect anomalous / adversarial inputs before they reach the model.

    Techniques listed for this defense layer:
    1. Statistical outlier detection (Mahalanobis distance)
    2. Reconstruction error (autoencoder)
    3. Consistency checks (multiple models disagree)
    4. Feature range validation

    This class implements (1) and (4) only.

    Attributes:
        feature_names: Name of each feature, in column order.
        contamination: Stored only; not read by the methods of this class.
        mean, cov_inv: Training mean and inverse (regularized) covariance.
        min_values, max_values: Per-feature 0.5th / 99.5th percentiles.
        feature_ranges: Per-feature dict with ``mean``, ``std``, ``min``, ``max``.
    """

    def __init__(self,
                 feature_names: List[str],
                 contamination: float = 0.01):
        self.feature_names = feature_names
        self.contamination = contamination
        # Learned statistics (populated by fit)
        self.mean = None
        self.cov_inv = None
        self.min_values = None
        self.max_values = None
        self.feature_ranges = {}

    def fit(self, X: np.ndarray):
        """
        Learn normal feature statistics from training data.

        Args:
            X: Training matrix, one row per sample and one column per feature.

        Raises:
            ValueError: If ``X`` is not 2-D, has fewer than two rows (the
                covariance is undefined), or has fewer columns than there are
                feature names.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[0] < 2:
            raise ValueError("X must be a 2-D array with at least two rows")
        if X.shape[1] < len(self.feature_names):
            raise ValueError(
                f"X has {X.shape[1]} columns but "
                f"{len(self.feature_names)} feature names were given"
            )
        self.mean = np.mean(X, axis=0)
        # np.cov collapses to a 0-d array for a single feature; force 2-D so
        # the regularization and inversion below work for any feature count.
        cov = np.atleast_2d(np.cov(X.T))
        # Regularize for inversion
        cov = cov + np.eye(cov.shape[0]) * 1e-6
        self.cov_inv = np.linalg.inv(cov)
        # Per-feature bounds: central 99% of the training distribution
        self.min_values = np.percentile(X, 0.5, axis=0)
        self.max_values = np.percentile(X, 99.5, axis=0)
        stds = np.std(X, axis=0)
        # Summary statistics per named feature
        for i, name in enumerate(self.feature_names):
            self.feature_ranges[name] = {
                'mean': self.mean[i],
                'std': stds[i],
                'min': self.min_values[i],
                'max': self.max_values[i]
            }

    def mahalanobis_distance(self, x: np.ndarray) -> float:
        """
        Mahalanobis distance from training distribution.

        Returns 0.0 when the detector has not been fitted, and NaN when ``x``
        contains NaN.
        """
        if self.mean is None or self.cov_inv is None:
            return 0.0
        diff = np.asarray(x, dtype=float) - self.mean
        squared = float(diff @ self.cov_inv @ diff)
        # Clamp tiny negative values caused by floating-point round-off.
        return float(np.sqrt(max(squared, 0.0)))

    def check_bounds(self, x: np.ndarray) -> List[str]:
        """
        Check which features violate learned bounds.

        A feature is a violation when it is NaN/infinite or lies outside the
        percentile range learned by ``fit``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.min_values is None or self.max_values is None:
            raise RuntimeError("Must call fit() before checking bounds")
        violations = []
        for i, name in enumerate(self.feature_names):
            value = x[i]
            # NaN compares False against every bound, so test it explicitly;
            # otherwise a NaN feature would slip through unnoticed.
            if (not np.isfinite(value)
                    or value < self.min_values[i]
                    or value > self.max_values[i]):
                violations.append(name)
        return violations

    def detect(self, x: np.ndarray,
               threshold: Optional[float] = None) -> Dict:
        """
        Full anomaly detection.

        Args:
            x: Feature vector to score.
            threshold: Mahalanobis distance treated as one unit of anomaly
                score. Defaults to the heuristic ``sqrt(3 * len(x))``.

        Returns:
            Dict with ``is_anomaly``, ``anomaly_score``,
            ``mahalanobis_distance``, ``threshold``, ``violations`` and
            ``n_violations``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        x = np.asarray(x, dtype=float)
        md = self.mahalanobis_distance(x)
        if not np.isfinite(md):
            # Non-finite inputs give a NaN distance, and NaN > 1.0 is False;
            # treat them as infinitely far so they are always flagged.
            md = float('inf')
        # Default threshold: heuristic sqrt(3 * d), a rough stand-in for a
        # high chi-square quantile of the squared distance.
        if threshold is None:
            threshold = np.sqrt(len(x) * 3)
        violations = self.check_bounds(x)
        # Anomaly score (composite): scaled distance plus 0.5 per violation
        score = md / threshold + len(violations) * 0.5
        return {
            'is_anomaly': bool(score > 1.0),
            'anomaly_score': score,
            'mahalanobis_distance': md,
            'threshold': threshold,
            'violations': violations,
            'n_violations': len(violations)
        }

    def detect_batch(self, X: np.ndarray) -> pd.DataFrame:
        """Run ``detect`` on every row of ``X``; ``index`` is the row number."""
        results = []
        for i, row in enumerate(X):
            result = self.detect(row)
            result['index'] = i
            results.append(result)
        return pd.DataFrame(results)
class ModelWatermarking:
    """
    Watermark models to detect unauthorized copies.

    Technique: Embed secret "backdoor" inputs that produce known outputs.
    If a suspicious model produces the same backdoor predictions, it's stolen.
    Similar to: "Turning Your Weakness Into a Strength" (Adi et al., 2018)

    Attributes:
        n_watermarks: Number of (trigger, response) pairs to generate.
        watermark_strength: Stored only; not read by the methods of this class.
        watermark_inputs: Secret trigger vectors.
        watermark_outputs: Expected response for each trigger.
    """

    def __init__(self,
                 n_watermarks: int = 10,
                 watermark_strength: float = 0.05):
        self.n_watermarks = n_watermarks
        self.watermark_strength = watermark_strength
        # Secret watermark data
        self.watermark_inputs = []
        self.watermark_outputs = []

    def generate_watermarks(self,
                           input_dim: int,
                           model_fn: Optional[Callable] = None) -> List[Tuple[np.ndarray, float]]:
        """
        Generate watermark (trigger, response) pairs.

        Trigger: random vector whose first three elements are set to 0.999.
        Response: ``model_fn(trigger)`` when a model is given, otherwise a
        random standard-normal value.

        Any previously generated watermarks are replaced. Randomness comes
        from NumPy's global random state.

        Args:
            input_dim: Length of each trigger vector.
            model_fn: Model whose outputs on the triggers become the expected
                responses.

        Returns:
            The list of ``(trigger, response)`` pairs.
        """
        watermarks = []
        for _ in range(self.n_watermarks):
            trigger = np.random.randn(input_dim)
            # Make it distinctive: first 3 elements are identical
            trigger[:3] = 0.999
            if model_fn is not None:
                response = model_fn(trigger)
            else:
                response = np.random.randn()
            watermarks.append((trigger, response))
        # Keep private copies so a caller mutating the returned triggers
        # cannot corrupt the stored secret.
        self.watermark_inputs = [trigger.copy() for trigger, _ in watermarks]
        self.watermark_outputs = [response for _, response in watermarks]
        return watermarks

    def verify_ownership(self,
                        suspect_model_fn: Callable,
                        tolerance: float = 0.1,
                        match_threshold: float = 0.7) -> Dict:
        """
        Check if suspect model is a copy of watermarked model.

        Args:
            suspect_model_fn: Model under suspicion.
            tolerance: A trigger matches when the suspect's output is strictly
                closer than this to the expected response.
            match_threshold: The suspect is flagged as a likely copy when the
                match rate is strictly greater than this (default 70%).

        Returns:
            Dict with ``match_rate``, ``avg_error``, ``is_likely_copy``,
            ``confidence`` (equal to the match rate), ``n_watermarks`` and
            ``n_matches``.

        Raises:
            ValueError: If no watermarks have been generated.
        """
        if not self.watermark_inputs:
            raise ValueError("Must generate watermarks first")
        errors = [
            abs(suspect_model_fn(trigger) - expected)
            for trigger, expected in zip(self.watermark_inputs,
                                         self.watermark_outputs)
        ]
        matches = sum(1 for error in errors if error < tolerance)
        match_rate = matches / len(self.watermark_inputs)
        return {
            'match_rate': match_rate,
            'avg_error': float(np.mean(errors)),
            'is_likely_copy': match_rate > match_threshold,
            'confidence': match_rate,
            'n_watermarks': len(self.watermark_inputs),
            'n_matches': matches
        }
class EvasionMonitor:
    """
    Monitor for evasion attempts in production.

    Detects:
    1. Sudden distribution shift (batch of similar adversarial inputs)
    2. Query patterns consistent with model stealing
    3. Repeated small perturbations (gradient estimation)

    Attributes:
        window_size: Maximum number of queries / similarity scores retained.
        query_threshold: Per-source query count above which a source is flagged.
        similarity_threshold: Mean cosine similarity of the last 10
            consecutive-query pairs above which probing is flagged.
        shift_threshold: Euclidean distance between the mean of the oldest and
            newest retained queries above which a shift is flagged. Depends on
            the scale of the data.
        query_history: Bounded log of ``{'input', 'source', 'timestamp'}``.
        query_sources: Cumulative query count per source (not windowed).
        similarity_scores: Bounded log of consecutive-query similarities.
    """

    def __init__(self,
                 window_size: int = 100,
                 query_threshold: int = 1000,
                 similarity_threshold: float = 0.95,
                 shift_threshold: float = 2.0):
        self.window_size = window_size
        self.query_threshold = query_threshold
        self.similarity_threshold = similarity_threshold
        self.shift_threshold = shift_threshold
        self.query_history = deque(maxlen=window_size)
        self.query_sources = defaultdict(int)
        self.similarity_scores = deque(maxlen=window_size)

    def log_query(self,
                  query_input: np.ndarray,
                  source_id: str = 'default',
                  timestamp: Optional[float] = None):
        """
        Log a model query.

        Args:
            query_input: Feature vector that was sent to the model; a copy is
                stored.
            source_id: Identifier of the caller.
            timestamp: Time of the query; defaults to the current time.
        """
        # ``is None`` rather than ``or``: a timestamp of 0.0 is a valid value.
        ts = time.time() if timestamp is None else timestamp
        self.query_history.append({
            'input': np.array(query_input),
            'source': source_id,
            'timestamp': ts
        })
        self.query_sources[source_id] += 1
        # Compare against the immediately preceding query (any source).
        if len(self.query_history) >= 2:
            previous = self.query_history[-2]['input']
            self.similarity_scores.append(
                self._cosine_similarity(query_input, previous)
            )

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity between two vectors; 0.0 if either has zero norm."""
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return np.dot(a, b) / (norm_a * norm_b)

    def detect_threats(self) -> List[Dict]:
        """
        Detect potential attack patterns.

        Returns:
            A list of threat dicts, each with ``type`` and ``severity`` plus
            type-specific details. ``type`` is one of ``excessive_queries``,
            ``gradient_estimation`` or ``distribution_shift``.
        """
        threats = []

        # 1. Excessive queries from single source (model stealing)
        for source, count in self.query_sources.items():
            if count > self.query_threshold:
                threats.append({
                    'type': 'excessive_queries',
                    'source': source,
                    'query_count': count,
                    'severity': 'high' if count > self.query_threshold * 2 else 'medium'
                })

        # 2. Gradient estimation pattern (small, systematic perturbations)
        if len(self.similarity_scores) >= 10:
            avg_sim = np.mean(list(self.similarity_scores)[-10:])
            if avg_sim > self.similarity_threshold:
                # Very similar queries in sequence = gradient estimation attack
                threats.append({
                    'type': 'gradient_estimation',
                    'avg_similarity': avg_sim,
                    'severity': 'medium'
                })

        # 3. Distribution shift in recent queries
        history = list(self.query_history)
        if len(history) >= 20:
            # Compare up to 20 of the oldest against up to 20 of the newest
            # retained queries, capped at half the history so the two windows
            # never overlap (at exactly 20 entries they would be identical).
            span = min(20, len(history) // 2)
            older_inputs = np.array([q['input'] for q in history[:span]])
            recent_inputs = np.array([q['input'] for q in history[-span:]])
            shift = np.linalg.norm(
                np.mean(recent_inputs, axis=0) - np.mean(older_inputs, axis=0)
            )
            if shift > self.shift_threshold:
                threats.append({
                    'type': 'distribution_shift',
                    'shift_magnitude': shift,
                    'severity': 'medium'
                })
        return threats
def _run_demo() -> None:
    """Run a small end-to-end demonstration of every defense in this module.

    Prints its results to stdout. Uses a fixed NumPy seed so repeated runs
    produce the same numbers.
    """
    print("=" * 70)
    print("  ADVERSARIAL ROBUSTNESS & MODEL DEFENSE")
    print("=" * 70)
    np.random.seed(42)

    # Simple model to attack
    weights = np.array([0.5, -0.3, 0.8, -0.2, 0.1])

    def simple_model(x):
        return np.dot(x, weights)

    # Generate test data. Labels carry observation noise: if they were the
    # model's own outputs the residual would be exactly zero, which makes the
    # loss gradient vanish and the clean MSE (the denominator of the relative
    # degradation) zero.
    n_samples = 100
    X_test = np.random.randn(n_samples, 5)
    y_test = X_test @ weights + 0.1 * np.random.randn(n_samples)

    print("\n1. ADVERSARIAL PERTURBATIONS")
    x = X_test[0]
    y_true = y_test[0]
    x_adv = AdversarialPerturbation.fgsm(simple_model, x, y_true, epsilon=0.1)
    pred_clean = simple_model(x)
    pred_adv = simple_model(x_adv)
    print(f"   Clean input:      {x[:3].round(3)}...")
    print(f"   Clean prediction: {pred_clean:.4f}")
    print(f"   True value:       {y_true:.4f}")
    print(f"   Adversarial pred: {pred_adv:.4f}")
    print(f"   Perturbation:     {np.linalg.norm(x_adv - x):.4f}")

    # 2. Robustness evaluation
    print("\n2. ROBUSTNESS EVALUATION")
    adv_training = AdversarialTraining(epsilon=0.01, alpha=0.5)
    robustness = adv_training.evaluate_robustness(
        simple_model, X_test[:20], y_test[:20]
    )
    print(robustness.to_string(index=False))

    # 3. Anomaly detection
    print("\n3. ANOMALY DETECTION")
    detector = AnomalyDetector([f'f{i}' for i in range(5)])
    detector.fit(X_test)
    # Normal input
    result_normal = detector.detect(X_test[0])
    print(f"   Normal input:  anomaly={result_normal['is_anomaly']}, "
          f"score={result_normal['anomaly_score']:.3f}")
    # Anomalous input
    anomalous = np.array([100.0, 0, 0, 0, 0])
    result_anom = detector.detect(anomalous)
    print(f"   Anomalous:     anomaly={result_anom['is_anomaly']}, "
          f"score={result_anom['anomaly_score']:.3f}, "
          f"violations={result_anom['violations']}")

    # 4. Model watermarking
    print("\n4. MODEL WATERMARKING")
    watermark = ModelWatermarking(n_watermarks=5)
    watermark.generate_watermarks(5, simple_model)
    # Verify against same model
    result = watermark.verify_ownership(simple_model, tolerance=0.5)
    print(f"   Match rate: {result['match_rate']*100:.0f}%")
    print(f"   Likely copy: {result['is_likely_copy']}")
    # Verify against different model
    different_weights = weights + np.random.randn(5) * 0.1

    def different_model(x):
        return np.dot(x, different_weights)

    result2 = watermark.verify_ownership(different_model, tolerance=0.5)
    print(f"   Different model match rate: {result2['match_rate']*100:.0f}%")
    print(f"   Different model likely copy: {result2['is_likely_copy']}")

    # 5. Evasion monitoring
    print("\n5. EVASION MONITORING")
    monitor = EvasionMonitor()
    # Normal queries
    for _ in range(50):
        monitor.log_query(np.random.randn(5))
    # Simulated gradient estimation attack: tiny perturbations of one point
    base = np.random.randn(5)
    for _ in range(20):
        monitor.log_query(base + np.random.randn(5) * 0.001)
    threats = monitor.detect_threats()
    print(f"   Queries logged: {len(monitor.query_history)}")
    print(f"   Threats detected: {len(threats)}")
    for threat in threats:
        print(f"     {threat['type']}: severity={threat['severity']}")

    print("\n  KEY TAKEAWAYS:")
    print("    - Adversarial training: robust models survive attacks")
    print("    - Anomaly detection: stop bad inputs before they hit the model")
    print("    - Watermarking: prove ownership if model is stolen")
    print("    - Evasion monitoring: detect systematic probing in production")
    print("    - Jane Street protects IP like state secrets")


if __name__ == '__main__':
    _run_demo()