File size: 5,179 Bytes

6f0860b

"""data_snooping_guard.py — Multiple Testing & Data Snooping Protection

Protects against false discovery from multiple strategy testing using
White's Reality Check, False Discovery Rate (FDR) control, and
Bonferroni/Holm correction. Essential for honest quant research.

References:
- White 2000: "A Reality Check for Data Snooping"
- Romano & Wolf 2005: "Stepwise Multiple Testing as Formalized Data Snooping"
- Benjamini & Hochberg 1995: "Controlling the False Discovery Rate"
"""
import numpy as np, pandas as pd
from scipy import stats

class DataSnoopingGuard:
    """Guards against data snooping bias in strategy backtests.

    Bundles family-wise error (Bonferroni, Holm) and false-discovery-rate
    (Benjamini-Hochberg) corrections, a bootstrap test on excess returns,
    and a Markdown report that applies all three corrections to a list of
    Sharpe ratios.

    Attributes:
        n: Number of strategies that were tested in total. Used as the
            Bonferroni multiplier and as a lower bound for the strategy
            count shown in the report.
    """

    def __init__(self, n_strategies_tested=1):
        """Store the total number of strategies that were tried."""
        self.n = n_strategies_tested

    @staticmethod
    def _bonferroni_table(p_values, alpha, n_tests):
        """Return the Bonferroni table for ``p_values`` given ``n_tests`` tests."""
        # Adjusted p-value is p * n_tests, capped at 1 so it remains a probability.
        corrected = np.minimum(np.asarray(p_values, dtype=float) * n_tests, 1.0)
        return pd.DataFrame({'p_value': p_values,
                             'corrected_p': corrected,
                             'reject': corrected < alpha})

    def bonferroni(self, p_values, alpha=0.05):
        """Bonferroni correction: reject if p * n < alpha.

        ``n`` is the ``n_strategies_tested`` value given to the constructor,
        not ``len(p_values)``.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Family-wise significance level.

        Returns:
            DataFrame with columns ``p_value``, ``corrected_p`` and ``reject``.
        """
        return self._bonferroni_table(p_values, alpha, self.n)

    def holm(self, p_values, alpha=0.05):
        """Holm-Bonferroni (step-down, less conservative than Bonferroni).

        The k-th smallest p-value (k = 1..m) is compared with
        ``alpha / (m - k + 1)``; testing stops at the first p-value that
        is not below its threshold.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Family-wise significance level.

        Returns:
            DataFrame with columns ``p_value`` and ``reject``, in the
            original input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = len(p)
        order = np.argsort(p)
        # Zero-based rank i corresponds to k = i + 1, so the divisor is m - i.
        # (Dividing by m - i + 1 would make every step one notch too strict.)
        thresholds = alpha / (m - np.arange(m))
        reject = np.zeros(m, dtype=bool)
        for position, threshold in zip(order, thresholds):
            if p[position] < threshold:
                reject[position] = True
            else:
                # Step-down: once one hypothesis survives, all larger
                # p-values survive too.
                break
        return pd.DataFrame({'p_value': p_values, 'reject': reject})

    def benjamini_hochberg(self, p_values, alpha=0.05):
        """FDR control: controls expected proportion of false discoveries.

        Finds the largest rank k with ``p_(k) <= alpha * k / m`` and rejects
        the k smallest p-values.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Target false discovery rate.

        Returns:
            DataFrame with columns ``p_value``, ``reject`` and
            ``fdr_threshold`` (the threshold matching each p-value's rank),
            in the original input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = len(p)
        order = np.argsort(p)
        thresholds = alpha * (np.arange(1, m + 1) / m)
        # Step-up: only the LAST rank that passes matters.
        passing = np.nonzero(p[order] <= thresholds)[0]
        k = int(passing[-1]) + 1 if passing.size else 0
        reject = np.zeros(m, dtype=bool)
        reject[order[:k]] = True
        # Inverse permutation: rank of every p-value in the original order.
        ranks = np.empty(m, dtype=int)
        ranks[order] = np.arange(m)
        return pd.DataFrame({'p_value': p_values, 'reject': reject,
                             'fdr_threshold': thresholds[ranks]})

    def whites_reality_check(self, strategy_returns, benchmark_returns,
                             n_boot=1000, alpha=0.05):
        """Bootstrap test of mean excess return over a benchmark.

        Both inputs are flattened to 1-D, so a 2-D matrix of strategies is
        pooled into one series rather than tested via the maximum across
        strategies. If the lengths differ, the benchmark is repeated or
        truncated with ``np.resize`` to match the strategy series.

        The null distribution is built by resampling (i.i.d., with
        replacement) the mean-centred excess returns with the global
        ``np.random`` state; seed it beforehand for reproducible results.

        Args:
            strategy_returns: Array-like of strategy returns.
            benchmark_returns: Array-like of benchmark returns.
            n_boot: Number of bootstrap resamples.
            alpha: Significance level used for ``reject_null``.

        Returns:
            Dict with ``t_stat``, ``p_value`` (share of bootstrap statistics
            at or above the observed one), ``reject_null``, ``n_bootstraps``
            and ``percentile``.
        """
        s = np.asarray(strategy_returns).flatten()
        b = np.asarray(benchmark_returns).flatten()
        if len(s) != len(b):
            b = np.resize(b, len(s))
        excess = s - b
        n_obs = len(excess)
        root_n = np.sqrt(n_obs)
        # The 1e-10 keeps the statistic finite when the series is constant.
        t_obs = excess.mean() / (excess.std() + 1e-10) * root_n
        # Centring imposes the null hypothesis of zero mean excess return.
        centered = excess - excess.mean()
        t_boot = np.empty(n_boot)
        for i in range(n_boot):
            sample = np.random.choice(centered, size=n_obs, replace=True)
            t_boot[i] = sample.mean() / (sample.std() + 1e-10) * root_n
        p_value = (t_boot >= t_obs).mean()
        return {'t_stat': float(t_obs), 'p_value': float(p_value),
                'reject_null': bool(p_value < alpha),
                'n_bootstraps': n_boot,
                'percentile': float(stats.percentileofscore(t_boot, t_obs))}

    def familywise_error_report(self, sharpe_ratios, n_strategies=None, alpha=0.05):
        """Full report on multiple testing corrections for Sharpe ratios.

        Args:
            sharpe_ratios: Sequence of Sharpe ratios, one per strategy.
            n_strategies: Total number of strategies tested. Defaults to the
                larger of the constructor's ``n_strategies_tested`` and
                ``len(sharpe_ratios)``. It drives the Bonferroni column, the
                printed threshold and the warning; Holm and BH operate on
                the supplied ratios only.
            alpha: Significance level.

        Returns:
            Markdown string with one table row per strategy and a summary.
        """
        if n_strategies is None:
            n_strategies = max(self.n, len(sharpe_ratios))
        # Approximate two-sided p-values from Sharpe (asymptotic normal).
        # NOTE(review): z = |SR| * sqrt(252) presumes a per-period Sharpe
        # ratio observed over 252 periods - confirm against callers.
        z_scores = np.abs(np.asarray(sharpe_ratios, dtype=float)) * np.sqrt(252)
        # sf(z) equals 1 - cdf(z) without cancellation in the far tail.
        p_values = 2 * stats.norm.sf(z_scores)
        # Use the same count for the correction as for the printed threshold.
        bonf = self._bonferroni_table(p_values, alpha, n_strategies)
        holm_r = self.holm(p_values, alpha)
        bh = self.benjamini_hochberg(p_values, alpha)

        def mark(flag):
            return '✅' if flag else '❌'

        parts = [
            "## Multiple Testing Correction Report\n",
            "\n",
            f"Strategies tested: {n_strategies} | Significance level: {alpha*100:.0f}%\n",
            "\n",
            "| Strategy | Sharpe | p-value | Bonferroni | Holm | FDR (BH) |\n",
            "|----------|--------|---------|------------|------|----------|\n",
        ]
        for i, s in enumerate(sharpe_ratios):
            parts.append(
                f"| #{i+1} | {s:.2f} | {p_values[i]:.4f} "
                f"| {mark(bonf['reject'].iloc[i])} "
                f"| {mark(holm_r['reject'].iloc[i])} "
                f"| {mark(bh['reject'].iloc[i])} |\n"
            )
        # Parenthesised so the probability is formed first, then scaled to percent.
        any_false_positive_pct = (1 - (1 - alpha) ** n_strategies) * 100
        parts.append(f"\n**Bonferroni corrected threshold:** {alpha/n_strategies:.6f}\n")
        parts.append(f"**Significant strategies (FDR-controlled):** {bh['reject'].sum()} of {n_strategies}\n")
        parts.append(f"\n⚠️ **Data Snooping Warning:** If you tested {n_strategies} strategies, the probability of at least one false positive at {alpha*100:.0f}% is {any_false_positive_pct:.1f}% uncorrected.")
        return "".join(parts)

if __name__ == '__main__':
    # Demo: build a report for simulated Sharpe ratios. Every ratio is drawn
    # from the same distribution (0.05 plus normal noise with sd 0.3), and
    # the seed is fixed so the output is repeatable.
    np.random.seed(42)
    n_simulated = 50
    sharpe = []
    for _ in range(n_simulated):
        sharpe.append(0.05 + np.random.normal(0, 0.3))
    guard = DataSnoopingGuard(n_strategies_tested=n_simulated)
    print(guard.familywise_error_report(sharpe))