"""data_snooping_guard.py — Multiple Testing & Data Snooping Protection

Protects against false discovery from multiple strategy testing using
White's Reality Check, False Discovery Rate (FDR) control, and
Bonferroni/Holm correction. Essential for honest quant research.

References:
- White 2000: "A Reality Check for Data Snooping"
- Romano & Wolf 2005: "Stepwise Multiple Testing as Formalized Data Snooping"
- Benjamini & Hochberg 1995: "Controlling the False Discovery Rate"
"""
import numpy as np
import pandas as pd
from scipy import stats


class DataSnoopingGuard:
    """Guards against data snooping bias in strategy backtests."""

    def __init__(self, n_strategies_tested=1):
        # Total number of strategies examined; used as the Bonferroni factor.
        # May exceed len(p_values) passed later if only a subset is reported.
        self.n = n_strategies_tested

    def bonferroni(self, p_values, alpha=0.05):
        """Bonferroni correction: reject if p * n < alpha.

        Args:
            p_values: iterable of raw (uncorrected) p-values.
            alpha: familywise significance level.

        Returns:
            DataFrame with columns ``p_value``, ``corrected_p`` (clipped at
            1.0) and boolean ``reject``, in the input order.
        """
        # Correction factor is self.n (strategies tested), not len(p_values).
        corrected = np.minimum(np.asarray(p_values, dtype=float) * self.n, 1.0)
        return pd.DataFrame({
            'p_value': p_values,
            'corrected_p': corrected,
            'reject': corrected < alpha,
        })

    def holm(self, p_values, alpha=0.05):
        """Holm-Bonferroni step-down correction (less conservative than Bonferroni).

        The i-th smallest p-value (0-indexed) is compared against
        alpha / (m - i); rejection stops at the first failure.

        Returns:
            DataFrame with columns ``p_value`` and boolean ``reject``,
            in the input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = len(p)
        order = np.argsort(p)
        # BUG FIX: thresholds were alpha / (m - i + 1), i.e. denominators
        # m+1, m, ..., 2 — off by one and uniformly too conservative.
        # Correct Holm denominators are m, m-1, ..., 1.
        thresholds = alpha / (m - np.arange(m))
        reject = np.zeros(m, dtype=bool)
        for rank, orig_idx in enumerate(order):
            if p[orig_idx] < thresholds[rank]:
                reject[orig_idx] = True
            else:
                break  # step-down: once one fails, all larger p-values fail
        return pd.DataFrame({'p_value': p_values, 'reject': reject})

    def benjamini_hochberg(self, p_values, alpha=0.05):
        """Benjamini-Hochberg FDR control.

        Controls the expected proportion of false discoveries among the
        rejected hypotheses at level ``alpha``.

        Returns:
            DataFrame with columns ``p_value``, boolean ``reject`` and
            ``fdr_threshold`` (the BH threshold alpha*k/m matched to each
            entry's rank), in the input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = len(p)
        order = np.argsort(p)
        sorted_p = p[order]
        thresholds = alpha * np.arange(1, m + 1) / m
        # Largest k with p_(k) <= alpha*k/m; reject the k smallest p-values.
        passing = np.nonzero(sorted_p <= thresholds)[0]
        k = int(passing[-1]) + 1 if passing.size else 0
        reject = np.zeros(m, dtype=bool)
        if k > 0:
            reject[order[:k]] = True
        ranks = np.argsort(order)  # rank of each original entry in sorted order
        return pd.DataFrame({
            'p_value': p_values,
            'reject': reject,
            'fdr_threshold': thresholds[ranks],
        })

    def whites_reality_check(self, strategy_returns, benchmark_returns, n_boot=1000):
        """Bootstrap test of whether a strategy outperforms its benchmark.

        NOTE(review): this is a single-strategy centered bootstrap in the
        spirit of White (2000); the full Reality Check takes the max t-stat
        over all candidate strategies — confirm which variant callers expect.

        Args:
            strategy_returns: return series of the strategy.
            benchmark_returns: return series of the benchmark.
            n_boot: number of bootstrap resamples.

        Returns:
            dict with t_stat, p_value, reject_null (at 5%), n_bootstraps,
            and the percentile of the observed t-stat in the bootstrap
            distribution.
        """
        s = np.asarray(strategy_returns, dtype=float).flatten()
        b = np.asarray(benchmark_returns, dtype=float).flatten()
        if len(s) != len(b):
            # Recycles/truncates the benchmark to match — presumably for
            # convenience with mismatched histories; verify this is intended
            # rather than raising on misaligned inputs.
            b = np.resize(b, len(s))
        excess = s - b
        # Epsilon avoids division by zero for constant excess returns.
        t_obs = excess.mean() / (excess.std() + 1e-10) * np.sqrt(len(excess))
        # Bootstrap under the null: center excess returns at zero mean.
        centered = excess - excess.mean()
        t_boot = np.empty(n_boot)
        for i in range(n_boot):
            sample = np.random.choice(centered, size=len(centered), replace=True)
            t_boot[i] = sample.mean() / (sample.std() + 1e-10) * np.sqrt(len(sample))
        # One-sided p-value: share of bootstrap t-stats at least as extreme.
        p_value = float((t_boot >= t_obs).mean())
        return {
            't_stat': float(t_obs),
            'p_value': p_value,
            'reject_null': bool(p_value < 0.05),
            'n_bootstraps': n_boot,
            'percentile': float(stats.percentileofscore(t_boot, t_obs)),
        }

    def familywise_error_report(self, sharpe_ratios, n_strategies=None, alpha=0.05):
        """Full markdown report on multiple-testing corrections for Sharpe ratios.

        Args:
            sharpe_ratios: iterable of Sharpe ratios, one per strategy.
            n_strategies: total strategies tested (defaults to
                ``len(sharpe_ratios)``); used in the headline and warning.
            alpha: significance level.

        Returns:
            Markdown-formatted report string.
        """
        if n_strategies is None:
            n_strategies = len(sharpe_ratios)
        # Approximate two-sided p-values via asymptotic normality:
        # t ≈ SR * sqrt(T). The sqrt(252) assumes daily Sharpe ratios over
        # ~one year of observations — TODO confirm against callers.
        p_values = [2 * (1 - stats.norm.cdf(abs(s) * np.sqrt(252)))
                    for s in sharpe_ratios]
        bonf = self.bonferroni(p_values, alpha)
        holm_r = self.holm(p_values, alpha)
        bh = self.benjamini_hochberg(p_values, alpha)
        report = f"""## Multiple Testing Correction Report
Strategies tested: {n_strategies} | Significance level: {alpha*100:.0f}%

| Strategy | Sharpe | p-value | Bonferroni | Holm | FDR (BH) |
|----------|--------|---------|------------|------|----------|
"""
        for i, s in enumerate(sharpe_ratios):
            report += (f"| #{i+1} | {s:.2f} | {p_values[i]:.4f} | "
                       f"{'✅' if bonf['reject'].iloc[i] else '❌'} | "
                       f"{'✅' if holm_r['reject'].iloc[i] else '❌'} | "
                       f"{'✅' if bh['reject'].iloc[i] else '❌'} |\n")
        report += f"\n**Bonferroni corrected threshold:** {alpha/n_strategies:.6f}\n"
        report += f"**Significant strategies (FDR-controlled):** {bh['reject'].sum()} of {n_strategies}\n"
        # BUG FIX: was {1-(1-alpha)**n_strategies*100:.1f} which, by operator
        # precedence, computes 1 - ((1-alpha)^n * 100) and prints nonsense
        # (e.g. -84.7% for n=50). The FWER is (1 - (1-alpha)^n) * 100.
        report += (f"\n⚠️ **Data Snooping Warning:** If you tested {n_strategies} "
                   f"strategies, the probability of at least one false positive at "
                   f"{alpha*100:.0f}% is {(1-(1-alpha)**n_strategies)*100:.1f}% uncorrected.")
        return report


if __name__ == '__main__':
    np.random.seed(42)
    # Simulate 50 strategies with the same small drift plus noise.
    sharpe = [0.05 + np.random.normal(0, 0.3) for _ in range(50)]
    guard = DataSnoopingGuard(n_strategies_tested=50)
    print(guard.familywise_error_report(sharpe))