Add data snooping guard - White's Reality Check, FDR, Bonferroni for multiple testing
6f0860b verified | """data_snooping_guard.py — Multiple Testing & Data Snooping Protection | |
| Protects against false discovery from multiple strategy testing using | |
| White's Reality Check, False Discovery Rate (FDR) control, and | |
| Bonferroni/Holm correction. Essential for honest quant research. | |
| References: | |
| - White 2000: "A Reality Check for Data Snooping" | |
| - Romano & Wolf 2005: "Stepwise Multiple Testing as Formalized Data Snooping" | |
| - Benjamini & Hochberg 1995: "Controlling the False Discovery Rate" | |
| """ | |
| import numpy as np, pandas as pd | |
| from scipy import stats | |
class DataSnoopingGuard:
    """Guards against data snooping bias in strategy backtests.

    Bundles three multiple-testing corrections for a list of p-values
    (Bonferroni, Holm step-down, Benjamini-Hochberg FDR), a bootstrap
    "reality check" for the best of one or more strategies, and a markdown
    report that applies all three corrections to a list of Sharpe ratios.

    Args:
        n_strategies_tested: Total number of strategies that were tried,
            including any that are no longer being reported. Used as a lower
            bound on the Bonferroni multiplier.
    """

    def __init__(self, n_strategies_tested=1):
        self.n = n_strategies_tested

    def bonferroni(self, p_values, alpha=0.05, n_tests=None):
        """Bonferroni correction: reject if p * n_tests < alpha.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Family-wise significance level.
            n_tests: Number of hypotheses to correct for. When omitted, the
                larger of ``n_strategies_tested`` and ``len(p_values)`` is
                used, so the correction is never weaker than the number of
                p-values actually supplied.

        Returns:
            DataFrame with columns ``p_value``, ``corrected_p`` (capped at 1)
            and ``reject``.
        """
        p = np.asarray(p_values, dtype=float)
        if n_tests is None:
            # A family cannot contain fewer tests than the p-values handed in;
            # relying on self.n alone silently skipped the correction for a
            # default-constructed guard.
            n_tests = max(self.n, p.size)
        corrected = np.minimum(p * n_tests, 1.0)
        reject = corrected < alpha
        return pd.DataFrame({'p_value': p_values, 'corrected_p': corrected, 'reject': reject})

    def holm(self, p_values, alpha=0.05):
        """Holm-Bonferroni step-down correction (less conservative).

        The p-values are sorted ascending and the i-th smallest (0-based) is
        compared with ``alpha / (m - i)``, where ``m = len(p_values)``.
        Rejection stops at the first p-value that fails its threshold.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Family-wise significance level.

        Returns:
            DataFrame with columns ``p_value`` and ``reject``, in input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = p.size
        order = np.argsort(p)
        # Divisors run m, m-1, ..., 1 (the smallest p-value faces alpha/m).
        thresholds = alpha / (m - np.arange(m))
        passed = p[order] < thresholds
        # Step-down: keep only the leading run of passes.
        n_reject = m if passed.all() else int(np.argmin(passed))
        reject = np.zeros(m, dtype=bool)
        reject[order[:n_reject]] = True
        return pd.DataFrame({'p_value': p_values, 'reject': reject})

    def benjamini_hochberg(self, p_values, alpha=0.05):
        """FDR control: controls expected proportion of false discoveries.

        Finds the largest rank ``k`` (1-based, ascending p-values) such that
        ``p_(k) <= alpha * k / m`` and rejects the ``k`` smallest p-values.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Target false discovery rate.

        Returns:
            DataFrame with columns ``p_value``, ``reject`` and
            ``fdr_threshold`` (the rank-specific threshold each p-value was
            compared with), in input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = p.size
        order = np.argsort(p)
        thresholds = alpha * (np.arange(1, m + 1) / m)
        # Step-up: the LAST rank that clears its threshold decides the cut.
        clearing = np.nonzero(p[order] <= thresholds)[0]
        k = int(clearing[-1]) + 1 if clearing.size else 0
        reject = np.zeros(m, dtype=bool)
        reject[order[:k]] = True
        # Scatter the sorted thresholds back to the original positions.
        fdr_threshold = np.empty(m, dtype=float)
        fdr_threshold[order] = thresholds
        return pd.DataFrame({'p_value': p_values, 'reject': reject,
                             'fdr_threshold': fdr_threshold})

    def whites_reality_check(self, strategy_returns, benchmark_returns, n_boot=1000, alpha=0.05):
        """Bootstrap reality check for the best strategy (after White 2000).

        Tests whether the best-performing strategy beats the benchmark by
        more than chance. For each strategy the studentised mean excess
        return ``mean / std * sqrt(n)`` is computed; the test statistic is the
        maximum over strategies. The null distribution is built by resampling
        rows (with replacement) from the mean-centred excess returns and
        recomputing that maximum.

        Note: rows are resampled independently (i.i.d. bootstrap), so serial
        dependence in returns is not preserved. Random draws come from the
        global ``np.random`` state; seed it for reproducible results.

        Args:
            strategy_returns: 1-D sequence of returns for a single strategy,
                or 2-D array-like shaped (observations, strategies).
            benchmark_returns: 1-D sequence of benchmark returns. If its
                length differs from the number of observations it is
                tiled/truncated with ``np.resize`` and a warning is issued.
            n_boot: Number of bootstrap replications.
            alpha: Significance level used for ``reject_null``.

        Returns:
            dict with ``t_stat`` (max studentised statistic), ``p_value``
            (share of bootstrap statistics >= ``t_stat``), ``reject_null``,
            ``n_bootstraps`` and ``percentile`` of ``t_stat`` within the
            bootstrap distribution.

        Raises:
            ValueError: If inputs are empty, contain NaN/inf, have more than
                two dimensions, or ``n_boot`` is less than 1.
        """
        import warnings  # local import keeps the file's import block untouched

        strat = np.asarray(strategy_returns, dtype=float)
        bench = np.asarray(benchmark_returns, dtype=float).ravel()
        if strat.ndim == 1:
            strat = strat[:, np.newaxis]
        elif strat.ndim != 2:
            raise ValueError("strategy_returns must be 1-D or 2-D (observations x strategies)")
        n_obs, n_strat = strat.shape
        if n_obs == 0 or n_strat == 0 or bench.size == 0:
            raise ValueError("strategy_returns and benchmark_returns must be non-empty")
        if n_boot < 1:
            raise ValueError("n_boot must be at least 1")
        if bench.size != n_obs:
            warnings.warn(
                f"benchmark_returns has {bench.size} observations but strategy_returns "
                f"has {n_obs}; the benchmark is being resized to match",
                RuntimeWarning,
                stacklevel=2,
            )
            bench = np.resize(bench, n_obs)

        excess = strat - bench[:, np.newaxis]
        if not np.isfinite(excess).all():
            # NaN statistics compare False against everything, which would
            # yield p_value == 0 and a spurious rejection.
            raise ValueError("returns contain NaN or infinite values; clean the data first")

        root_n = np.sqrt(n_obs)

        def best_t(sample):
            # 1e-10 guards against division by zero for constant series.
            t_each = sample.mean(axis=0) / (sample.std(axis=0) + 1e-10) * root_n
            return t_each.max()

        t_obs = best_t(excess)
        # Bootstrap under the null: every strategy has zero mean excess return.
        centered = excess - excess.mean(axis=0)
        t_boot = np.empty(n_boot, dtype=float)
        for j in range(n_boot):
            # Same rows for all strategies so cross-strategy dependence survives.
            rows = np.random.choice(n_obs, size=n_obs, replace=True)
            t_boot[j] = best_t(centered[rows])

        p_value = (t_boot >= t_obs).mean()
        return {'t_stat': float(t_obs), 'p_value': float(p_value),
                'reject_null': bool(p_value < alpha),
                'n_bootstraps': n_boot,
                'percentile': float(stats.percentileofscore(t_boot, t_obs))}

    def familywise_error_report(self, sharpe_ratios, n_strategies=None, alpha=0.05, n_obs=252):
        """Full report on multiple testing corrections for Sharpe ratios.

        Each Sharpe ratio is converted to a two-sided p-value with the normal
        approximation ``z = |SR| * sqrt(n_obs)``, then Bonferroni, Holm and
        Benjamini-Hochberg decisions are tabulated in markdown.

        Args:
            sharpe_ratios: Sequence of per-period Sharpe ratios.
            n_strategies: Number of strategies tested, used for the Bonferroni
                correction and the summary lines. Defaults to the larger of
                ``n_strategies_tested`` and ``len(sharpe_ratios)``.
            alpha: Significance level / target FDR.
            n_obs: Number of return observations behind each Sharpe ratio.

        Returns:
            The report as a markdown string.
        """
        sharpe = np.asarray(sharpe_ratios, dtype=float).ravel()
        if n_strategies is None:
            n_strategies = max(self.n, sharpe.size, 1)
        # Approximate p-values from Sharpe (asymptotic normal). sf() keeps
        # precision in the far tail where 1 - cdf() rounds to zero.
        p_values = 2 * stats.norm.sf(np.abs(sharpe) * np.sqrt(n_obs))
        # Use the same count the report prints, so the stated Bonferroni
        # threshold matches the decisions in the table.
        bonf = self.bonferroni(p_values, alpha, n_tests=n_strategies)
        holm_r = self.holm(p_values, alpha)
        bh = self.benjamini_hochberg(p_values, alpha)

        def mark(flag):
            return '✅' if flag else '❌'

        parts = [
            "## Multiple Testing Correction Report\n",
            f"Strategies tested: {n_strategies} | Significance level: {alpha*100:.0f}%\n",
            "| Strategy | Sharpe | p-value | Bonferroni | Holm | FDR (BH) |\n",
            "|----------|--------|---------|------------|------|----------|\n",
        ]
        rows = zip(sharpe, p_values, bonf['reject'], holm_r['reject'], bh['reject'])
        for i, (s, p_val, rej_bonf, rej_holm, rej_bh) in enumerate(rows, start=1):
            parts.append(
                f"| #{i} | {s:.2f} | {p_val:.4f} | {mark(rej_bonf)} | {mark(rej_holm)} | {mark(rej_bh)} |\n"
            )
        # Probability of at least one false positive across independent tests.
        fwer = 1 - (1 - alpha) ** n_strategies
        parts.append(f"\n**Bonferroni corrected threshold:** {alpha/n_strategies:.6f}\n")
        parts.append(f"**Significant strategies (FDR-controlled):** {bh['reject'].sum()} of {n_strategies}\n")
        parts.append(
            f"\n⚠️ **Data Snooping Warning:** If you tested {n_strategies} strategies, "
            f"the probability of at least one false positive at {alpha*100:.0f}% "
            f"is {fwer*100:.1f}% uncorrected."
        )
        return "".join(parts)
if __name__ == '__main__':
    # Demo: draw 50 simulated Sharpe ratios, each 0.05 plus N(0, 0.3) noise,
    # and print the multiple-testing report for them.
    np.random.seed(42)
    n_simulated = 50
    sharpe = []
    for _ in range(n_simulated):
        sharpe.append(0.05 + np.random.normal(0, 0.3))
    guard = DataSnoopingGuard(n_strategies_tested=n_simulated)
    demo_report = guard.familywise_error_report(sharpe)
    print(demo_report)