alphaforge-quant-system / data_snooping_guard.py
Premchan369's picture
Add data snooping guard - White's Reality Check, FDR, Bonferroni for multiple testing
6f0860b verified
"""data_snooping_guard.py β€” Multiple Testing & Data Snooping Protection
Protects against false discovery from multiple strategy testing using
White's Reality Check, False Discovery Rate (FDR) control, and
Bonferroni/Holm correction. Essential for honest quant research.
References:
- White 2000: "A Reality Check for Data Snooping"
- Romano & Wolf 2005: "Stepwise Multiple Testing as Formalized Data Snooping"
- Benjamini & Hochberg 1995: "Controlling the False Discovery Rate"
"""
import numpy as np, pandas as pd
from scipy import stats
class DataSnoopingGuard:
    """Guards against data snooping bias in strategy backtests.

    Bundles family-wise error corrections (Bonferroni, Holm), false
    discovery rate control (Benjamini-Hochberg), a bootstrap significance
    test on excess returns, and a Markdown report for a set of Sharpe ratios.
    """

    def __init__(self, n_strategies_tested=1):
        """Remember how many strategies were tried in total.

        Args:
            n_strategies_tested: Number of strategies evaluated. Used as the
                default Bonferroni multiplier, so it may exceed the number of
                p-values later passed in.

        Raises:
            ValueError: If ``n_strategies_tested`` is smaller than 1.
        """
        if n_strategies_tested < 1:
            raise ValueError("n_strategies_tested must be at least 1")
        self.n = n_strategies_tested

    def bonferroni(self, p_values, alpha=0.05, n_tests=None):
        """Bonferroni correction: reject if p < alpha/n.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Family-wise significance level.
            n_tests: Number of tests to correct for. Defaults to the
                ``n_strategies_tested`` given to the constructor.

        Returns:
            DataFrame with columns ``p_value``, ``corrected_p``
            (``min(p * n, 1)``) and ``reject``.
        """
        n = self.n if n_tests is None else n_tests
        corrected = np.minimum(np.asarray(p_values, dtype=float) * n, 1.0)
        return pd.DataFrame({'p_value': p_values,
                             'corrected_p': corrected,
                             'reject': corrected < alpha})

    def holm(self, p_values, alpha=0.05):
        """Holm-Bonferroni (step-down, less conservative).

        The p-values are visited in ascending order; the one at 0-based rank
        ``i`` is compared with ``alpha / (m - i)``, where ``m`` is the number
        of p-values. Rejection stops at the first p-value that fails.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Family-wise significance level.

        Returns:
            DataFrame with columns ``p_value`` and ``reject`` in input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = len(p)
        order = np.argsort(p)
        # Smallest p-value faces alpha/m, the largest faces alpha/1.
        thresholds = alpha / (m - np.arange(m))
        passes = p[order] < thresholds
        # Step-down: everything before the first failure is rejected.
        n_reject = m if passes.all() else int(np.argmin(passes))
        reject = np.zeros(m, dtype=bool)
        reject[order[:n_reject]] = True
        return pd.DataFrame({'p_value': p_values, 'reject': reject})

    def benjamini_hochberg(self, p_values, alpha=0.05):
        """FDR control: controls expected proportion of false discoveries.

        Finds the largest rank ``k`` (1-based, ascending p-values) with
        ``p(k) <= alpha * k / m`` and rejects the ``k`` smallest p-values.

        Args:
            p_values: Sequence of raw p-values.
            alpha: Target false discovery rate.

        Returns:
            DataFrame with columns ``p_value``, ``reject`` and
            ``fdr_threshold`` (the rank-based threshold each p-value was
            compared with), all in input order.
        """
        p = np.asarray(p_values, dtype=float)
        m = len(p)
        order = np.argsort(p)
        thresholds = alpha * (np.arange(1, m + 1) / m)
        # Step-up: the largest passing rank decides, even if smaller ranks fail.
        passing = np.nonzero(p[order] <= thresholds)[0]
        k = int(passing[-1]) + 1 if passing.size else 0
        reject = np.zeros(m, dtype=bool)
        reject[order[:k]] = True
        # argsort of the sort order gives each input element's rank.
        ranks = np.argsort(order)
        return pd.DataFrame({'p_value': p_values, 'reject': reject,
                             'fdr_threshold': np.take(thresholds, ranks)})

    def whites_reality_check(self, strategy_returns, benchmark_returns,
                             n_boot=1000, alpha=0.05, random_state=None):
        """Bootstrap test of excess return over a benchmark.

        Both inputs are flattened to one dimension, so this tests a single
        excess-return series with an i.i.d. bootstrap of the mean-centered
        excess returns; it does not take a maximum across several strategies.

        Args:
            strategy_returns: Strategy return series.
            benchmark_returns: Benchmark return series.
            n_boot: Number of bootstrap resamples.
            alpha: Significance level used for ``reject_null``.
            random_state: Optional seed. When ``None`` the global
                ``np.random`` state is used, so ``np.random.seed`` still
                controls the result.

        Returns:
            Dict with ``t_stat``, ``p_value``, ``reject_null``,
            ``n_bootstraps`` and ``percentile``.

        Raises:
            ValueError: If ``strategy_returns`` is empty.
        """
        s = np.array(strategy_returns).flatten()
        b = np.array(benchmark_returns).flatten()
        if len(s) == 0:
            raise ValueError("strategy_returns must not be empty")
        # NOTE(review): a benchmark of a different length is silently tiled or
        # truncated to fit; callers should align the two series beforehand.
        if len(s) != len(b):
            b = np.resize(b, len(s))
        excess = s - b
        n_obs = len(excess)
        root_n = np.sqrt(n_obs)
        # The 1e-10 keeps the statistic finite for a constant series.
        t_obs = excess.mean() / (excess.std() + 1e-10) * root_n

        # Bootstrap under the null: resample the mean-centered excess returns.
        centered = excess - excess.mean()
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        t_boot = np.empty(n_boot)
        for j in range(n_boot):
            sample = rng.choice(centered, size=n_obs, replace=True)
            t_boot[j] = sample.mean() / (sample.std() + 1e-10) * root_n

        p_value = (t_boot >= t_obs).mean()
        return {'t_stat': float(t_obs), 'p_value': float(p_value),
                'reject_null': bool(p_value < alpha),
                'n_bootstraps': n_boot,
                'percentile': float(stats.percentileofscore(t_boot, t_obs))}

    def familywise_error_report(self, sharpe_ratios, n_strategies=None,
                                alpha=0.05, n_obs=252):
        """Full report on multiple testing corrections for Sharpe ratios.

        Each Sharpe ratio is converted to a two-sided p-value using
        ``z = |sharpe| * sqrt(n_obs)`` and the standard normal distribution.

        Args:
            sharpe_ratios: Sequence of Sharpe ratios, one per strategy.
            n_strategies: Number of strategies to correct for. Defaults to
                the larger of the constructor's ``n_strategies_tested`` and
                ``len(sharpe_ratios)``. The same count drives the Bonferroni
                column and the printed threshold.
            alpha: Significance level.
            n_obs: Multiplier under the square root in the z-score.
                NOTE(review): 252 presumably means one year of daily
                observations; confirm against how the ratios are computed.

        Returns:
            Markdown text with one table row per strategy and a summary.
        """
        if n_strategies is None:
            n_strategies = max(self.n, len(sharpe_ratios))

        # sf() avoids the cancellation in 1 - cdf() for large z-scores.
        z = np.abs(np.asarray(sharpe_ratios, dtype=float)) * np.sqrt(n_obs)
        p_values = 2 * stats.norm.sf(z)

        bonf = self.bonferroni(p_values, alpha, n_tests=n_strategies)
        holm_r = self.holm(p_values, alpha)
        bh = self.benjamini_hochberg(p_values, alpha)

        def mark(flag):
            return '✅' if flag else '❌'

        lines = [
            "## Multiple Testing Correction Report",
            f"Strategies tested: {n_strategies} | Significance level: {alpha*100:.0f}%",
            "| Strategy | Sharpe | p-value | Bonferroni | Holm | FDR (BH) |",
            "|----------|--------|---------|------------|------|----------|",
        ]
        rows = zip(sharpe_ratios, p_values,
                   bonf['reject'], holm_r['reject'], bh['reject'])
        for number, (s, p, r_bonf, r_holm, r_bh) in enumerate(rows, start=1):
            lines.append(f"| #{number} | {s:.2f} | {p:.4f} | {mark(r_bonf)} "
                         f"| {mark(r_holm)} | {mark(r_bh)} |")

        n_significant = int(bh['reject'].sum())
        # Probability of at least one false positive among independent tests.
        p_any_false = 1 - (1 - alpha) ** n_strategies
        lines.extend([
            "",
            f"**Bonferroni corrected threshold:** {alpha/n_strategies:.6f}",
            f"**Significant strategies (FDR-controlled):** {n_significant} of {n_strategies}",
            "",
            f"⚠️ **Data Snooping Warning:** If you tested {n_strategies} strategies, "
            f"the probability of at least one false positive at {alpha*100:.0f}% "
            f"is {p_any_false*100:.1f}% uncorrected.",
        ])
        return "\n".join(lines)
if __name__ == '__main__':
    # Demo: 50 simulated Sharpe ratios, each drawn as 0.05 plus
    # normal noise with standard deviation 0.3, under a fixed seed.
    DEMO_COUNT = 50
    np.random.seed(42)
    demo_sharpes = []
    for _ in range(DEMO_COUNT):
        demo_sharpes.append(0.05 + np.random.normal(0, 0.3))
    demo_guard = DataSnoopingGuard(n_strategies_tested=DEMO_COUNT)
    demo_report = demo_guard.familywise_error_report(demo_sharpes)
    print(demo_report)