alphaforge-quant-system / data_snooping_guard.py

Add data snooping guard - White's Reality Check, FDR, Bonferroni for multiple testing

6f0860b verified 3 days ago

5.18 kB

	"""data_snooping_guard.py — Multiple Testing & Data Snooping Protection

	Protects against false discovery from multiple strategy testing using
	White's Reality Check, False Discovery Rate (FDR) control, and
	Bonferroni/Holm correction. Essential for honest quant research.

	References:
	- White 2000: "A Reality Check for Data Snooping"
	- Romano & Wolf 2005: "Stepwise Multiple Testing as Formalized Data Snooping"
	- Benjamini & Hochberg 1995: "Controlling the False Discovery Rate"
	"""
	import numpy as np, pandas as pd
	from scipy import stats

	class DataSnoopingGuard:
	    """Guard against data-snooping bias when many strategies are backtested.

	    Provides Bonferroni, Holm and Benjamini-Hochberg corrections for a set of
	    p-values, a bootstrap test on a strategy's excess returns, and a markdown
	    report that applies the corrections to a list of Sharpe ratios.
	    """

	    def __init__(self, n_strategies_tested=1):
	        """Remember how many strategies were tried during research.

	        Args:
	            n_strategies_tested: Total number of strategies examined,
	                including ones that were discarded. Acts as a lower bound
	                on the Bonferroni multiplier.
	        """
	        self.n = n_strategies_tested

	    def _effective_tests(self, n_supplied, n_tests=None):
	        """Return the number of tests used for Bonferroni scaling."""
	        if n_tests is not None:
	            return n_tests
	        # Never correct for fewer tests than p-values actually supplied;
	        # otherwise a default-constructed guard applies no correction at all.
	        return max(self.n, n_supplied, 1)

	    @staticmethod
	    def _mark(flag):
	        """Return the report symbol for a rejected / not rejected hypothesis."""
	        return '✅' if flag else '❌'

	    def bonferroni(self, p_values, alpha=0.05, n_tests=None):
	        """Bonferroni correction: reject when ``p * n < alpha``.

	        Args:
	            p_values: Sequence of raw p-values.
	            alpha: Family-wise significance level.
	            n_tests: Number of tests to correct for. When omitted, the larger
	                of ``n_strategies_tested`` and ``len(p_values)`` is used.

	        Returns:
	            DataFrame with columns ``p_value``, ``corrected_p`` (capped at
	            1.0) and ``reject`` (strict inequality against ``alpha``).
	        """
	        p = np.asarray(p_values, dtype=float)
	        n = self._effective_tests(p.size, n_tests)
	        corrected = np.minimum(p * n, 1.0)
	        return pd.DataFrame({'p_value': p_values, 'corrected_p': corrected,
	                             'reject': corrected < alpha})

	    def holm(self, p_values, alpha=0.05):
	        """Holm-Bonferroni step-down correction (less conservative).

	        The k-th smallest p-value (k = 0, 1, ...) is compared with
	        ``alpha / (m - k)`` where ``m = len(p_values)``; testing stops at the
	        first p-value that fails. Only the supplied p-values are counted;
	        ``n_strategies_tested`` is not used here.

	        Args:
	            p_values: Sequence of raw p-values.
	            alpha: Family-wise significance level.

	        Returns:
	            DataFrame with columns ``p_value`` and ``reject``, in input order.
	        """
	        p = np.asarray(p_values, dtype=float)
	        m = p.size
	        order = np.argsort(p)
	        # Smallest p-value faces alpha/m, the largest faces alpha/1.
	        thresholds = alpha / (m - np.arange(m))
	        passed = p[order] < thresholds
	        # Step-down: everything after the first failure is retained.
	        n_reject = m if passed.all() else int(np.argmin(passed))
	        reject = np.zeros(m, dtype=bool)
	        reject[order[:n_reject]] = True
	        return pd.DataFrame({'p_value': p_values, 'reject': reject})

	    def benjamini_hochberg(self, p_values, alpha=0.05):
	        """Benjamini-Hochberg step-up procedure for FDR control.

	        Finds the largest rank k with ``p_(k) <= alpha * k / m`` and rejects
	        the k smallest p-values.

	        Args:
	            p_values: Sequence of raw p-values.
	            alpha: Target false discovery rate.

	        Returns:
	            DataFrame with columns ``p_value``, ``reject`` and
	            ``fdr_threshold`` (the rank-specific threshold each p-value was
	            compared with), in input order.
	        """
	        p = np.asarray(p_values, dtype=float)
	        m = p.size
	        order = np.argsort(p)
	        thresholds = alpha * (np.arange(1, m + 1) / m)
	        hits = np.flatnonzero(p[order] <= thresholds)
	        # Step-up: the LAST rank meeting its threshold decides the cut-off.
	        k = int(hits[-1]) + 1 if hits.size else 0
	        reject = np.zeros(m, dtype=bool)
	        reject[order[:k]] = True
	        # ranks[i] is the sorted position of input element i.
	        ranks = np.empty(m, dtype=int)
	        ranks[order] = np.arange(m)
	        return pd.DataFrame({'p_value': p_values, 'reject': reject,
	                             'fdr_threshold': thresholds[ranks]})

	    def whites_reality_check(self, strategy_returns, benchmark_returns,
	                             n_boot=1000, alpha=0.05):
	        """Bootstrap test of a strategy's mean excess return over a benchmark.

	        Computes the t-statistic of ``strategy - benchmark`` and compares it
	        with the t-statistics of bootstrap resamples of the mean-centred
	        excess returns (the null of zero mean outperformance).

	        NOTE: inputs are flattened to one series, and observations are
	        resampled independently, so serial dependence is ignored. Selection
	        among several candidate strategies is not modelled by this method.

	        Args:
	            strategy_returns: Strategy return series.
	            benchmark_returns: Benchmark return series. If its length
	                differs it is recycled/truncated with ``np.resize``.
	            n_boot: Number of bootstrap resamples (>= 1).
	            alpha: Significance level for ``reject_null``.

	        Returns:
	            dict with ``t_stat``, ``p_value``, ``reject_null``,
	            ``n_bootstraps`` and ``percentile`` (percentile rank of the
	            observed statistic within the bootstrap distribution).

	        Raises:
	            ValueError: if ``n_boot < 1`` or fewer than two finite excess
	                returns are available.
	        """
	        if n_boot < 1:
	            raise ValueError("n_boot must be a positive integer")
	        s = np.asarray(strategy_returns, dtype=float).ravel()
	        b = np.asarray(benchmark_returns, dtype=float).ravel()
	        if s.size != b.size:
	            b = np.resize(b, s.size)
	        excess = s - b
	        # NaN/inf would make t_obs NaN; every comparison would then be False,
	        # yielding p_value == 0 and a spurious rejection.
	        excess = excess[np.isfinite(excess)]
	        n = excess.size
	        if n < 2:
	            raise ValueError(
	                "need at least two finite excess returns for the bootstrap test")
	        root_n = np.sqrt(n)

	        def t_stat(x):
	            # The small constant avoids division by zero for constant series.
	            return x.mean() / (x.std() + 1e-10) * root_n

	        t_obs = t_stat(excess)
	        # Centre the data so resamples are drawn under the null hypothesis.
	        centered = excess - excess.mean()
	        t_boot = np.array([
	            t_stat(np.random.choice(centered, size=n, replace=True))
	            for _ in range(n_boot)
	        ])
	        p_value = (t_boot >= t_obs).mean()
	        return {'t_stat': float(t_obs), 'p_value': float(p_value),
	                'reject_null': bool(p_value < alpha),
	                'n_bootstraps': n_boot,
	                'percentile': float(stats.percentileofscore(t_boot, t_obs))}

	    def familywise_error_report(self, sharpe_ratios, n_strategies=None,
	                                alpha=0.05, n_obs=252):
	        """Build a markdown report of multiple-testing corrections.

	        Each Sharpe ratio is converted to a two-sided normal p-value via
	        ``|sharpe| * sqrt(n_obs)`` and then passed through Bonferroni, Holm
	        and Benjamini-Hochberg.

	        Args:
	            sharpe_ratios: Sequence of Sharpe ratios, one per strategy.
	            n_strategies: Number of strategies tested. Defaults to the
	                larger of ``n_strategies_tested`` and ``len(sharpe_ratios)``.
	            alpha: Significance level.
	            n_obs: Scaling applied to the Sharpe ratio (``sqrt(n_obs)``);
	                presumably the number of return observations behind each
	                per-period Sharpe ratio - confirm against the caller.

	        Returns:
	            The report as a markdown string.

	        Raises:
	            ValueError: if ``n_strategies`` is given and is less than 1.
	        """
	        sharpes = list(sharpe_ratios)
	        if n_strategies is None:
	            n_strategies = max(self.n, len(sharpes), 1)
	        elif n_strategies < 1:
	            raise ValueError("n_strategies must be at least 1")
	        # sf() keeps precision in the far tail where 1 - cdf() rounds to 0.
	        z_scores = np.abs(np.asarray(sharpes, dtype=float)) * np.sqrt(n_obs)
	        p_values = (2 * stats.norm.sf(z_scores)).tolist()
	        bonf = self.bonferroni(p_values, alpha, n_tests=n_strategies)
	        holm_r = self.holm(p_values, alpha)
	        bh = self.benjamini_hochberg(p_values, alpha)
	        parts = [
	            "## Multiple Testing Correction Report\n",
	            "\n",
	            f"Strategies tested: {n_strategies} | "
	            f"Significance level: {alpha * 100:.0f}%\n",
	            "\n",
	            "| Strategy | Sharpe | p-value | Bonferroni | Holm | FDR (BH) |\n",
	            "|----------|--------|---------|------------|------|----------|\n",
	        ]
	        for i, s in enumerate(sharpes):
	            parts.append(
	                f"| #{i + 1} | {s:.2f} | {p_values[i]:.4f} "
	                f"| {self._mark(bonf['reject'].iloc[i])} "
	                f"| {self._mark(holm_r['reject'].iloc[i])} "
	                f"| {self._mark(bh['reject'].iloc[i])} |\n"
	            )
	        # Chance of at least one false positive across independent tests.
	        fwer_pct = (1 - (1 - alpha) ** n_strategies) * 100
	        parts.append(
	            f"\nBonferroni corrected threshold: {alpha / n_strategies:.6f}\n")
	        parts.append(
	            f"Significant strategies (FDR-controlled): "
	            f"{bh['reject'].sum()} of {n_strategies}\n")
	        parts.append(
	            f"\n⚠️ Data Snooping Warning: If you tested {n_strategies} "
	            f"strategies, the probability of at least one false positive at "
	            f"{alpha * 100:.0f}% is {fwer_pct:.1f}% uncorrected.")
	        return "".join(parts)

	if __name__ == '__main__':
	    # Seed NumPy's global generator so the demo prints the same report each run.
	    np.random.seed(42)
	    # Draw illustrative Sharpe ratios: a constant 0.05 offset plus N(0, 0.3)
	    # noise, all from the same distribution.
	    demo_count = 50
	    sharpe_offset = 0.05
	    noise_sd = 0.3
	    demo_sharpes = [sharpe_offset + np.random.normal(0, noise_sd)
	                    for _ in range(demo_count)]
	    snooping_guard = DataSnoopingGuard(n_strategies_tested=demo_count)
	    report_text = snooping_guard.familywise_error_report(demo_sharpes)
	    print(report_text)