Premchan369 committed on
Commit
05c5eeb
·
verified ·
1 Parent(s): aab4bbb

Add cross-asset statistical arbitrage: cointegration, pairs trading, PCA mean-reversion

Browse files
Files changed (1) hide show
  1. stat_arb.py +496 -0
stat_arb.py ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Cross-Asset Statistical Arbitrage
2
+
3
+ Jane Street and Two Sigma's bread and butter.
4
+ Not directional bets — finding RELATIVE mispricings between assets.
5
+
6
+ Strategies:
7
+ 1. Pairs Trading: Find cointegrated pairs, trade spread mean-reversion
8
+ 2. PCA Mean-Reversion: Extract risk factors, trade residuals
9
+ 3. Correlation Arbitrage: Options on baskets vs. baskets of options
10
+ 4. ETF Arbitrage: Price discrepancies between ETF and NAV
11
+ 5. Cross-Asset Momentum: Lead-lag effects (e.g., VIX → SPX)
12
+ """
13
+ import numpy as np
14
+ import pandas as pd
15
+ from typing import Dict, List, Tuple, Optional
16
+ from scipy import stats
17
+ from scipy.optimize import minimize
18
+ import warnings
19
+ warnings.filterwarnings('ignore')
20
+
21
+
22
def engle_granger_cointegration(x: np.ndarray, y: np.ndarray,
                                maxlag: int = 5) -> Dict:
    """
    Engle-Granger two-step cointegration test.

    H0: No cointegration (spread has a unit root = non-stationary)
    H1: Cointegration exists (spread is stationary = mean-reverting)

    Step 1 regresses ``y`` on ``x`` (with intercept) by OLS; the residual is
    the spread.  Step 2 runs ``adf_test`` on that spread.

    Because the spread is built from *estimated* coefficients, the unit-root
    statistic is compared against the Engle-Granger (MacKinnon) critical
    values for two variables with a constant, which are more negative than
    the plain Dickey-Fuller ones.  Using the plain values rejects H0 too often.

    Args:
        x: Price series of the explanatory asset.
        y: Price series of the dependent asset (same length as ``x``).
        maxlag: Lag order forwarded to ``adf_test``.

    Returns:
        Dict with keys ``cointegrated`` (bool, 5% level), ``adf_statistic``,
        ``pvalue`` (interpolated between the tabulated critical values and
        clamped to [0.01, 0.10]), ``critical_values``, ``alpha``, ``beta``
        (hedge ratio), ``spread_mean``, ``spread_std``, ``half_life`` and
        ``spread``.

    Raises:
        ValueError: If the inputs differ in length or have fewer than
            3 observations.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.shape != y.shape:
        raise ValueError(
            f"x and y must have the same length, got {x.size} and {y.size}"
        )
    if x.size < 3:
        raise ValueError("at least 3 observations are required")

    # Step 1: OLS regression y = alpha + beta * x + eps
    design = np.column_stack([np.ones(x.size), x])
    alpha, slope = np.linalg.lstsq(design, y, rcond=None)[0]

    # Residuals (the spread)
    spread = y - alpha - slope * x

    # Step 2: unit-root statistic of the spread
    adf_stat = adf_test(spread, maxlag=maxlag)[0]

    # Asymptotic MacKinnon critical values: 2 series, constant, no trend.
    critical_values = {'1%': -3.90, '5%': -3.34, '10%': -3.04}
    pvalue = float(np.interp(
        adf_stat,
        [critical_values['1%'], critical_values['5%'], critical_values['10%']],
        [0.01, 0.05, 0.10],
    ))
    is_cointegrated = bool(adf_stat < critical_values['5%'])

    # Half-life of mean reversion from d(spread) = theta * spread_lag + eps.
    spread_lag = spread[:-1]
    spread_diff = np.diff(spread)
    half_life = np.inf
    if len(spread_lag) > 1:
        # Same ddof in numerator and denominator so theta is the OLS slope.
        lag_var = np.var(spread_lag, ddof=1)
        if lag_var > 0:
            theta = np.cov(spread_diff, spread_lag)[0, 1] / lag_var
            if -1 < theta < 0:
                half_life = -np.log(2) / theta

    return {
        'cointegrated': is_cointegrated,
        'adf_statistic': adf_stat,
        'pvalue': pvalue,
        'critical_values': critical_values,
        'alpha': alpha,
        'beta': slope,
        'spread_mean': np.mean(spread),
        'spread_std': np.std(spread),
        'half_life': half_life,
        'spread': spread
    }
69
+
70
+
71
def adf_test(series: np.ndarray, maxlag: int = 5) -> Tuple:
    """Simplified Augmented Dickey-Fuller unit root test (constant, no trend).

    Fits ``dy_t = a + b * y_{t-1} + sum_j c_j * dy_{t-j} + e_t`` by OLS with
    a fixed number of lagged differences and returns the t-statistic of ``b``.

    Args:
        series: 1-D series to test.
        maxlag: Number of lagged differences to include.  It is reduced
            automatically when the series is too short to estimate them.

    Returns:
        Tuple ``(t_stat, pvalue, used_lag, n, critical_values)`` where ``n``
        is ``len(series)``.  ``pvalue`` is a coarse approximation obtained by
        interpolating between the tabulated 1% / 5% / 10% critical values and
        is clamped to the range [0.01, 0.10].  Degenerate input (fewer than
        4 points, or a regression with zero standard error such as a constant
        series) yields ``t_stat == 0.0`` and ``pvalue == 0.10``.
    """
    # Critical values (Dickey-Fuller distribution, asymptotic approximation)
    critical = {
        '1%': -3.43,
        '5%': -2.86,
        '10%': -2.57
    }

    y = np.asarray(series, dtype=float).ravel()
    n = len(y)
    if n < 4:
        # Not enough observations to fit intercept + slope with spare dof.
        return 0.0, 0.10, 0, n, critical

    dy = np.diff(y)

    # Keep strictly more rows than regressors: n - 1 - p > 2 + p.
    lags = max(0, min(int(maxlag), (n - 4) // 2))

    target = dy[lags:]
    columns = [np.ones(len(target)), y[lags:-1]]
    columns += [dy[lags - j:len(dy) - j] for j in range(1, lags + 1)]
    X = np.column_stack(columns)

    coef = np.linalg.lstsq(X, target, rcond=None)[0]
    residuals = target - X @ coef

    # Unbiased residual variance; pinv tolerates a rank-deficient design.
    dof = len(target) - X.shape[1]
    sigma2 = float(residuals @ residuals) / dof
    cov_coef = sigma2 * np.linalg.pinv(X.T @ X)
    se_beta = np.sqrt(max(cov_coef[1, 1], 0.0))

    if np.isfinite(se_beta) and se_beta > 0:
        t_stat = float(coef[1] / se_beta)
    else:
        t_stat = 0.0

    # P-value approximation: monotone in t_stat, below 0.05 exactly when the
    # statistic is more negative than the 5% critical value.
    pvalue = float(np.interp(
        t_stat,
        [critical['1%'], critical['5%'], critical['10%']],
        [0.01, 0.05, 0.10],
    ))

    return t_stat, pvalue, lags, n, critical
103
+
104
+
105
class PairsTradingStrategy:
    """
    Pairs trading on cointegrated assets.

    The spread is defined as ``prices2 - hedge_ratio * prices1``.

    Signal: Spread z-score (how many std devs from its rolling mean)
    Entry:  |z-score| > entry_z
    Exit:   z-score reverts inside +/- exit_z, the stop is hit, or the
            maximum holding period elapses

    Risk: Cointegration breaks (regime change) -> stop loss
    """

    def __init__(self,
                 lookback: int = 60,
                 entry_z: float = 2.0,
                 exit_z: float = 0.5,
                 stop_z: float = 3.5,
                 max_holding: int = 20):
        """Store the strategy parameters.

        Args:
            lookback: Window length for the rolling hedge ratio and z-score.
            entry_z: Absolute z-score above which a position is opened.
            exit_z: Z-score band around zero at which a position is closed.
            stop_z: Absolute z-score at which an open position is stopped out.
            max_holding: Maximum number of bars a position is held.
        """
        self.lookback = lookback
        self.entry_z = entry_z
        self.exit_z = exit_z
        self.stop_z = stop_z
        self.max_holding = max_holding

        self.positions = []  # Active trades
        self.trade_history = []

    def calculate_spread(self,
                         prices1: np.ndarray,
                         prices2: np.ndarray,
                         hedge_ratio: Optional[float] = None) -> np.ndarray:
        """Calculate the spread ``prices2 - hedge_ratio * prices1``.

        Args:
            prices1: Price series of asset 1.
            prices2: Price series of asset 2.
            hedge_ratio: Scalar or per-bar array.  When ``None`` a rolling OLS
                slope of ``prices2`` on ``prices1`` over the previous
                ``lookback`` bars (current bar excluded) is used; bars before
                the first full window use a ratio of 1.

        Returns:
            The spread as a NumPy array.
        """
        prices1 = np.asarray(prices1, dtype=float)
        prices2 = np.asarray(prices2, dtype=float)

        if hedge_ratio is None:
            # Rolling hedge ratio
            hedge_ratio = np.ones(len(prices1))
            for i in range(self.lookback, len(prices1)):
                x_window = prices1[i - self.lookback:i]
                y_window = prices2[i - self.lookback:i]
                # ddof=1 matches np.cov so the ratio is the true OLS slope.
                var_x = np.var(x_window, ddof=1) if len(x_window) > 1 else 0.0
                if var_x > 0:
                    hedge_ratio[i] = np.cov(y_window, x_window)[0, 1] / var_x
                else:
                    hedge_ratio[i] = hedge_ratio[i - 1]

        return prices2 - hedge_ratio * prices1

    def generate_signals(self,
                         spread: np.ndarray,
                         spread_mean: Optional[float] = None,
                         spread_std: Optional[float] = None) -> pd.DataFrame:
        """Generate entry/exit signals from spread z-scores.

        Args:
            spread: Spread series.
            spread_mean: Mean used for the z-score; rolling mean over
                ``lookback`` when ``None``.
            spread_std: Standard deviation used for the z-score; rolling
                sample std over ``lookback`` when ``None``.

        Returns:
            DataFrame with columns ``spread``, ``zscore``, ``spread_mean``,
            ``spread_std`` and the boolean flags ``long_spread``,
            ``short_spread``, ``exit_long``, ``exit_short``, ``stop_loss``.
            Bars with an undefined (NaN) z-score have every flag False.
        """
        spread = np.asarray(spread, dtype=float)
        rolling = pd.Series(spread).rolling(self.lookback)
        if spread_mean is None:
            spread_mean = rolling.mean().values
        if spread_std is None:
            spread_std = rolling.std().values

        # Small epsilon avoids division by zero on a flat spread.
        zscore = (spread - spread_mean) / (spread_std + 1e-10)

        signals = pd.DataFrame({
            'spread': spread,
            'zscore': zscore,
            'spread_mean': spread_mean,
            'spread_std': spread_std
        })

        signals['long_spread'] = zscore < -self.entry_z   # Spread cheap -> long spread
        signals['short_spread'] = zscore > self.entry_z   # Spread expensive -> short spread
        signals['exit_long'] = zscore > -self.exit_z      # Exit long
        signals['exit_short'] = zscore < self.exit_z      # Exit short
        signals['stop_loss'] = np.abs(zscore) > self.stop_z  # Stop loss

        return signals

    def backtest(self,
                 prices1: np.ndarray,
                 prices2: np.ndarray,
                 hedge_ratio: Optional[np.ndarray] = None,
                 transaction_cost: float = 0.001) -> pd.DataFrame:
        """
        Backtest pairs trading strategy.

        Position sizing is dollar-neutral with equal notional on both legs:
        - long spread  (+1): long asset 2, short asset 1
        - short spread (-1): short asset 2, long asset 1
        so a trade earns ``position * (return2 - return1)`` between entry and
        exit, minus ``2 * transaction_cost``.

        Args:
            prices1: Price series of asset 1.
            prices2: Price series of asset 2.
            hedge_ratio: Passed to ``calculate_spread``.
            transaction_cost: Proportional cost charged twice per round trip.

        Returns:
            DataFrame with one row per bar and columns ``position``
            (0 flat, 1 long spread, -1 short spread), ``zscore``, ``spread``
            and ``trade_pnl`` (realised PnL on the bar where a trade is
            closed, NaN elsewhere).
        """
        prices1 = np.asarray(prices1, dtype=float)
        prices2 = np.asarray(prices2, dtype=float)

        spread = self.calculate_spread(prices1, prices2, hedge_ratio)
        signals = self.generate_signals(spread)

        long_entry = signals['long_spread'].to_numpy()
        short_entry = signals['short_spread'].to_numpy()
        exit_long = signals['exit_long'].to_numpy()
        exit_short = signals['exit_short'].to_numpy()
        stop_loss = signals['stop_loss'].to_numpy()

        n_bars = len(signals)
        positions = np.zeros(n_bars, dtype=int)
        trade_pnl = np.full(n_bars, np.nan)

        position = 0  # 0 = flat, 1 = long spread, -1 = short spread
        entry_price1 = 0.0
        entry_price2 = 0.0
        holding_days = 0

        for i in range(n_bars):
            timed_out = holding_days >= self.max_holding
            close_long = position == 1 and (exit_long[i] or stop_loss[i] or timed_out)
            close_short = position == -1 and (exit_short[i] or stop_loss[i] or timed_out)

            if close_long or close_short:
                ret1 = prices1[i] / entry_price1 - 1.0
                ret2 = prices2[i] / entry_price2 - 1.0
                # Long spread profits when asset 2 outperforms asset 1.
                trade_pnl[i] = position * (ret2 - ret1) - 2 * transaction_cost
                position = 0
                holding_days = 0
            elif position == 0:
                # Entries are only considered while flat; a bar that closes a
                # trade never opens a new one.
                if long_entry[i]:
                    position = 1
                elif short_entry[i]:
                    position = -1
                if position != 0:
                    entry_price1 = prices1[i]
                    entry_price2 = prices2[i]
                    holding_days = 0

            if position != 0:
                holding_days += 1

            positions[i] = position

        return pd.DataFrame({
            'position': positions,
            'zscore': signals['zscore'].to_numpy(),
            'spread': spread,
            'trade_pnl': trade_pnl
        })
256
+
257
+
258
class PCAMeanReversion:
    """
    PCA-based mean-reversion strategy.

    Insight: Extract principal components (market factors).
    Residuals = asset return minus its projection on the top factors.
    Trade residuals: long underperformers, short outperformers.

    Columns are matched by position: data passed to ``transform`` and the
    other methods must have the same columns, in the same order, as the data
    passed to ``fit``.
    """

    def __init__(self, n_factors: int = 5):
        """Create an unfitted model that keeps the top ``n_factors`` factors."""
        self.n_factors = n_factors
        self.eigenvectors = None   # (n_assets, n_assets), columns sorted by variance
        self.eigenvalues = None    # descending
        self.mean_returns = None   # per-asset mean used for centering

    def fit(self, returns: pd.DataFrame):
        """Fit PCA on a (time x asset) return matrix.

        Uses the eigen-decomposition of the population covariance matrix of
        the demeaned returns.

        Args:
            returns: Return matrix, one column per asset.

        Returns:
            ``self`` to allow chaining.

        Raises:
            ValueError: If ``returns`` has no rows.
        """
        if len(returns) == 0:
            raise ValueError("cannot fit PCA on an empty return matrix")

        # Demean
        self.mean_returns = returns.mean()
        centered = (returns - self.mean_returns).to_numpy(dtype=float)

        cov = centered.T @ centered / len(centered)
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # eigh returns ascending eigenvalues; sort descending
        order = np.argsort(eigenvalues)[::-1]
        self.eigenvalues = eigenvalues[order]
        self.eigenvectors = eigenvectors[:, order]

        return self

    def _top_loadings(self) -> np.ndarray:
        """Return the loadings of the retained factors (at most n_assets)."""
        if self.eigenvectors is None:
            raise RuntimeError("PCAMeanReversion must be fitted before use")
        n_keep = max(0, min(self.n_factors, self.eigenvectors.shape[1]))
        return self.eigenvectors[:, :n_keep]

    def transform(self, returns: pd.DataFrame) -> pd.DataFrame:
        """Return the idiosyncratic residuals of ``returns``.

        The residual is the demeaned return minus its projection onto the
        retained factors.  The computation is done on plain arrays and the
        result is re-labelled with the input's index and columns; mixing
        label-aligned pandas arithmetic with positional matrix products
        would otherwise misalign named columns and yield all-NaN output.

        Args:
            returns: Return matrix with the same columns as used in ``fit``.

        Returns:
            DataFrame of residuals with the same shape and labels as
            ``returns``.

        Raises:
            ValueError: If the number of columns differs from the fitted data.
        """
        loadings = self._top_loadings()
        if returns.shape[1] != loadings.shape[0]:
            raise ValueError(
                f"expected {loadings.shape[0]} asset columns, got {returns.shape[1]}"
            )

        centered = returns.to_numpy(dtype=float) - self.mean_returns.to_numpy(dtype=float)

        # Factor realisations (what the market is doing), then the part of
        # each asset's return that those factors explain.
        factors = centered @ loadings
        explained = factors @ loadings.T

        return pd.DataFrame(centered - explained,
                            index=returns.index,
                            columns=returns.columns)

    def get_factor_exposures(self, returns: pd.DataFrame) -> pd.DataFrame:
        """Get each asset's exposure (loading) to each retained factor.

        Args:
            returns: Only its column labels are used, as the row index.

        Returns:
            DataFrame indexed by asset with columns ``factor_1`` ...
        """
        loadings = self._top_loadings()
        return pd.DataFrame(
            loadings,
            index=returns.columns,
            columns=[f'factor_{i+1}' for i in range(loadings.shape[1])]
        )

    def generate_residual_signals(self,
                                  returns: pd.DataFrame,
                                  lookback: int = 20,
                                  entry_z: float = 2.0) -> pd.DataFrame:
        """
        Generate mean-reversion signals on residuals.

        The latest rolling z-score of each asset's residual is ranked across
        assets.  The lowest-ranked 20% (most negative, biggest
        underperformance) are flagged long, the highest-ranked 20% short.

        Args:
            returns: Return matrix with the same columns as used in ``fit``.
            lookback: Rolling window for the residual z-score.
            entry_z: Accepted for interface compatibility; selection is
                purely rank-based and this threshold is not applied.

        Returns:
            DataFrame indexed by asset with columns ``zscore``, ``signal``
            (1 long, -1 short, 0 none) and ``rank``.
        """
        residuals = self.transform(returns)

        # Z-score of residuals
        window = residuals.rolling(lookback)
        zscores = (residuals - window.mean()) / (window.std() + 1e-10)

        if len(zscores) > 0:
            latest_z = zscores.iloc[-1]
        else:
            latest_z = pd.Series(0, index=returns.columns)

        signals = pd.DataFrame({
            'zscore': latest_z,
            'signal': 0
        })

        # Rank-based signals; assets with an undefined z-score get no rank
        # and are excluded from the bucket sizes.
        signals['rank'] = signals['zscore'].rank()
        n_ranked = int(signals['rank'].notna().sum())

        # Bottom 20%: long (expect mean reversion up)
        signals.loc[signals['rank'] <= n_ranked * 0.2, 'signal'] = 1
        # Top 20%: short (expect mean reversion down); strict inequality
        # keeps the short bucket the same size as the long bucket.
        signals.loc[signals['rank'] > n_ranked * 0.8, 'signal'] = -1

        return signals
352
+
353
+
354
class LeadLagDetector:
    """
    Detect lead-lag relationships between assets.

    Example: VIX futures lead SPX. Commodity futures lead ETFs.
    Uses the cross-correlation of two series at different lags.
    """

    def __init__(self, max_lag: int = 10):
        """Scan lags from ``-max_lag`` to ``+max_lag`` inclusive."""
        self.max_lag = max_lag

    @staticmethod
    def _pearson(a: np.ndarray, b: np.ndarray) -> float:
        """Pearson correlation, or NaN when it is undefined.

        Undefined means fewer than two overlapping points or a constant
        input; returning NaN explicitly avoids NumPy runtime warnings.
        """
        if len(a) < 2 or np.std(a) == 0 or np.std(b) == 0:
            return float('nan')
        return float(np.corrcoef(a, b)[0, 1])

    def cross_correlation(self, x: np.ndarray, y: np.ndarray) -> Dict:
        """
        Compute cross-correlation at different lags.

        Lag ``+k`` correlates ``x[t]`` with ``y[t + k]``: a high value means
        x leads y by k periods.  Lag ``-k`` correlates ``x[t + k]`` with
        ``y[t]``: a high value means y leads x by k periods.

        Args:
            x: First series.
            y: Second series, same length as ``x``.

        Returns:
            Dict with ``correlations`` (lag -> correlation, NaN where the
            correlation is undefined), ``best_lag`` (lag with the largest
            absolute defined correlation, 0 if none is defined),
            ``best_correlation`` and ``leader`` ('x', 'y' or 'none').

        Raises:
            ValueError: If ``x`` and ``y`` differ in length.
        """
        x = np.asarray(x, dtype=float).ravel()
        y = np.asarray(y, dtype=float).ravel()
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )

        correlations = {}
        for lag in range(-self.max_lag, self.max_lag + 1):
            if lag > 0:
                # x leads y
                left, right = x[:-lag], y[lag:]
            elif lag < 0:
                # y leads x
                left, right = x[-lag:], y[:lag]
            else:
                left, right = x, y
            correlations[lag] = self._pearson(left, right)

        # NaN never compares as larger, so it must be excluded before max()
        # or the result would depend on dictionary order.
        defined = {lag: c for lag, c in correlations.items() if not np.isnan(c)}
        if defined:
            best_lag = max(defined, key=lambda k: abs(defined[k]))
            best_corr = defined[best_lag]
        else:
            best_lag, best_corr = 0, float('nan')

        if best_lag > 0:
            leader = 'x'
        elif best_lag < 0:
            leader = 'y'
        else:
            leader = 'none'

        return {
            'correlations': correlations,
            'best_lag': best_lag,
            'best_correlation': best_corr,
            'leader': leader
        }

    def find_all_lead_lag(self, returns_df: pd.DataFrame) -> pd.DataFrame:
        """Find lead-lag relationships across all unordered asset pairs.

        Args:
            returns_df: One column per asset.

        Returns:
            DataFrame with columns ``asset1``, ``asset2``, ``best_lag``,
            ``best_correlation`` and ``leader`` ('x' refers to asset1, 'y' to
            asset2), sorted by absolute best correlation, strongest first.
            Empty (with those columns) when fewer than two assets are given.
        """
        assets = list(returns_df.columns)
        rows = []

        for i, a1 in enumerate(assets):
            for a2 in assets[i + 1:]:
                result = self.cross_correlation(
                    returns_df[a1].values,
                    returns_df[a2].values
                )
                rows.append({
                    'asset1': a1,
                    'asset2': a2,
                    'best_lag': result['best_lag'],
                    'best_correlation': result['best_correlation'],
                    'leader': result['leader']
                })

        table = pd.DataFrame(
            rows,
            columns=['asset1', 'asset2', 'best_lag', 'best_correlation', 'leader']
        )
        if table.empty:
            return table
        return table.sort_values('best_correlation', key=abs, ascending=False)
425
+
426
+
427
# Demo: exercises each component on synthetic data when run as a script.
if __name__ == '__main__':
    print("=" * 70)
    print(" STATISTICAL ARBITRAGE ENGINE")
    print("=" * 70)

    np.random.seed(42)

    # Generate cointegrated pair: both assets load on one shared random walk
    # and carry only STATIONARY idiosyncratic noise.  (Random-walk noise on
    # each leg would make the spread non-stationary, i.e. not cointegrated.)
    n = 500
    factor_shocks = np.random.randn(n) * 0.01
    common_factor = np.cumsum(factor_shocks)

    # Asset 1: 50% common factor + noise
    prices1 = 100 + 0.5 * common_factor + np.random.randn(n) * 0.005
    # Asset 2: 70% common factor + noise
    prices2 = 100 + 0.7 * common_factor + np.random.randn(n) * 0.005

    # Cointegration test
    print("\n1. COINTEGRATION TEST")
    result = engle_granger_cointegration(prices1, prices2)
    print(f" Cointegrated: {result['cointegrated']}")
    print(f" ADF Statistic: {result['adf_statistic']:.3f}")
    print(f" P-value: {result['pvalue']:.3f}")
    print(f" Half-life: {result['half_life']:.1f} periods")
    print(f" Hedge ratio: {result['beta']:.3f}")

    # Pairs trading
    print("\n2. PAIRS TRADING BACKTEST")
    strategy = PairsTradingStrategy(lookback=60, entry_z=2.0, exit_z=0.5)
    results = strategy.backtest(prices1, prices2, transaction_cost=0.001)

    # A trade is counted once, on the bar where the position leaves flat.
    in_market = results['position'] != 0
    was_flat = ~in_market.shift(1, fill_value=False)
    trades = results[in_market & was_flat]
    print(f" Number of trades: {len(trades)}")

    # PCA Mean-Reversion
    print("\n3. PCA MEAN-REVERSION")
    n_assets = 10
    returns = pd.DataFrame(
        np.random.randn(n, n_assets) * 0.02 + 0.0001,
        columns=[f'ASSET_{i}' for i in range(n_assets)]
    )

    # Add the common factor's per-period move to some assets (length n, so
    # it lines up with the n rows of `returns`).
    for i in [0, 1, 2, 3]:
        returns.iloc[:, i] += factor_shocks

    pca = PCAMeanReversion(n_factors=3)
    pca.fit(returns)

    print(f" Explained variance by top 3 factors: {pca.eigenvalues[:3].sum() / pca.eigenvalues.sum() * 100:.1f}%")

    signals = pca.generate_residual_signals(returns)
    print(f" Long signals: {(signals['signal'] == 1).sum()}")
    print(f" Short signals: {(signals['signal'] == -1).sum()}")

    # Lead-lag
    print("\n4. LEAD-LAG DETECTION")
    # VIX-like and SPX-like series of equal length: the SPX-like return at
    # time t reacts to the VIX-like value at t-1, so VIX leads by one period.
    vix_like = np.abs(np.random.randn(n) * 0.02)
    spx_like = np.random.randn(n) * 0.01
    spx_like[1:] += -0.3 * vix_like[:-1]

    detector = LeadLagDetector(max_lag=5)
    ll = detector.cross_correlation(vix_like, spx_like)
    print(f" Best lag: {ll['best_lag']} (positive = VIX leads SPX)")
    print(f" Best correlation: {ll['best_correlation']:.3f}")
    print(f" Leader: {ll['leader']}")

    print(f"\n This is what Two Sigma and Jane Street do ALL DAY:")
    print(f" Find mispricings between RELATED assets, not bet on direction")
    print(f" Market-neutral = zero beta exposure")
    print(f" Pure alpha from statistical relationships")