JayLacoma committed on
Commit
95833cf
·
verified ·
1 Parent(s): 1cd84e4

Update feature_engineering.py

Browse files
Files changed (1) hide show
  1. feature_engineering.py +178 -419
feature_engineering.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Integrated Market Theory - Feature Engineering Pipeline
3
- Combines all tickers from geo_macro.py into unified theory indicators
4
 
5
  Usage:
6
  python feature_engineering.py --input unified_market_data.csv --output enhanced_features.csv
@@ -10,479 +10,238 @@ import pandas as pd
10
  import numpy as np
11
  from sklearn.decomposition import PCA
12
  from sklearn.preprocessing import StandardScaler
13
- import warnings
14
- warnings.filterwarnings('ignore')
15
-
16
 
17
  def safe_zscore(series, window=252, min_obs=30):
18
- """Rolling z-score with fallback to 0 for unstable windows"""
19
  mean = series.rolling(window, min_periods=min_obs).mean()
20
  std = series.rolling(window, min_periods=min_obs).std()
21
  z = (series - mean) / std
22
  return z.fillna(0).clip(-3, 3)
23
 
 
 
 
 
24
 
25
  class IntegratedTheoryFeatures:
26
- """
27
- Transforms raw market data into theory-driven features combining:
28
- - Dalio's 5 Forces
29
- - Stevenson's Inequality Metrics
30
- - Thiel's Monopoly Indicators
31
- - Gundlach's Reckoning Signals
32
- """
33
-
34
  def __init__(self, df):
35
- # Validate critical columns
36
  required = {'SP500', 'DGS10', 'Gold', 'VIX', 'UNRATE', 'CPIAUCSL'}
37
  missing = required - set(df.columns)
38
  if missing:
39
  raise ValueError(f"Critical data missing: {missing}")
40
-
41
  self.df = df.copy()
42
  self.features = pd.DataFrame(index=df.index)
43
-
44
- def calculate_returns_volatility(self, windows=[21, 63, 252]):
45
- """Calculate multi-timeframe returns and volatility for all tickers"""
46
- print("Calculating returns and volatility...")
47
-
48
- for col in self.df.columns:
49
- for window in windows:
50
- # Returns
51
- self.df[f'{col}_ret{window}'] = self.df[col].pct_change(window)
52
- # Volatility
53
- self.df[f'{col}_vol{window}'] = self.df[col].pct_change().rolling(window).std()
54
- # Momentum
55
- self.df[f'{col}_mom{window}'] = (
56
- self.df[col].pct_change(window) -
57
- self.df[col].pct_change(window).shift(window)
58
- )
59
- return self
60
-
61
  def dalio_forces(self):
62
- """Ray Dalio's 5 Forces Composite Indicators"""
63
- print("Building Dalio's 5 Forces...")
64
-
65
- # Force 1: Debt/Economic Cycle
66
  yield_curve = self.df.get('DGS10', 0) - self.df.get('DGS2', 0)
67
  inflation_mom = self.df.get('CPIAUCSL', pd.Series(0)).pct_change(12) * 100
68
  hy_spread = self.df.get('BAMLH0A0HYM2', pd.Series(0)) / 100
69
-
70
- self.features['dalio_debt_cycle'] = (
71
- yield_curve * 0.3 +
72
- inflation_mom * 0.4 +
73
- hy_spread * 0.3
74
- )
75
-
76
- # Force 2: Internal Conflict
77
- consumer_weakness = (self.df.get('Consumer_Discretionary', 0) /
78
- self.df.get('Consumer_Staples', 1)).pct_change(63) * -1
79
  unemployment_stress = self.df.get('UNRATE', pd.Series(0)).diff() * 2
80
- small_large_gap = (self.df.get('Small_Cap_Value', 0) /
81
- self.df.get('SP500', 1)).pct_change(63) * -1
82
-
83
- self.features['dalio_internal_conflict'] = (
84
- consumer_weakness * 0.4 +
85
- unemployment_stress * 0.3 +
86
- small_large_gap * 0.3
87
- )
88
-
89
- # Force 3: External Conflict
90
  defense_momentum = self.df.get('Defense_Stocks', pd.Series(0)).pct_change(21)
91
- dollar_anomaly = self._calculate_dollar_anomaly()
92
- china_taiwan_tension = self._calculate_asia_tension()
93
-
94
- self.features['dalio_external_conflict'] = (
95
- defense_momentum * 0.4 +
96
- dollar_anomaly * 0.3 +
97
- china_taiwan_tension * 0.3
98
- )
99
-
100
- # Force 4: Acts of Nature
101
  water_stress = self.df.get('Water', pd.Series(0)).pct_change(63)
102
- ag_volatility = self.df.get('Agricultural', pd.Series(0)).pct_change().rolling(63).std() * 100
103
-
104
- self.features['dalio_nature_force'] = (
105
- water_stress * 0.6 +
106
- ag_volatility * 0.4
107
- )
108
-
109
- # Force 5: Technology/Inventiveness
110
- tech_outperform = (self.df.get('Technology', 0) /
111
- self.df.get('SP500', 1)).pct_change(21)
112
- cloud_momentum = self.df.get('Cloud_Computing', pd.Series(0)).pct_change(63)
113
- ai_momentum = self.df.get('Robotics_AI', pd.Series(0)).pct_change(63)
114
-
115
- self.features['dalio_tech_force'] = (
116
- tech_outperform * 0.4 +
117
- cloud_momentum * 0.3 +
118
- ai_momentum * 0.3
119
- )
120
-
121
- # Master Composite
122
- dalio_components = [
123
- self.features['dalio_debt_cycle'] * 0.35,
124
- self.features['dalio_internal_conflict'] * 0.25,
125
- self.features['dalio_external_conflict'] * 0.20,
126
- self.features['dalio_tech_force'] * 0.15,
127
  self.features['dalio_nature_force'] * 0.05
128
- ]
129
-
130
- self.features['dalio_composite'] = pd.concat(dalio_components, axis=1).sum(axis=1)
131
- self.features['dalio_composite_norm'] = self._normalize(self.features['dalio_composite'])
132
  return self
133
-
134
  def stevenson_inequality(self):
135
- """Gary Stevenson's Inequality Amplification Metrics"""
136
- print("Building Stevenson's inequality indicators...")
137
-
138
- asset_rich = (self.df.get('Gold', 0) +
139
- self.df.get('Real_Estate', 0) +
140
- self.df.get('Growth_Stocks', 0)) / 3
141
- middle_class = (self.df.get('Consumer_Staples', 0) +
142
- self.df.get('Regional_Banks', 0) +
143
- self.df.get('Small_Cap_Value', 0)) / 3
144
-
145
- self.features['inequality_wealth_flow'] = (
146
- asset_rich.pct_change(63) - middle_class.pct_change(63)
147
- )
148
-
149
  luxury = self.df.get('Retail_Luxury', pd.Series(0)).pct_change(21)
150
- mass = (self.df.get('Restaurants', 0) + self.df.get('Retail', 0)) / 2
151
- mass = mass.pct_change(21)
152
- self.features['inequality_consumption_gap'] = luxury - mass
153
-
154
- quality_credit = (self.df.get('Investment_Grade_Spread', 0) +
155
- self.df.get('Preferred_Stock', 0)) / 2
156
- junk_credit = (self.df.get('HYG', 0) +
157
- self.df.get('JNK', 0) +
158
- self.df.get('Emerging_Market_Debt', 0)) / 3
159
- self.features['inequality_credit_access'] = (
160
- quality_credit.pct_change(63) - junk_credit.pct_change(63)
161
- )
162
-
163
- self.features['stevenson_inequality'] = (
164
- self.features['inequality_wealth_flow'] * 0.4 +
165
- self.features['inequality_consumption_gap'] * 0.3 +
166
- self.features['inequality_credit_access'] * 0.3
167
  )
168
- self.features['stevenson_inequality_norm'] = self._normalize(self.features['stevenson_inequality'])
169
-
170
- asset_inflation = (self.df.get('Gold', 0) + self.df.get('Real_Estate', 0)).pct_change(21)
171
- wage_proxy = self.df.get('Staffing', pd.Series(0)).pct_change(21)
172
- self.features['inequality_transmission'] = asset_inflation - wage_proxy
173
-
174
  return self
175
-
176
  def thiel_monopoly(self):
177
- """Peter Thiel's Monopoly vs Competition Indicators"""
178
- print("Building Thiel's monopoly indicators...")
179
-
180
- tech_strength = self.df.get('Technology', 0)
181
- finance_strength = self.df.get('Financials', 1)
182
- self.features['monopoly_cash_moat'] = (
183
- tech_strength.pct_change(63) - finance_strength.pct_change(63)
184
- )
185
-
186
- network_sectors = (self.df.get('Cloud_Computing', 0) * 0.4 +
187
- self.df.get('Communication_Services', 0) * 0.3 +
188
- self.df.get('Fintech', 0) * 0.3)
189
- self.features['monopoly_network_effects'] = network_sectors.pct_change(63)
190
-
191
- tech_volatility = self.df.get('Technology', pd.Series(1)).pct_change().rolling(63).std()
192
- chip_strength = self.df.get('Semiconductors', pd.Series(0)).pct_change(63)
193
- self.features['monopoly_defensibility'] = (
194
- (1 / (tech_volatility + 0.001)) * 0.01 +
195
- chip_strength * 0.5
196
- )
197
-
198
- self.features['thiel_monopoly'] = (
199
- self.features['monopoly_cash_moat'] * 0.35 +
200
- self.features['monopoly_network_effects'] * 0.35 +
201
- self.features['monopoly_defensibility'] * 0.30
202
  )
203
- self.features['thiel_monopoly_norm'] = self._normalize(self.features['thiel_monopoly'])
204
-
205
- tech_return = self.df.get('Technology', pd.Series(0)).pct_change(21)
206
- rate_change = self.df.get('DGS10', pd.Series(0)).diff() * -1
207
- self.features['monopoly_immunity'] = tech_return / (rate_change.abs() + 0.001)
208
-
209
- specialized = (self.df.get('Semiconductors', 0) +
210
- self.df.get('Cloud_Computing', 0) +
211
- self.df.get('Robotics_AI', 0)) / 3
212
- broad_tech = self.df.get('Technology', 1)
213
- self.features['tech_concentration'] = specialized / broad_tech
214
-
215
  return self
216
-
217
  def gundlach_reckoning(self):
218
- """Jeffrey Gundlach's Debt Reckoning and Paradigm Shift Signals"""
219
- print("Building Gundlach's reckoning indicators...")
220
-
221
- fed_proxy = self.df.get('DGS3MO', pd.Series(0))
222
- long_yield = self.df.get('DGS10', pd.Series(0))
223
- fed_cutting = fed_proxy.diff() < -0.05
224
- yield_rising = long_yield.diff() > 0
225
- self.features['gundlach_yield_anomaly'] = (
226
- (fed_cutting & yield_rising).astype(float) +
227
- (long_yield - fed_proxy)
228
- )
229
-
230
- gold_return = self.df.get('Gold', pd.Series(0)).pct_change(21)
231
- treasury_return = self.df.get('US_Treasuries_Long', pd.Series(1)).pct_change(21)
232
- self.features['gundlach_flight_shift'] = gold_return / (treasury_return + 0.001)
233
-
234
- dollar_weak = self.df.get('DXY', pd.Series(0)).pct_change(21) * -1
235
- em_outperform = (self.df.get('Emerging_Markets', 0) + self.df.get('Europe', 0)) / 2
236
- em_outperform = em_outperform.pct_change(21)
237
- sp_return = self.df.get('SP500', pd.Series(0)).pct_change(21)
238
- self.features['gundlach_capital_reversal'] = (
239
- dollar_weak * 0.5 +
240
- (em_outperform - sp_return) * 0.5
241
- )
242
-
243
- regional_stress = (self.df.get('Regional_Banks', 0) /
244
- self.df.get('Financials', 1)).pct_change(21)
245
- mortgage_reit_stress = self.df.get('Mortgage_REITs', pd.Series(0)).pct_change(21)
246
  real_estate_vol = self.df.get('Real_Estate', pd.Series(1)).pct_change().rolling(21).std() * 100
247
- self.features['gundlach_private_credit_risk'] = (
248
- regional_stress * -0.4 +
249
- mortgage_reit_stress * -0.3 +
250
- real_estate_vol * 0.3
251
- )
252
-
253
- self.features['gundlach_reckoning'] = (
254
- self.features['gundlach_yield_anomaly'] * 0.30 +
255
- self.features['gundlach_flight_shift'] * 0.25 +
256
- self.features['gundlach_capital_reversal'] * 0.25 +
257
- self.features['gundlach_private_credit_risk'] * 0.20
258
- )
259
- self.features['gundlach_reckoning_norm'] = self._normalize(self.features['gundlach_reckoning'])
260
  return self
261
-
262
  def geopolitical_indicators(self):
263
- """Regional conflict and energy transition signals"""
264
- print("Building geopolitical indicators...")
265
-
266
- oil_volatility = self.df.get('Oil', pd.Series(1)).pct_change().rolling(3).std() * 100
267
- defense_spike = self.df.get('Defense_Stocks', pd.Series(0)).pct_change(5)
268
  gold_haven = self.df.get('Gold_Safe_Haven', pd.Series(0)).pct_change(5)
269
- self.features['middle_east_risk'] = (
270
- oil_volatility * 0.4 +
271
- defense_spike * 0.3 +
272
- gold_haven * 0.3
273
- )
274
-
275
- gas_volatility = self.df.get('NaturalGas', pd.Series(1)).pct_change().rolling(5).std() * 100
276
- europe_decline = self.df.get('Europe', pd.Series(0)).pct_change(21) * -1
277
- swiss_franc_strength = self.df.get('Swiss_Franc', pd.Series(0)).pct_change(21) * -1
278
- self.features['europe_risk'] = (
279
- gas_volatility * 0.5 +
280
- europe_decline * 0.3 +
281
- swiss_franc_strength * 0.2
282
- )
283
-
284
  chip_stress = self.df.get('Semiconductors', pd.Series(1)).pct_change().rolling(21).std() * 100
285
- taiwan_korea = (self.df.get('Taiwan', 0) + self.df.get('South_Korea', 0)) / 2
286
- china_diverge = taiwan_korea.pct_change(21) - self.df.get('China', pd.Series(0)).pct_change(21)
287
  rare_earth = self.df.get('Rare_Earth', pd.Series(0)).pct_change(21)
288
- self.features['asia_risk'] = (
289
- chip_stress * 0.4 +
290
- china_diverge * 0.3 +
291
- rare_earth * 0.3
292
- )
293
-
294
- self.features['geopolitical_risk'] = (
295
- self.features['middle_east_risk'] * 0.4 +
296
- self.features['europe_risk'] * 0.3 +
297
- self.features['asia_risk'] * 0.3
298
- )
299
- self.features['geopolitical_risk_norm'] = self._normalize(self.features['geopolitical_risk'])
300
-
301
- uranium_momentum = self.df.get('Uranium', pd.Series(0)).pct_change(63)
302
- clean_momentum = self.df.get('Clean_Energy', pd.Series(0)).pct_change(63)
303
- oil_decline = self.df.get('Oil', pd.Series(0)).pct_change(252) * -1
304
- self.features['energy_transition'] = (
305
- uranium_momentum * 0.5 +
306
- clean_momentum * 0.3 +
307
- oil_decline * 0.2
308
  )
309
  return self
310
-
311
- def cross_asset_features(self):
312
- """Advanced cross-asset relationships"""
313
- print("Building cross-asset features...")
314
-
315
- defensive = (self.df.get('Gold', 0) +
316
- self.df.get('Utilities', 0) +
317
- self.df.get('Healthcare', 0)) / 3
318
- risk_on = (self.df.get('Technology', 0) +
319
- self.df.get('Consumer_Discretionary', 0) +
320
- self.df.get('Real_Estate', 0)) / 3
321
- self.features['flight_ratio'] = defensive / (risk_on + 0.001)
322
-
323
- regional_vs_broad = (self.df.get('Regional_Banks', 0) -
324
- self.df.get('Financials', 0))
325
- mortgage_vs_reit = (self.df.get('Mortgage_REITs', 0) -
326
- self.df.get('REITs', 0))
327
- em_vs_ig = (self.df.get('Emerging_Market_Debt', 0) -
328
- self.df.get('Investment_Grade_Spread', 0))
329
- self.features['credit_contagion'] = (
330
- regional_vs_broad.pct_change(21) +
331
- mortgage_vs_reit.pct_change(21) +
332
- em_vs_ig.pct_change(21)
333
- ) / 3
334
-
335
- vix = self.df.get('VIX', pd.Series(20))
336
- vix_historical_avg = vix.rolling(252).mean()
337
- geo_max = self.features[['middle_east_risk', 'europe_risk', 'asia_risk']].max(axis=1)
338
- self.features['geo_amplification'] = geo_max * (vix / vix_historical_avg)
339
- return self
340
-
341
  def scenario_probabilities(self):
342
- """Dynamic probability weights for future scenarios"""
343
- print("Calculating scenario probabilities...")
344
-
345
- # Scenario 1: Credit Collapse
346
- self.features['prob_credit_collapse'] = (
347
- self.features['gundlach_reckoning_norm'] * 0.4 +
348
- safe_zscore(self.features['gundlach_private_credit_risk']) * 0.03 +
349
- safe_zscore(self.features['dalio_debt_cycle']) * 0.03
 
350
  )
351
- self.features['prob_credit_collapse'] = np.clip(self.features['prob_credit_collapse'], 0, 1)
352
-
353
- # Scenario 2: Stagflation
354
- inflation_high = (self.df.get('CPIAUCSL', pd.Series(0)).pct_change(12) * 100 > 2.5).astype(float)
355
- unemployment_rising = (self.df.get('UNRATE', pd.Series(0)).diff() > 0).astype(float)
356
- self.features['prob_stagflation'] = (
357
- (inflation_high * unemployment_rising) * 0.3 +
358
- safe_zscore(self.features['dalio_external_conflict']) * 0.03 +
359
- safe_zscore(self.features['gundlach_capital_reversal']) * 0.02 +
360
- self.features['stevenson_inequality_norm'] * 0.2
361
  )
362
- self.features['prob_stagflation'] = np.clip(self.features['prob_stagflation'], 0, 1)
363
-
364
- # Scenario 3: Tech Monopoly Boom
365
- self.features['prob_tech_boom'] = (
366
- self.features['thiel_monopoly_norm'] * 0.4 +
367
- safe_zscore(self.features['dalio_tech_force'] - self.features['dalio_debt_cycle']) * 0.03 +
368
- safe_zscore(self.features['energy_transition']) * 0.02 +
369
- (self.df.get('China_Tech', pd.Series(0)).pct_change(63) <
370
- self.df.get('Technology', pd.Series(0)).pct_change(63)).astype(float) * 0.1
371
  )
372
- self.features['prob_tech_boom'] = np.clip(self.features['prob_tech_boom'], 0, 1)
373
-
374
- self.features['prob_controlled_reset'] = 0.05
375
- return self
376
-
377
- def regime_detection(self):
378
- """Classify current market regime"""
379
- print("Detecting market regimes...")
380
-
381
- def classify_regime(row):
382
- if (row['gundlach_reckoning_norm'] > 0.6 and row['prob_credit_collapse'] > 0.5):
383
- return 'CRISIS'
384
- elif row['thiel_monopoly_norm'] > 0.7:
385
- return 'TECH_MONOPOLY'
386
- elif (row['stevenson_inequality_norm'] > 0.6 and row['prob_stagflation'] > 0.4):
387
- return 'INEQUALITY_TRAP'
388
- elif row['geopolitical_risk_norm'] > 0.7:
389
- return 'GEOPOLITICAL_SHOCK'
390
- else:
391
- return 'TRANSITION'
392
-
393
- self.features['regime'] = self.features.apply(classify_regime, axis=1)
394
  return self
395
-
396
- def dimensionality_reduction(self):
397
- """Apply PCA to reduce feature space"""
398
- print("Applying dimensionality reduction...")
399
-
400
- debt_cols = [c for c in self.features.columns if 'dalio_debt' in c or 'gundlach' in c]
401
- inequality_cols = [c for c in self.features.columns if 'inequality' in c or 'stevenson' in c]
402
- geo_cols = [c for c in self.features.columns if 'risk' in c or 'middle_east' in c or 'europe' in c or 'asia' in c]
403
- tech_cols = [c for c in self.features.columns if 'monopoly' in c or 'thiel' in c or 'tech' in c]
404
-
405
- for name, cols in [('debt', debt_cols), ('inequality', inequality_cols),
406
- ('geo', geo_cols), ('tech', tech_cols)]:
407
- if len(cols) > 0:
408
- data = self.features[cols].dropna()
409
- if len(data) > 10:
410
- scaler = StandardScaler()
411
- data_scaled = scaler.fit_transform(data)
412
- pca = PCA(n_components=min(2, len(cols)))
413
- pcs = pca.fit_transform(data_scaled)
414
- for i in range(pcs.shape[1]):
415
- self.features.loc[data.index, f'{name}_PC{i+1}'] = pcs[:, i]
416
  return self
417
-
418
- def _calculate_dollar_anomaly(self):
419
- sp_correction = self.df.get('SP500', pd.Series(0)).pct_change(5) < -0.05
420
- dollar_weakness = self.df.get('DXY', pd.Series(0)).pct_change(5) < 0
421
- return (sp_correction & dollar_weakness).astype(float)
422
-
423
- def _calculate_asia_tension(self):
424
- taiwan = self.df.get('Taiwan', pd.Series(0))
425
- china = self.df.get('China', pd.Series(0))
426
- return (taiwan.pct_change(21) - china.pct_change(21)).fillna(0)
427
-
428
- def _normalize(self, series, window=252):
429
- rolling_mean = series.rolling(window, min_periods=20).mean()
430
- rolling_std = series.rolling(window, min_periods=20).std()
431
- return ((series - rolling_mean) / (rolling_std + 0.001)).clip(-3, 3) / 3
432
-
433
- def build_all_features(self):
434
- print("\n" + "="*80)
435
- print("INTEGRATED THEORY FEATURE ENGINEERING")
436
- print("="*80 + "\n")
437
-
438
- self.calculate_returns_volatility()
439
- self.dalio_forces()
440
- self.stevenson_inequality()
441
- self.thiel_monopoly()
442
- self.gundlach_reckoning()
443
- self.geopolitical_indicators()
444
- self.cross_asset_features()
445
- self.scenario_probabilities()
446
- self.regime_detection()
447
- self.dimensionality_reduction()
448
-
449
- print("\n" + "="*80)
450
- print("FEATURE ENGINEERING COMPLETE")
451
- print("="*80)
452
- print(f"Total features created: {len(self.features.columns)}")
453
- print(f"Regimes detected: {self.features['regime'].value_counts().to_dict()}")
454
- print(f"\nCurrent state (latest):")
455
- print(f" - Dalio Composite: {self.features['dalio_composite_norm'].iloc[-1]:.3f}")
456
- print(f" - Stevenson Inequality: {self.features['stevenson_inequality_norm'].iloc[-1]:.3f}")
457
- print(f" - Thiel Monopoly: {self.features['thiel_monopoly_norm'].iloc[-1]:.3f}")
458
- print(f" - Gundlach Reckoning: {self.features['gundlach_reckoning_norm'].iloc[-1]:.3f}")
459
- print(f" - Regime: {self.features['regime'].iloc[-1]}")
460
- print(f"\nScenario Probabilities:")
461
- print(f" - Credit Collapse: {self.features['prob_credit_collapse'].iloc[-1]:.1%}")
462
- print(f" - Stagflation: {self.features['prob_stagflation'].iloc[-1]:.1%}")
463
- print(f" - Tech Boom: {self.features['prob_tech_boom'].iloc[-1]:.1%}")
464
-
465
  return self.features
466
 
467
 
468
  def main():
469
  import argparse
470
- parser = argparse.ArgumentParser(description='Integrated Market Theory Feature Engineering')
471
- parser.add_argument('--input', default='unified_market_data.csv',
472
- help='Input CSV file from geo_macro.py')
473
- parser.add_argument('--output', default='enhanced_market_features.csv',
474
- help='Output CSV file with engineered features')
475
  args = parser.parse_args()
476
-
477
- print(f"Loading data from {args.input}...")
478
  df = pd.read_csv(args.input, index_col=0, parse_dates=True)
479
- print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
480
- print(f"Date range: {df.index.min()} to {df.index.max()}")
481
-
482
  engine = IntegratedTheoryFeatures(df)
483
- features = engine.build_all_features()
484
-
485
- features.to_csv(args.output) # ✅ FIXED: added missing parenthesis
486
 
487
 
488
  if __name__ == "__main__":
 
1
  """
2
  Integrated Market Theory - Feature Engineering Pipeline
3
+ Generates transparent, theory-driven features for regime detection and strategic allocation.
4
 
5
  Usage:
6
  python feature_engineering.py --input unified_market_data.csv --output enhanced_features.csv
 
10
  import numpy as np
11
  from sklearn.decomposition import PCA
12
  from sklearn.preprocessing import StandardScaler
 
 
 
13
 
14
  def safe_zscore(series, window=252, min_obs=30):
 
15
  mean = series.rolling(window, min_periods=min_obs).mean()
16
  std = series.rolling(window, min_periods=min_obs).std()
17
  z = (series - mean) / std
18
  return z.fillna(0).clip(-3, 3)
19
 
20
def normalize(series, window=252, min_obs=20, eps=0.001):
    """Return a rolling z-score of ``series`` rescaled into [-1, 1].

    The z-score is computed against the trailing ``window`` rows, clipped to
    [-3, 3] and divided by 3.

    Args:
        series: numeric pandas Series.
        window: number of rows in the rolling window.
        min_obs: minimum observations required before a value is produced;
            clamped to ``window`` so short windows do not raise.
        eps: constant added to the rolling standard deviation so that a flat
            window does not divide by zero.

    Returns:
        A Series on the same index; rows with fewer than ``min_obs``
        observations are NaN.
    """
    roll = series.rolling(window, min_periods=min(min_obs, window))
    centered = series - roll.mean()
    scale = roll.std() + eps
    return (centered / scale).clip(-3, 3) / 3
24
 
25
  class IntegratedTheoryFeatures:
 
 
 
 
 
 
 
 
26
  def __init__(self, df):
 
27
  required = {'SP500', 'DGS10', 'Gold', 'VIX', 'UNRATE', 'CPIAUCSL'}
28
  missing = required - set(df.columns)
29
  if missing:
30
  raise ValueError(f"Critical data missing: {missing}")
 
31
  self.df = df.copy()
32
  self.features = pd.DataFrame(index=df.index)
33
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def dalio_forces(self):
35
+ # Debt Cycle
 
 
 
36
  yield_curve = self.df.get('DGS10', 0) - self.df.get('DGS2', 0)
37
  inflation_mom = self.df.get('CPIAUCSL', pd.Series(0)).pct_change(12) * 100
38
  hy_spread = self.df.get('BAMLH0A0HYM2', pd.Series(0)) / 100
39
+ self.features['dalio_debt_cycle'] = yield_curve * 0.3 + inflation_mom * 0.4 + hy_spread * 0.3
40
+
41
+ # Internal Conflict
42
+ consumer_weakness = (self.df.get('Consumer_Discretionary', 0) / self.df.get('Consumer_Staples', 1)).pct_change(63) * -1
 
 
 
 
 
 
43
  unemployment_stress = self.df.get('UNRATE', pd.Series(0)).diff() * 2
44
+ small_large_gap = (self.df.get('Small_Cap_Value', 0) / self.df.get('SP500', 1)).pct_change(63) * -1
45
+ self.features['dalio_internal_conflict'] = consumer_weakness * 0.4 + unemployment_stress * 0.3 + small_large_gap * 0.3
46
+
47
+ # External Conflict
 
 
 
 
 
 
48
  defense_momentum = self.df.get('Defense_Stocks', pd.Series(0)).pct_change(21)
49
+ sp_corr = self.df.get('SP500', pd.Series(0)).pct_change(5) < -0.05
50
+ dollar_weak = self.df.get('DXY', pd.Series(0)).pct_change(5) < 0
51
+ dollar_anomaly = (sp_corr & dollar_weak).astype(float)
52
+ taiwan = self.df.get('Taiwan', pd.Series(0))
53
+ china = self.df.get('China', pd.Series(0))
54
+ china_taiwan_tension = (taiwan.pct_change(21) - china.pct_change(21)).fillna(0)
55
+ self.features['dalio_external_conflict'] = defense_momentum * 0.4 + dollar_anomaly * 0.3 + china_taiwan_tension * 0.3
56
+
57
+ # Nature
 
58
  water_stress = self.df.get('Water', pd.Series(0)).pct_change(63)
59
+ ag_vol = self.df.get('Agricultural', pd.Series(0)).pct_change().rolling(63).std() * 100
60
+ self.features['dalio_nature_force'] = water_stress * 0.6 + ag_vol * 0.4
61
+
62
+ # Tech Force
63
+ tech_outperform = (self.df.get('Technology', 0) / self.df.get('SP500', 1)).pct_change(21)
64
+ cloud_mom = self.df.get('Cloud_Computing', pd.Series(0)).pct_change(63)
65
+ ai_mom = self.df.get('Robotics_AI', pd.Series(0)).pct_change(63)
66
+ self.features['dalio_tech_force'] = tech_outperform * 0.4 + cloud_mom * 0.3 + ai_mom * 0.3
67
+
68
+ # Composite
69
+ comp = (
70
+ self.features['dalio_debt_cycle'] * 0.35 +
71
+ self.features['dalio_internal_conflict'] * 0.25 +
72
+ self.features['dalio_external_conflict'] * 0.20 +
73
+ self.features['dalio_tech_force'] * 0.15 +
 
 
 
 
 
 
 
 
 
 
74
  self.features['dalio_nature_force'] * 0.05
75
+ )
76
+ self.features['dalio_composite_norm'] = normalize(comp)
 
 
77
  return self
78
+
79
  def stevenson_inequality(self):
80
+ asset_rich = (self.df.get('Gold', 0) + self.df.get('Real_Estate', 0) + self.df.get('Growth_Stocks', 0)) / 3
81
+ middle_class = (self.df.get('Consumer_Staples', 0) + self.df.get('Regional_Banks', 0) + self.df.get('Small_Cap_Value', 0)) / 3
82
+ wealth_flow = asset_rich.pct_change(63) - middle_class.pct_change(63)
83
+
 
 
 
 
 
 
 
 
 
 
84
  luxury = self.df.get('Retail_Luxury', pd.Series(0)).pct_change(21)
85
+ mass = ((self.df.get('Restaurants', 0) + self.df.get('Retail', 0)) / 2).pct_change(21)
86
+ cons_gap = luxury - mass
87
+
88
+ quality = (self.df.get('Investment_Grade_Spread', 0) + self.df.get('Preferred_Stock', 0)) / 2
89
+ junk = (self.df.get('HYG', 0) + self.df.get('JNK', 0) + self.df.get('Emerging_Market_Debt', 0)) / 3
90
+ credit_gap = quality.pct_change(63) - junk.pct_change(63)
91
+
92
+ self.features['stevenson_inequality_norm'] = normalize(
93
+ wealth_flow * 0.4 + cons_gap * 0.3 + credit_gap * 0.3
 
 
 
 
 
 
 
 
94
  )
 
 
 
 
 
 
95
  return self
96
+
97
  def thiel_monopoly(self):
98
+ tech = self.df.get('Technology', 0)
99
+ finance = self.df.get('Financials', 1)
100
+ cash_moat = tech.pct_change(63) - finance.pct_change(63)
101
+
102
+ network = (
103
+ self.df.get('Cloud_Computing', 0) * 0.4 +
104
+ self.df.get('Communication_Services', 0) * 0.3 +
105
+ self.df.get('Fintech', 0) * 0.3
106
+ ).pct_change(63)
107
+
108
+ tech_vol = self.df.get('Technology', pd.Series(1)).pct_change().rolling(63).std()
109
+ chip = self.df.get('Semiconductors', pd.Series(0)).pct_change(63)
110
+ defensibility = (1 / (tech_vol + 0.001)) * 0.01 + chip * 0.5
111
+
112
+ self.features['thiel_monopoly_norm'] = normalize(
113
+ cash_moat * 0.35 + network * 0.35 + defensibility * 0.30
 
 
 
 
 
 
 
 
 
114
  )
 
 
 
 
 
 
 
 
 
 
 
 
115
  return self
116
+
117
  def gundlach_reckoning(self):
118
+ fed = self.df.get('DGS3MO', pd.Series(0))
119
+ teny = self.df.get('DGS10', pd.Series(0))
120
+ yield_anomaly = ((fed.diff() < -0.05) & (teny.diff() > 0)).astype(float) + (teny - fed)
121
+
122
+ gold_ret = self.df.get('Gold', pd.Series(0)).pct_change(21)
123
+ tlt_ret = self.df.get('US_Treasuries_Long', pd.Series(1)).pct_change(21)
124
+ flight_shift = gold_ret / (tlt_ret + 0.001)
125
+
126
+ dxy_weak = self.df.get('DXY', pd.Series(0)).pct_change(21) * -1
127
+ em = (self.df.get('Emerging_Markets', 0) + self.df.get('Europe', 0)) / 2
128
+ em_out = em.pct_change(21)
129
+ sp_ret = self.df.get('SP500', pd.Series(0)).pct_change(21)
130
+ capital_reversal = dxy_weak * 0.5 + (em_out - sp_ret) * 0.5
131
+
132
+ reg_banks = (self.df.get('Regional_Banks', 0) / self.df.get('Financials', 1)).pct_change(21)
133
+ mortgage_reit = self.df.get('Mortgage_REITs', pd.Series(0)).pct_change(21)
 
 
 
 
 
 
 
 
 
 
 
 
134
  real_estate_vol = self.df.get('Real_Estate', pd.Series(1)).pct_change().rolling(21).std() * 100
135
+ private_credit_risk = reg_banks * -0.4 + mortgage_reit * -0.3 + real_estate_vol * 0.3
136
+
137
+ reckoning = (
138
+ yield_anomaly * 0.30 +
139
+ flight_shift * 0.25 +
140
+ capital_reversal * 0.25 +
141
+ private_credit_risk * 0.20
142
+ )
143
+ self.features['gundlach_reckoning_norm'] = normalize(reckoning)
144
+ self.features['gundlach_private_credit_risk'] = private_credit_risk
 
 
 
145
  return self
146
+
147
  def geopolitical_indicators(self):
148
+ oil_vol = self.df.get('Oil', pd.Series(1)).pct_change().rolling(3).std() * 100
149
+ def_spike = self.df.get('Defense_Stocks', pd.Series(0)).pct_change(5)
 
 
 
150
  gold_haven = self.df.get('Gold_Safe_Haven', pd.Series(0)).pct_change(5)
151
+ me_risk = oil_vol * 0.4 + def_spike * 0.3 + gold_haven * 0.3
152
+
153
+ gas_vol = self.df.get('NaturalGas', pd.Series(1)).pct_change().rolling(5).std() * 100
154
+ eu_decline = self.df.get('Europe', pd.Series(0)).pct_change(21) * -1
155
+ chf_str = self.df.get('Swiss_Franc', pd.Series(0)).pct_change(21) * -1
156
+ eu_risk = gas_vol * 0.5 + eu_decline * 0.3 + chf_str * 0.2
157
+
 
 
 
 
 
 
 
 
158
  chip_stress = self.df.get('Semiconductors', pd.Series(1)).pct_change().rolling(21).std() * 100
159
+ tw_kr = (self.df.get('Taiwan', 0) + self.df.get('South_Korea', 0)) / 2
160
+ china_div = tw_kr.pct_change(21) - self.df.get('China', pd.Series(0)).pct_change(21)
161
  rare_earth = self.df.get('Rare_Earth', pd.Series(0)).pct_change(21)
162
+ asia_risk = chip_stress * 0.4 + china_div * 0.3 + rare_earth * 0.3
163
+
164
+ self.features['geopolitical_risk_norm'] = normalize(
165
+ me_risk * 0.4 + eu_risk * 0.3 + asia_risk * 0.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  )
167
  return self
168
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  def scenario_probabilities(self):
170
+ f = self.features
171
+ df = self.df
172
+
173
+ # Credit Collapse
174
+ f['prob_credit_collapse'] = np.clip(
175
+ f['gundlach_reckoning_norm'] * 0.4 +
176
+ safe_zscore(f['gundlach_private_credit_risk']) * 0.03 +
177
+ safe_zscore(f['dalio_debt_cycle']) * 0.03,
178
+ 0, 1
179
  )
180
+
181
+ # Stagflation
182
+ inflation_high = (df['CPIAUCSL'].pct_change(12) * 100 > 2.5).astype(float)
183
+ unemp_rising = (df['UNRATE'].diff() > 0).astype(float)
184
+ f['prob_stagflation'] = np.clip(
185
+ (inflation_high & unemp_rising) * 0.3 +
186
+ safe_zscore(f['dalio_external_conflict']) * 0.03 +
187
+ safe_zscore(f['gundlach_capital_reversal']) * 0.02 +
188
+ f['stevenson_inequality_norm'] * 0.2,
189
+ 0, 1
190
  )
191
+
192
+ # Tech Boom
193
+ china_tech_lag = (df.get('China_Tech', pd.Series(0)).pct_change(63) < df.get('Technology', pd.Series(0)).pct_change(63)).astype(float)
194
+ f['prob_tech_boom'] = np.clip(
195
+ f['thiel_monopoly_norm'] * 0.4 +
196
+ safe_zscore(f['dalio_tech_force'] - f['dalio_debt_cycle']) * 0.03 +
197
+ safe_zscore(f.get('energy_transition', pd.Series(0))) * 0.02 +
198
+ china_tech_lag * 0.1,
199
+ 0, 1
200
  )
201
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  return self
203
+
204
+ def regime_flags(self):
205
+ f = self.features
206
+ # Binary regime flags
207
+ f['debt_unsustainable'] = ((f['gundlach_reckoning_norm'] > 0.5) & (f['prob_credit_collapse'] > 0.3)).astype(int)
208
+ f['inequality_trap'] = ((f['stevenson_inequality_norm'] > 0.6) & (f['prob_stagflation'] > 0.4)).astype(int)
209
+ f['tech_monopoly'] = (f['thiel_monopoly_norm'] > 0.6).astype(int)
210
+ f['geopolitical_shock'] = (f['geopolitical_risk_norm'] > 0.7).astype(int)
211
+
212
+ # Regime label
213
+ conditions = [
214
+ f['debt_unsustainable'],
215
+ f['tech_monopoly'],
216
+ f['inequality_trap'],
217
+ f['geopolitical_shock']
218
+ ]
219
+ choices = ['CRISIS', 'TECH_MONOPOLY', 'INEQUALITY_TRAP', 'GEOPOLITICAL_SHOCK']
220
+ f['regime'] = np.select(conditions, choices, default='TRANSITION')
 
 
 
221
  return self
222
+
223
+ def build_features(self):
224
+ (self.dalio_forces()
225
+ .stevenson_inequality()
226
+ .thiel_monopoly()
227
+ .gundlach_reckoning()
228
+ .geopolitical_indicators()
229
+ .scenario_probabilities()
230
+ .regime_flags())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  return self.features
232
 
233
 
234
  def main():
235
  import argparse
236
+ parser = argparse.ArgumentParser()
237
+ parser.add_argument('--input', default='unified_market_data.csv')
238
+ parser.add_argument('--output', default='enhanced_features.csv')
 
 
239
  args = parser.parse_args()
240
+
 
241
  df = pd.read_csv(args.input, index_col=0, parse_dates=True)
 
 
 
242
  engine = IntegratedTheoryFeatures(df)
243
+ features = engine.build_features()
244
+ features.to_csv(args.output)
 
245
 
246
 
247
  if __name__ == "__main__":