JayLacoma committed on
Commit
b5dd131
·
verified ·
1 Parent(s): f7a7cae

Create feature_engineering.py

Browse files
Files changed (1) hide show
  1. feature_engineering.py +559 -0
feature_engineering.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Integrated Market Theory - Feature Engineering Pipeline
3
+ Combines all tickers from geo_macro.py into unified theory indicators
4
+
5
+ Usage:
6
+ python feature_engineering.py --input unified_market_data.csv --output enhanced_features.csv
7
+ """
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ from sklearn.decomposition import PCA
12
+ from sklearn.preprocessing import StandardScaler
13
+ import warnings
14
+ warnings.filterwarnings('ignore')
15
+
16
+
17
class IntegratedTheoryFeatures:
    """
    Transforms raw market data into theory-driven features combining:
    - Dalio's 5 Forces
    - Stevenson's Inequality Metrics
    - Thiel's Monopoly Indicators
    - Gundlach's Reckoning Signals

    The instance keeps two frames:

    * ``self.df``       -- a private copy of the input, later widened with
                           per-column return/volatility/momentum columns.
    * ``self.features`` -- the engineered features, sharing the input's index.

    Every builder method returns ``self`` so calls can be chained.  Input
    columns are looked up by name; a column that is absent is replaced by a
    constant series aligned to the input index (see ``_col``), so a missing
    ticker never raises and never misaligns the result.

    NOTE(review): window lengths such as 21/63/252 look like trading-day
    counts, which presumes one row per trading day -- confirm against the
    producer of the input CSV.
    """

    # Small constant added to denominators to avoid division by zero.
    _EPS = 0.001

    def __init__(self, df):
        """Store a copy of ``df`` and create an empty feature frame on its index."""
        self.df = df.copy()
        self.features = pd.DataFrame(index=df.index)

    def _col(self, name, default=0.0):
        """Return column ``name`` of ``self.df``, or an aligned constant series.

        Args:
            name: Column label to look up.
            default: Value used for every row when the column is absent.

        Returns:
            The existing column, or a float Series filled with ``default``
            that carries ``self.df.index``.  Sharing the index is the point:
            a bare ``pd.Series(0)`` is indexed ``[0]`` and turns every sum it
            takes part in into NaN, and a bare scalar has no ``pct_change``.
        """
        if name in self.df.columns:
            return self.df[name]
        return pd.Series(float(default), index=self.df.index, name=name)

    def _unit_scale(self, series):
        """Divide ``series`` by its own standard deviation (plus ``_EPS``)."""
        return series / (series.std() + self._EPS)

    def calculate_returns_volatility(self, windows=(21, 63, 252)):
        """Calculate multi-timeframe returns and volatility for all tickers.

        For every column ``c`` currently in ``self.df`` and every window ``w``
        three columns are appended to ``self.df``:

        * ``{c}_ret{w}`` -- ``pct_change(w)``
        * ``{c}_vol{w}`` -- rolling ``w``-row std of the 1-row ``pct_change``
        * ``{c}_mom{w}`` -- ``{c}_ret{w}`` minus itself shifted by ``w`` rows

        Args:
            windows: Iterable of window lengths in rows.

        Returns:
            ``self``.
        """
        print("Calculating returns and volatility...")

        derived = {}
        # Snapshot the labels: only the columns present on entry are processed.
        for col in list(self.df.columns):
            series = self.df[col]
            one_step = series.pct_change()
            for window in windows:
                # Returns
                window_return = series.pct_change(window)
                derived[f'{col}_ret{window}'] = window_return
                # Volatility
                derived[f'{col}_vol{window}'] = one_step.rolling(window).std()
                # Momentum (rate of change acceleration)
                derived[f'{col}_mom{window}'] = window_return - window_return.shift(window)

        if derived:
            # One concat instead of many single-column inserts keeps the frame
            # from fragmenting.
            new_block = pd.concat(derived, axis=1)
            clashes = self.df.columns.intersection(new_block.columns)
            base = self.df.drop(columns=clashes) if len(clashes) else self.df
            self.df = pd.concat([base, new_block], axis=1)

        return self

    def dalio_forces(self):
        """Ray Dalio's 5 Forces Composite Indicators.

        Writes ``dalio_debt_cycle``, ``dalio_internal_conflict``,
        ``dalio_external_conflict``, ``dalio_nature_force``,
        ``dalio_tech_force``, ``dalio_composite`` and ``dalio_composite_norm``
        to ``self.features``.

        Returns:
            ``self``.
        """
        print("Building Dalio's 5 Forces...")

        # Force 1: Debt/Economic Cycle
        yield_curve = self._col('DGS10') - self._col('DGS2')
        # NOTE(review): pct_change(12) is 12 rows, not necessarily 12 months --
        # confirm the sampling frequency of CPIAUCSL in the input.
        inflation_mom = self._col('CPIAUCSL').pct_change(12) * 100
        hy_spread = self._col('BAMLH0A0HYM2') / 100

        self.features['dalio_debt_cycle'] = (
            yield_curve * 0.3 +
            inflation_mom * 0.4 +
            hy_spread * 0.3
        )

        # Force 2: Internal Conflict (inequality-driven)
        consumer_weakness = (self._col('Consumer_Discretionary') /
                             self._col('Consumer_Staples', 1)).pct_change(63) * -1
        unemployment_stress = self._col('UNRATE').diff() * 2
        small_large_gap = (self._col('Small_Cap_Value') /
                           self._col('SP500', 1)).pct_change(63) * -1

        self.features['dalio_internal_conflict'] = (
            consumer_weakness * 0.4 +
            unemployment_stress * 0.3 +
            small_large_gap * 0.3
        )

        # Force 3: External Conflict
        defense_momentum = self._col('Defense_Stocks').pct_change(21)
        dollar_anomaly = self._calculate_dollar_anomaly()
        china_taiwan_tension = self._calculate_asia_tension()

        self.features['dalio_external_conflict'] = (
            defense_momentum * 0.4 +
            dollar_anomaly * 0.3 +
            china_taiwan_tension * 0.3
        )

        # Force 4: Acts of Nature
        water_stress = self._col('Water').pct_change(63)
        ag_volatility = self._col('Agricultural').pct_change().rolling(63).std() * 100

        self.features['dalio_nature_force'] = (
            water_stress * 0.6 +
            ag_volatility * 0.4
        )

        # Force 5: Technology/Inventiveness
        tech_outperform = (self._col('Technology') /
                           self._col('SP500', 1)).pct_change(21)
        cloud_momentum = self._col('Cloud_Computing').pct_change(63)
        ai_momentum = self._col('Robotics_AI').pct_change(63)

        self.features['dalio_tech_force'] = (
            tech_outperform * 0.4 +
            cloud_momentum * 0.3 +
            ai_momentum * 0.3
        )

        # Master Composite (normalized)
        dalio_components = [
            self.features['dalio_debt_cycle'] * 0.35,
            self.features['dalio_internal_conflict'] * 0.25,
            self.features['dalio_external_conflict'] * 0.20,
            self.features['dalio_tech_force'] * 0.15,
            self.features['dalio_nature_force'] * 0.05
        ]

        # sum(axis=1) skips NaN, so the composite is defined as soon as any
        # single force is available.
        self.features['dalio_composite'] = pd.concat(dalio_components, axis=1).sum(axis=1)
        self.features['dalio_composite_norm'] = self._normalize(self.features['dalio_composite'])

        return self

    def stevenson_inequality(self):
        """Gary Stevenson's Inequality Amplification Metrics.

        Writes ``inequality_wealth_flow``, ``inequality_consumption_gap``,
        ``inequality_credit_access``, ``stevenson_inequality``,
        ``stevenson_inequality_norm`` and ``inequality_transmission``.

        Returns:
            ``self``.
        """
        print("Building Stevenson's inequality indicators...")

        # Wealth Flow (money flowing to asset owners vs middle class)
        asset_rich = (self._col('Gold') +
                      self._col('Real_Estate') +
                      self._col('Growth_Stocks')) / 3

        middle_class = (self._col('Consumer_Staples') +
                        self._col('Regional_Banks') +
                        self._col('Small_Cap_Value')) / 3

        self.features['inequality_wealth_flow'] = (
            asset_rich.pct_change(63) - middle_class.pct_change(63)
        )

        # Consumption Gap (luxury vs mass market)
        luxury = self._col('Retail_Luxury').pct_change(21)
        mass = ((self._col('Restaurants') + self._col('Retail')) / 2).pct_change(21)

        self.features['inequality_consumption_gap'] = luxury - mass

        # Credit Access Gap
        quality_credit = (self._col('Investment_Grade_Spread') +
                          self._col('Preferred_Stock')) / 2
        junk_credit = (self._col('HYG') +
                       self._col('JNK') +
                       self._col('Emerging_Market_Debt')) / 3

        self.features['inequality_credit_access'] = (
            quality_credit.pct_change(63) - junk_credit.pct_change(63)
        )

        # Master Inequality Score
        self.features['stevenson_inequality'] = (
            self.features['inequality_wealth_flow'] * 0.4 +
            self.features['inequality_consumption_gap'] * 0.3 +
            self.features['inequality_credit_access'] * 0.3
        )
        self.features['stevenson_inequality_norm'] = self._normalize(self.features['stevenson_inequality'])

        # Inequality Transmission (how stimulus flows to rich)
        # High when asset prices rise faster than wages
        asset_inflation = (self._col('Gold') + self._col('Real_Estate')).pct_change(21)
        wage_proxy = self._col('Staffing').pct_change(21)  # Labor market proxy

        self.features['inequality_transmission'] = asset_inflation - wage_proxy

        return self

    def thiel_monopoly(self):
        """Peter Thiel's Monopoly vs Competition Indicators.

        Writes ``monopoly_cash_moat``, ``monopoly_network_effects``,
        ``monopoly_defensibility``, ``thiel_monopoly``,
        ``thiel_monopoly_norm``, ``monopoly_immunity`` and
        ``tech_concentration``.

        Returns:
            ``self``.
        """
        print("Building Thiel's monopoly indicators...")

        # Cash Moat (tech vs credit-dependent sectors)
        tech_strength = self._col('Technology')
        finance_strength = self._col('Financials', 1)

        self.features['monopoly_cash_moat'] = (
            tech_strength.pct_change(63) - finance_strength.pct_change(63)
        )

        # Network Effects (winner-take-all platforms)
        network_sectors = (self._col('Cloud_Computing') * 0.4 +
                           self._col('Communication_Services') * 0.3 +
                           self._col('Fintech') * 0.3)

        self.features['monopoly_network_effects'] = network_sectors.pct_change(63)

        # Defensibility (stability = moat strength)
        tech_volatility = self._col('Technology', 1).pct_change().rolling(63).std()
        chip_strength = self._col('Semiconductors').pct_change(63)

        # Inverse volatility (lower vol = stronger moat)
        self.features['monopoly_defensibility'] = (
            (1 / (tech_volatility + self._EPS)) * 0.01 +  # Normalize
            chip_strength * 0.5
        )

        # Master Monopoly Score
        self.features['thiel_monopoly'] = (
            self.features['monopoly_cash_moat'] * 0.35 +
            self.features['monopoly_network_effects'] * 0.35 +
            self.features['monopoly_defensibility'] * 0.30
        )
        self.features['thiel_monopoly_norm'] = self._normalize(self.features['thiel_monopoly'])

        # Monopoly Immunity Test (tech ignoring rate moves)
        tech_return = self._col('Technology').pct_change(21)
        rate_change = self._col('DGS10').diff() * -1  # Inverse (cuts = positive)

        self.features['monopoly_immunity'] = tech_return / (rate_change.abs() + self._EPS)

        # Tech Concentration (narrow leadership = bubble risk)
        specialized = (self._col('Semiconductors') +
                       self._col('Cloud_Computing') +
                       self._col('Robotics_AI')) / 3
        broad_tech = self._col('Technology', 1)

        self.features['tech_concentration'] = specialized / broad_tech

        return self

    def gundlach_reckoning(self):
        """Jeffrey Gundlach's Debt Reckoning and Paradigm Shift Signals.

        Writes ``gundlach_yield_anomaly``, ``gundlach_flight_shift``,
        ``gundlach_capital_reversal``, ``gundlach_private_credit_risk``,
        ``gundlach_reckoning`` and ``gundlach_reckoning_norm``.

        Returns:
            ``self``.
        """
        print("Building Gundlach's reckoning indicators...")

        # Yield Anomaly (yields rising post-cuts = fiscal dominance)
        fed_proxy = self._col('DGS3MO')
        long_yield = self._col('DGS10')

        # Detect cuts (3mo falling) and measure 10Y response
        fed_cutting = fed_proxy.diff() < -0.05
        yield_rising = long_yield.diff() > 0

        self.features['gundlach_yield_anomaly'] = (
            (fed_cutting & yield_rising).astype(float) +
            (long_yield - fed_proxy)  # Curve steepening
        )

        # Flight-to-Quality Shift (gold vs Treasuries)
        gold_return = self._col('Gold').pct_change(21)
        treasury_return = self._col('US_Treasuries_Long', 1).pct_change(21)

        # NOTE(review): the offset only guards a denominator of exactly zero;
        # a treasury return near -0.001 still produces a very large ratio.
        self.features['gundlach_flight_shift'] = gold_return / (treasury_return + self._EPS)

        # Capital Reversal (dollar weakness + EM outperformance)
        dollar_weak = self._col('DXY').pct_change(21) * -1
        em_outperform = ((self._col('Emerging_Markets') + self._col('Europe')) / 2).pct_change(21)
        sp_return = self._col('SP500').pct_change(21)

        self.features['gundlach_capital_reversal'] = (
            dollar_weak * 0.5 +
            (em_outperform - sp_return) * 0.5
        )

        # Private Credit Risk (2007 CDO echo)
        regional_stress = (self._col('Regional_Banks') /
                           self._col('Financials', 1)).pct_change(21)
        mortgage_reit_stress = self._col('Mortgage_REITs').pct_change(21)
        real_estate_vol = self._col('Real_Estate', 1).pct_change().rolling(21).std() * 100

        self.features['gundlach_private_credit_risk'] = (
            regional_stress * -0.4 +  # Decline = stress
            mortgage_reit_stress * -0.3 +
            real_estate_vol * 0.3
        )

        # Master Reckoning Score
        self.features['gundlach_reckoning'] = (
            self.features['gundlach_yield_anomaly'] * 0.30 +
            self.features['gundlach_flight_shift'] * 0.25 +
            self.features['gundlach_capital_reversal'] * 0.25 +
            self.features['gundlach_private_credit_risk'] * 0.20
        )
        self.features['gundlach_reckoning_norm'] = self._normalize(self.features['gundlach_reckoning'])

        return self

    def geopolitical_indicators(self):
        """Regional conflict and energy transition signals.

        Writes ``middle_east_risk``, ``europe_risk``, ``asia_risk``,
        ``geopolitical_risk``, ``geopolitical_risk_norm`` and
        ``energy_transition``.

        Returns:
            ``self``.
        """
        print("Building geopolitical indicators...")

        # Middle East Risk
        oil_volatility = self._col('Oil', 1).pct_change().rolling(3).std() * 100
        defense_spike = self._col('Defense_Stocks').pct_change(5)
        gold_haven = self._col('Gold_Safe_Haven').pct_change(5)

        self.features['middle_east_risk'] = (
            oil_volatility * 0.4 +
            defense_spike * 0.3 +
            gold_haven * 0.3
        )

        # Europe Risk
        gas_volatility = self._col('NaturalGas', 1).pct_change().rolling(5).std() * 100
        europe_decline = self._col('Europe').pct_change(21) * -1
        swiss_franc_strength = self._col('Swiss_Franc').pct_change(21) * -1  # Inverse quote

        self.features['europe_risk'] = (
            gas_volatility * 0.5 +
            europe_decline * 0.3 +
            swiss_franc_strength * 0.2
        )

        # Asia Risk
        chip_stress = self._col('Semiconductors', 1).pct_change().rolling(21).std() * 100
        taiwan_korea = (self._col('Taiwan') + self._col('South_Korea')) / 2
        china_diverge = taiwan_korea.pct_change(21) - self._col('China').pct_change(21)
        rare_earth = self._col('Rare_Earth').pct_change(21)

        self.features['asia_risk'] = (
            chip_stress * 0.4 +
            china_diverge * 0.3 +
            rare_earth * 0.3
        )

        # Overall Geopolitical Risk
        self.features['geopolitical_risk'] = (
            self.features['middle_east_risk'] * 0.4 +
            self.features['europe_risk'] * 0.3 +
            self.features['asia_risk'] * 0.3
        )
        self.features['geopolitical_risk_norm'] = self._normalize(self.features['geopolitical_risk'])

        # Energy Transition Indicators
        uranium_momentum = self._col('Uranium').pct_change(63)
        clean_momentum = self._col('Clean_Energy').pct_change(63)
        oil_decline = self._col('Oil').pct_change(252) * -1

        self.features['energy_transition'] = (
            uranium_momentum * 0.5 +
            clean_momentum * 0.3 +
            oil_decline * 0.2
        )

        return self

    def cross_asset_features(self):
        """Advanced cross-asset relationships.

        Writes ``flight_ratio``, ``credit_contagion`` and
        ``geo_amplification``.  Reads ``middle_east_risk``, ``europe_risk``
        and ``asia_risk`` from ``self.features``, so
        ``geopolitical_indicators`` must have run first (``KeyError``
        otherwise).

        Returns:
            ``self``.
        """
        print("Building cross-asset features...")

        # Flight-to-Quality Ratio
        defensive = (self._col('Gold') +
                     self._col('Utilities') +
                     self._col('Healthcare')) / 3
        risk_on = (self._col('Technology') +
                   self._col('Consumer_Discretionary') +
                   self._col('Real_Estate')) / 3

        self.features['flight_ratio'] = defensive / (risk_on + self._EPS)

        # Credit Contagion Spread
        regional_vs_broad = self._col('Regional_Banks') - self._col('Financials')
        mortgage_vs_reit = self._col('Mortgage_REITs') - self._col('REITs')
        em_vs_ig = self._col('Emerging_Market_Debt') - self._col('Investment_Grade_Spread')

        self.features['credit_contagion'] = (
            regional_vs_broad.pct_change(21) +
            mortgage_vs_reit.pct_change(21) +
            em_vs_ig.pct_change(21)
        ) / 3

        # VIX Amplification
        vix = self._col('VIX', 20)
        vix_historical_avg = vix.rolling(252).mean()
        geo_max = self.features[['middle_east_risk', 'europe_risk', 'asia_risk']].max(axis=1)

        self.features['geo_amplification'] = geo_max * (vix / vix_historical_avg)

        return self

    def scenario_probabilities(self):
        """Dynamic probability weights for future scenarios.

        Writes ``prob_credit_collapse``, ``prob_stagflation``,
        ``prob_tech_boom`` (each clipped to [0, 1]) and the constant
        ``prob_controlled_reset``.  Requires the outputs of ``dalio_forces``,
        ``stevenson_inequality``, ``thiel_monopoly``, ``gundlach_reckoning``
        and ``geopolitical_indicators``.

        Returns:
            ``self``.
        """
        print("Calculating scenario probabilities...")
        feats = self.features

        # Scenario 1: Credit Collapse
        credit_collapse = (
            feats['gundlach_reckoning_norm'] * 0.4 +
            self._unit_scale(feats['gundlach_private_credit_risk']) * 0.1 * 0.3 +
            self._unit_scale(feats['dalio_debt_cycle']) * 0.1 * 0.3
        )
        feats['prob_credit_collapse'] = np.clip(credit_collapse, 0, 1)

        # Scenario 2: Stagflation
        inflation_high = (self._col('CPIAUCSL').pct_change(12) * 100 > 2.5).astype(float)
        unemployment_rising = (self._col('UNRATE').diff() > 0).astype(float)

        stagflation = (
            (inflation_high * unemployment_rising) * 0.3 +
            self._unit_scale(feats['dalio_external_conflict']) * 0.1 * 0.3 +
            self._unit_scale(feats['gundlach_capital_reversal']) * 0.1 * 0.2 +
            feats['stevenson_inequality_norm'] * 0.2
        )
        feats['prob_stagflation'] = np.clip(stagflation, 0, 1)

        # Scenario 3: Tech Monopoly Boom
        china_lagging = (
            self._col('China_Tech').pct_change(63) <
            self._col('Technology').pct_change(63)
        ).astype(float)
        tech_boom = (
            feats['thiel_monopoly_norm'] * 0.4 +
            (feats['dalio_tech_force'] - feats['dalio_debt_cycle']) /
            (feats['dalio_tech_force'].std() + self._EPS) * 0.1 * 0.3 +
            self._unit_scale(feats['energy_transition']) * 0.1 * 0.2 +
            china_lagging * 0.1
        )
        feats['prob_tech_boom'] = np.clip(tech_boom, 0, 1)

        # Scenario 4: Controlled Reset (low probability without policy action)
        feats['prob_controlled_reset'] = 0.05  # Baseline, would need policy signals

        return self

    def regime_detection(self):
        """Classify current market regime.

        Writes the string column ``regime``.  Rules are evaluated in priority
        order and the first match wins; a NaN input never satisfies a rule,
        so rows without enough history fall through to ``'TRANSITION'``.

        Returns:
            ``self``.
        """
        print("Detecting market regimes...")
        feats = self.features

        # Listed in priority order: np.select takes the first true condition.
        rules = [
            # Crisis conditions
            ('CRISIS',
             (feats['gundlach_reckoning_norm'] > 0.6) &
             (feats['prob_credit_collapse'] > 0.5)),
            # Tech Monopoly Dominance
            ('TECH_MONOPOLY',
             feats['thiel_monopoly_norm'] > 0.7),
            # Inequality Trap (stagflation)
            ('INEQUALITY_TRAP',
             (feats['stevenson_inequality_norm'] > 0.6) &
             (feats['prob_stagflation'] > 0.4)),
            # Geopolitical Shock
            ('GEOPOLITICAL_SHOCK',
             feats['geopolitical_risk_norm'] > 0.7),
        ]
        labels = [label for label, _ in rules]
        masks = [mask.to_numpy(dtype=bool) for _, mask in rules]

        # Default: Transition phase
        feats['regime'] = np.select(masks, labels, default='TRANSITION')

        return self

    def dimensionality_reduction(self):
        """Apply PCA to reduce feature space.

        Feature columns are grouped by substring match on their names, each
        group is standardised and projected onto at most two principal
        components, and the scores are stored as ``{group}_PC1`` /
        ``{group}_PC2``.  Only rows where every column of the group is finite
        take part; other rows keep NaN in the PC columns.  Groups with ten or
        fewer usable rows are skipped.

        Because matching is by substring, groups overlap: for example
        ``gundlach_private_credit_risk`` matches both ``'gundlach'`` (debt)
        and ``'risk'`` (geo).

        Returns:
            ``self``.
        """
        print("Applying dimensionality reduction...")

        # Ignore principal-component columns from an earlier call so that a
        # re-run does not feed its own output back in.
        candidates = [c for c in self.features.columns if '_PC' not in c]

        # Define feature groups for PCA
        groups = [
            ('debt', [c for c in candidates if 'dalio_debt' in c or 'gundlach' in c]),
            ('inequality', [c for c in candidates if 'inequality' in c or 'stevenson' in c]),
            ('geo', [c for c in candidates
                     if 'risk' in c or 'middle_east' in c or 'europe' in c or 'asia' in c]),
            ('tech', [c for c in candidates if 'monopoly' in c or 'thiel' in c or 'tech' in c]),
        ]

        for name, cols in groups:
            if not cols:
                continue

            # Ratio features can hold +/-inf, which StandardScaler rejects;
            # treat them like missing values and drop those rows.
            data = self.features[cols].replace([np.inf, -np.inf], np.nan).dropna()
            if len(data) <= 10:  # Need sufficient data
                continue

            # Standardize
            data_scaled = StandardScaler().fit_transform(data)

            # PCA
            pcs = PCA(n_components=min(2, len(cols))).fit_transform(data_scaled)

            # Add back
            for i in range(pcs.shape[1]):
                self.features.loc[data.index, f'{name}_PC{i+1}'] = pcs[:, i]

        return self

    def _calculate_dollar_anomaly(self):
        """Detect dollar weakness during stock corrections (40-year anomaly).

        Returns:
            Float series that is 1.0 where the 5-row ``SP500`` change is below
            -5% while the 5-row ``DXY`` change is negative, else 0.0.
        """
        sp_correction = self._col('SP500').pct_change(5) < -0.05
        dollar_weakness = self._col('DXY').pct_change(5) < 0

        return (sp_correction & dollar_weakness).astype(float)

    def _calculate_asia_tension(self):
        """Taiwan-China divergence as tension proxy.

        Returns:
            21-row ``Taiwan`` change minus 21-row ``China`` change, with NaN
            replaced by 0.
        """
        taiwan = self._col('Taiwan')
        china = self._col('China')

        return (taiwan.pct_change(21) - china.pct_change(21)).fillna(0)

    def _normalize(self, series, window=252):
        """Rolling z-score normalization.

        Args:
            series: Values to normalise.
            window: Rolling window length in rows; at least 20 observations
                are required before a value is produced.

        Returns:
            The rolling z-score clipped to [-3, 3] and divided by 3, i.e. a
            series bounded by [-1, 1].
        """
        rolling_mean = series.rolling(window, min_periods=20).mean()
        rolling_std = series.rolling(window, min_periods=20).std()

        return ((series - rolling_mean) / (rolling_std + self._EPS)).clip(-3, 3) / 3  # Scale to -1, 1

    def build_all_features(self):
        """Run complete feature engineering pipeline.

        Calls every builder in dependency order, prints a summary of the
        latest row and returns the feature frame.

        Returns:
            ``self.features``.
        """
        print("\n" + "="*80)
        print("INTEGRATED THEORY FEATURE ENGINEERING")
        print("="*80 + "\n")

        self.calculate_returns_volatility()
        self.dalio_forces()
        self.stevenson_inequality()
        self.thiel_monopoly()
        self.gundlach_reckoning()
        self.geopolitical_indicators()
        self.cross_asset_features()
        self.scenario_probabilities()
        self.regime_detection()
        self.dimensionality_reduction()

        print("\n" + "="*80)
        print("FEATURE ENGINEERING COMPLETE")
        print("="*80)
        print(f"Total features created: {len(self.features.columns)}")
        print(f"Regimes detected: {self.features['regime'].value_counts().to_dict()}")
        print(f"\nCurrent state (latest):")
        print(f" - Dalio Composite: {self.features['dalio_composite_norm'].iloc[-1]:.3f}")
        print(f" - Stevenson Inequality: {self.features['stevenson_inequality_norm'].iloc[-1]:.3f}")
        print(f" - Thiel Monopoly: {self.features['thiel_monopoly_norm'].iloc[-1]:.3f}")
        print(f" - Gundlach Reckoning: {self.features['gundlach_reckoning_norm'].iloc[-1]:.3f}")
        print(f" - Regime: {self.features['regime'].iloc[-1]}")
        print(f"\nScenario Probabilities:")
        print(f" - Credit Collapse: {self.features['prob_credit_collapse'].iloc[-1]:.1%}")
        print(f" - Stagflation: {self.features['prob_stagflation'].iloc[-1]:.1%}")
        print(f" - Tech Boom: {self.features['prob_tech_boom'].iloc[-1]:.1%}")

        return self.features
534
+
535
+
536
+ def main():
537
+ """Main execution function"""
538
+ import argparse
539
+
540
+ parser = argparse.ArgumentParser(description='Integrated Market Theory Feature Engineering')
541
+ parser.add_argument('--input', default='unified_market_data.csv',
542
+ help='Input CSV file from geo_macro.py')
543
+ parser.add_argument('--output', default='enhanced_market_features.csv',
544
+ help='Output CSV file with engineered features')
545
+
546
+ args = parser.parse_args()
547
+
548
+ # Load data
549
+ print(f"Loading data from {args.input}...")
550
+ df = pd.read_csv(args.input, index_col=0, parse_dates=True)
551
+ print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
552
+ print(f"Date range: {df.index.min()} to {df.index.max()}")
553
+
554
+ # Build features
555
+ engine = IntegratedTheoryFeatures(df)
556
+ features = engine.build_all_features()
557
+
558
+ # Save
559
+ features.to_csv(args.output