avinashhm commited on
Commit
0e8833f
·
verified ·
1 Parent(s): fe15745

Add trading_intelligence/feature_engine.py

Browse files
trading_intelligence/feature_engine.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Engine Module
3
+ =====================
4
+ Computes OHLCV features, technical indicators, volatility metrics,
5
+ market regime detection, and sentiment features.
6
+
7
+ Inspired by:
8
+ - Kronos (2508.02739): OHLCVA K-line tokenization
9
+ - PatchTST (2211.14730): Patch-based time series representation
10
+ - FinMultiTime (2506.05019): Multi-modal financial features
11
+ """
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ from typing import Dict, List, Optional, Tuple
16
+ import ta
17
+
18
+
19
+ class FeatureEngine:
20
+ """Comprehensive feature engineering for financial time series."""
21
+
22
+ def __init__(self, lookback_window: int = 60, prediction_horizons: List[int] = [1, 5, 20]):
23
+ """
24
+ Args:
25
+ lookback_window: Number of periods for feature computation
26
+ prediction_horizons: Short (1), mid (5), long (20) term horizons
27
+ """
28
+ self.lookback_window = lookback_window
29
+ self.prediction_horizons = prediction_horizons
30
+ self.feature_names = []
31
+
32
+ def compute_all_features(self, df: pd.DataFrame) -> pd.DataFrame:
33
+ """
34
+ Compute all features from OHLCV data.
35
+
36
+ Args:
37
+ df: DataFrame with columns [open, high, low, close, volume]
38
+
39
+ Returns:
40
+ DataFrame with all computed features
41
+ """
42
+ features = df.copy()
43
+
44
+ # 1. Price-based features
45
+ features = self._compute_price_features(features)
46
+
47
+ # 2. Technical indicators (RSI, MACD, ATR, EMA, Bollinger)
48
+ features = self._compute_technical_indicators(features)
49
+
50
+ # 3. Volatility metrics
51
+ features = self._compute_volatility_features(features)
52
+
53
+ # 4. Volume features
54
+ features = self._compute_volume_features(features)
55
+
56
+ # 5. Market regime features
57
+ features = self._compute_regime_features(features)
58
+
59
+ # 6. Return targets for multi-horizon prediction
60
+ features = self._compute_targets(features)
61
+
62
+ # Drop NaN rows from indicator computation
63
+ features = features.dropna().reset_index(drop=True)
64
+
65
+ self.feature_names = [c for c in features.columns
66
+ if c not in ['open', 'high', 'low', 'close', 'volume', 'timestamp', 'date', 'symbol']
67
+ and 'target' not in c and 'direction' not in c]
68
+
69
+ return features
70
+
71
+ def _compute_price_features(self, df: pd.DataFrame) -> pd.DataFrame:
72
+ """Compute raw price-derived features."""
73
+ df = df.copy()
74
+
75
+ # Log returns
76
+ df['log_return'] = np.log(df['close'] / df['close'].shift(1))
77
+
78
+ # Price ratios
79
+ df['high_low_ratio'] = df['high'] / df['low']
80
+ df['close_open_ratio'] = df['close'] / df['open']
81
+
82
+ # Candlestick body and shadows (Kronos-inspired OHLCVA encoding)
83
+ df['body'] = df['close'] - df['open']
84
+ df['upper_shadow'] = df['high'] - df[['close', 'open']].max(axis=1)
85
+ df['lower_shadow'] = df[['close', 'open']].min(axis=1) - df['low']
86
+ df['body_ratio'] = df['body'] / (df['high'] - df['low'] + 1e-8)
87
+
88
+ # Price momentum
89
+ for period in [5, 10, 20]:
90
+ df[f'momentum_{period}'] = df['close'] / df['close'].shift(period) - 1
91
+ df[f'sma_{period}'] = df['close'].rolling(period).mean()
92
+ df[f'price_to_sma_{period}'] = df['close'] / df[f'sma_{period}']
93
+
94
+ # Gap analysis
95
+ df['gap'] = df['open'] / df['close'].shift(1) - 1
96
+
97
+ return df
98
+
99
+ def _compute_technical_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
100
+ """Compute standard technical analysis indicators using ta library."""
101
+ df = df.copy()
102
+
103
+ # RSI (multiple periods)
104
+ df['rsi_14'] = ta.momentum.RSIIndicator(close=df['close'], window=14).rsi()
105
+ df['rsi_7'] = ta.momentum.RSIIndicator(close=df['close'], window=7).rsi()
106
+
107
+ # MACD
108
+ macd = ta.trend.MACD(close=df['close'])
109
+ df['macd'] = macd.macd()
110
+ df['macd_signal'] = macd.macd_signal()
111
+ df['macd_histogram'] = macd.macd_diff()
112
+
113
+ # ATR (Average True Range)
114
+ df['atr_14'] = ta.volatility.AverageTrueRange(
115
+ high=df['high'], low=df['low'], close=df['close'], window=14
116
+ ).average_true_range()
117
+ df['atr_ratio'] = df['atr_14'] / df['close']
118
+
119
+ # EMAs
120
+ for period in [9, 21, 50]:
121
+ df[f'ema_{period}'] = ta.trend.EMAIndicator(close=df['close'], window=period).ema_indicator()
122
+ df[f'price_to_ema_{period}'] = df['close'] / df[f'ema_{period}']
123
+
124
+ # Bollinger Bands
125
+ bb = ta.volatility.BollingerBands(close=df['close'], window=20, window_dev=2)
126
+ df['bb_upper'] = bb.bollinger_hband()
127
+ df['bb_lower'] = bb.bollinger_lband()
128
+ df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / df['close']
129
+ df['bb_position'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'] + 1e-8)
130
+
131
+ # Stochastic Oscillator
132
+ stoch = ta.momentum.StochasticOscillator(
133
+ high=df['high'], low=df['low'], close=df['close']
134
+ )
135
+ df['stoch_k'] = stoch.stoch()
136
+ df['stoch_d'] = stoch.stoch_signal()
137
+
138
+ # ADX (Average Directional Index)
139
+ adx = ta.trend.ADXIndicator(high=df['high'], low=df['low'], close=df['close'])
140
+ df['adx'] = adx.adx()
141
+ df['di_plus'] = adx.adx_pos()
142
+ df['di_minus'] = adx.adx_neg()
143
+
144
+ # Williams %R
145
+ df['williams_r'] = ta.momentum.WilliamsRIndicator(
146
+ high=df['high'], low=df['low'], close=df['close']
147
+ ).williams_r()
148
+
149
+ # CCI (Commodity Channel Index)
150
+ df['cci'] = ta.trend.CCIIndicator(
151
+ high=df['high'], low=df['low'], close=df['close']
152
+ ).cci()
153
+
154
+ return df
155
+
156
+ def _compute_volatility_features(self, df: pd.DataFrame) -> pd.DataFrame:
157
+ """Compute volatility metrics for risk modeling."""
158
+ df = df.copy()
159
+
160
+ # Realized volatility (multiple windows)
161
+ for window in [5, 10, 20]:
162
+ df[f'realized_vol_{window}'] = df['log_return'].rolling(window).std() * np.sqrt(252)
163
+
164
+ # Garman-Klass volatility estimator
165
+ df['gk_vol'] = np.sqrt(
166
+ 0.5 * np.log(df['high'] / df['low'])**2
167
+ - (2 * np.log(2) - 1) * np.log(df['close'] / df['open'])**2
168
+ )
169
+ df['gk_vol_20'] = df['gk_vol'].rolling(20).mean()
170
+
171
+ # Parkinson volatility
172
+ df['parkinson_vol'] = np.sqrt(
173
+ 1 / (4 * np.log(2)) * np.log(df['high'] / df['low'])**2
174
+ )
175
+ df['parkinson_vol_20'] = df['parkinson_vol'].rolling(20).mean()
176
+
177
+ # Volatility ratio (short-term vs long-term)
178
+ df['vol_ratio'] = df['realized_vol_5'] / (df['realized_vol_20'] + 1e-8)
179
+
180
+ # Volatility of volatility (vol-of-vol)
181
+ df['vol_of_vol'] = df['realized_vol_5'].rolling(10).std()
182
+
183
+ return df
184
+
185
+ def _compute_volume_features(self, df: pd.DataFrame) -> pd.DataFrame:
186
+ """Compute volume-based features."""
187
+ df = df.copy()
188
+
189
+ # Volume moving averages
190
+ for period in [5, 10, 20]:
191
+ df[f'vol_sma_{period}'] = df['volume'].rolling(period).mean()
192
+ df[f'vol_ratio_{period}'] = df['volume'] / (df[f'vol_sma_{period}'] + 1e-8)
193
+
194
+ # On-Balance Volume (OBV)
195
+ df['obv'] = ta.volume.OnBalanceVolumeIndicator(
196
+ close=df['close'], volume=df['volume']
197
+ ).on_balance_volume()
198
+ df['obv_sma'] = df['obv'].rolling(20).mean()
199
+ df['obv_ratio'] = df['obv'] / (df['obv_sma'] + 1e-8)
200
+
201
+ # Volume-Price Trend
202
+ df['vpt'] = ta.volume.VolumePriceTrendIndicator(
203
+ close=df['close'], volume=df['volume']
204
+ ).volume_price_trend()
205
+
206
+ # VWAP approximation
207
+ df['vwap'] = (df['volume'] * (df['high'] + df['low'] + df['close']) / 3).cumsum() / df['volume'].cumsum()
208
+ df['price_to_vwap'] = df['close'] / (df['vwap'] + 1e-8)
209
+
210
+ # Money Flow Index
211
+ df['mfi'] = ta.volume.MFIIndicator(
212
+ high=df['high'], low=df['low'], close=df['close'], volume=df['volume']
213
+ ).money_flow_index()
214
+
215
+ return df
216
+
217
+ def _compute_regime_features(self, df: pd.DataFrame) -> pd.DataFrame:
218
+ """
219
+ Market regime detection features.
220
+
221
+ Regimes: Trending (bullish/bearish), Mean-reverting, High-volatility
222
+ Based on ADX, volatility clustering, and trend strength.
223
+ """
224
+ df = df.copy()
225
+
226
+ # Trend strength (based on ADX and EMAs)
227
+ if 'adx' in df.columns:
228
+ df['is_trending'] = (df['adx'] > 25).astype(float)
229
+ df['trend_direction'] = np.where(
230
+ df['ema_9'] > df['ema_21'], 1.0, -1.0
231
+ )
232
+ df['trend_strength'] = df['is_trending'] * df['trend_direction']
233
+
234
+ # Regime: volatility regime
235
+ vol_median = df['realized_vol_20'].rolling(60).median()
236
+ df['high_vol_regime'] = (df['realized_vol_20'] > vol_median).astype(float)
237
+
238
+ # Regime: mean reversion tendency
239
+ # Hurst exponent approximation (simple R/S analysis)
240
+ window = 20
241
+ returns = df['log_return']
242
+ cumdev = (returns - returns.rolling(window).mean()).rolling(window).sum()
243
+ r_range = cumdev.rolling(window).max() - cumdev.rolling(window).min()
244
+ s = returns.rolling(window).std()
245
+ df['hurst_approx'] = np.log(r_range / (s + 1e-8) + 1e-8) / np.log(window)
246
+
247
+ # Regime classification: 0=mean-reverting, 1=random walk, 2=trending
248
+ df['regime_class'] = np.where(
249
+ df['hurst_approx'] < 0.4, 0,
250
+ np.where(df['hurst_approx'] > 0.6, 2, 1)
251
+ )
252
+
253
+ # EMA crossover signals
254
+ df['ema_cross_9_21'] = np.where(
255
+ (df['ema_9'] > df['ema_21']) & (df['ema_9'].shift(1) <= df['ema_21'].shift(1)), 1,
256
+ np.where(
257
+ (df['ema_9'] < df['ema_21']) & (df['ema_9'].shift(1) >= df['ema_21'].shift(1)), -1, 0
258
+ )
259
+ ).astype(float)
260
+
261
+ return df
262
+
263
+ def _compute_targets(self, df: pd.DataFrame) -> pd.DataFrame:
264
+ """Compute multi-horizon prediction targets."""
265
+ df = df.copy()
266
+
267
+ for h in self.prediction_horizons:
268
+ # Return target (continuous)
269
+ df[f'target_return_{h}'] = df['close'].shift(-h) / df['close'] - 1
270
+
271
+ # Direction target (binary: 1=up, 0=down)
272
+ df[f'target_direction_{h}'] = (df[f'target_return_{h}'] > 0).astype(float)
273
+
274
+ # Magnitude target (absolute return)
275
+ df[f'target_magnitude_{h}'] = df[f'target_return_{h}'].abs()
276
+
277
+ return df
278
+
279
+ def normalize_features(self, df: pd.DataFrame, method: str = 'zscore') -> Tuple[pd.DataFrame, Dict]:
280
+ """
281
+ Normalize features using z-score or min-max.
282
+
283
+ Returns:
284
+ Normalized DataFrame and normalization parameters
285
+ """
286
+ feature_cols = self.feature_names
287
+ norm_params = {}
288
+ df_norm = df.copy()
289
+
290
+ for col in feature_cols:
291
+ if col in df_norm.columns:
292
+ if method == 'zscore':
293
+ mean = df_norm[col].mean()
294
+ std = df_norm[col].std() + 1e-8
295
+ df_norm[col] = (df_norm[col] - mean) / std
296
+ norm_params[col] = {'mean': mean, 'std': std}
297
+ elif method == 'minmax':
298
+ min_val = df_norm[col].min()
299
+ max_val = df_norm[col].max()
300
+ df_norm[col] = (df_norm[col] - min_val) / (max_val - min_val + 1e-8)
301
+ norm_params[col] = {'min': min_val, 'max': max_val}
302
+
303
+ return df_norm, norm_params
304
+
305
+ def create_sequences(self, df: pd.DataFrame, feature_cols: List[str] = None,
306
+ target_cols: List[str] = None) -> Tuple[np.ndarray, np.ndarray]:
307
+ """
308
+ Create windowed sequences for model input.
309
+
310
+ PatchTST-style: (batch, channels, sequence_length)
311
+
312
+ Args:
313
+ df: Feature DataFrame
314
+ feature_cols: Columns to use as input features
315
+ target_cols: Columns to use as targets
316
+
317
+ Returns:
318
+ X: (N, num_features, lookback_window)
319
+ y: (N, num_targets)
320
+ """
321
+ if feature_cols is None:
322
+ feature_cols = self.feature_names
323
+ if target_cols is None:
324
+ target_cols = [c for c in df.columns if 'target' in c]
325
+
326
+ # Filter to existing columns
327
+ feature_cols = [c for c in feature_cols if c in df.columns]
328
+ target_cols = [c for c in target_cols if c in df.columns]
329
+
330
+ X_data = df[feature_cols].values
331
+ y_data = df[target_cols].values
332
+
333
+ X_sequences = []
334
+ y_sequences = []
335
+
336
+ for i in range(self.lookback_window, len(df)):
337
+ X_sequences.append(X_data[i - self.lookback_window:i].T) # (features, lookback)
338
+ y_sequences.append(y_data[i])
339
+
340
+ return np.array(X_sequences, dtype=np.float32), np.array(y_sequences, dtype=np.float32)
341
+
342
+
343
+ class SentimentFeatureEngine:
344
+ """
345
+ Process sentiment from financial news/tweets.
346
+
347
+ Inspired by FinMultiTime (2506.05019) multi-modal approach.
348
+ Supports pre-computed sentiment scores.
349
+ """
350
+
351
+ def __init__(self):
352
+ self.sentiment_vocab = {
353
+ 'bullish': 1.0, 'bearish': -1.0, 'upgrade': 0.8, 'downgrade': -0.8,
354
+ 'beat': 0.6, 'miss': -0.6, 'growth': 0.5, 'decline': -0.5,
355
+ 'profit': 0.4, 'loss': -0.4, 'buy': 0.7, 'sell': -0.7,
356
+ 'outperform': 0.8, 'underperform': -0.8, 'raise': 0.5, 'cut': -0.5,
357
+ 'positive': 0.6, 'negative': -0.6, 'strong': 0.4, 'weak': -0.4,
358
+ 'rally': 0.7, 'crash': -0.9, 'surge': 0.8, 'plunge': -0.8,
359
+ 'breakout': 0.6, 'breakdown': -0.6, 'recovery': 0.5, 'recession': -0.7,
360
+ }
361
+
362
+ def compute_rule_based_sentiment(self, text: str) -> float:
363
+ """Simple rule-based sentiment scorer using financial lexicon."""
364
+ text_lower = text.lower()
365
+ score = 0.0
366
+ count = 0
367
+ for word, value in self.sentiment_vocab.items():
368
+ if word in text_lower:
369
+ score += value
370
+ count += 1
371
+ return score / max(count, 1)
372
+
373
+ def aggregate_daily_sentiment(self, sentiments: pd.DataFrame,
374
+ date_col: str = 'date',
375
+ score_col: str = 'sentiment') -> pd.DataFrame:
376
+ """
377
+ Aggregate sentiment scores to daily level.
378
+
379
+ Returns: DataFrame with daily sentiment features:
380
+ - mean sentiment
381
+ - sentiment std (disagreement)
382
+ - sentiment count (attention)
383
+ - positive ratio
384
+ """
385
+ daily = sentiments.groupby(date_col).agg(
386
+ sentiment_mean=(score_col, 'mean'),
387
+ sentiment_std=(score_col, 'std'),
388
+ sentiment_count=(score_col, 'count'),
389
+ sentiment_positive_ratio=(score_col, lambda x: (x > 0).mean()),
390
+ ).reset_index()
391
+
392
+ daily['sentiment_std'] = daily['sentiment_std'].fillna(0)
393
+
394
+ # Momentum of sentiment
395
+ daily['sentiment_momentum_3'] = daily['sentiment_mean'].rolling(3).mean()
396
+ daily['sentiment_momentum_7'] = daily['sentiment_mean'].rolling(7).mean()
397
+
398
+ # Sentiment reversal signal
399
+ daily['sentiment_reversal'] = daily['sentiment_mean'] - daily['sentiment_momentum_7']
400
+
401
+ return daily