Premchan369 commited on
Commit
c522dca
·
verified ·
1 Parent(s): 2065fdc

Upload market_data.py

Browse files
Files changed (1) hide show
  1. market_data.py +200 -0
market_data.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Market Data Pipeline for AlphaForge."""
2
+ import numpy as np
3
+ import pandas as pd
4
+ import yfinance as yf
5
+ from typing import Dict, List, Optional, Tuple
6
+ import warnings
7
+ warnings.filterwarnings('ignore')
8
+
9
+
10
+ class MarketDataPipeline:
11
+ """Fetch and preprocess market data"""
12
+
13
+ def __init__(self, tickers: List[str], start_date: str, end_date: str):
14
+ self.tickers = tickers
15
+ self.start_date = start_date
16
+ self.end_date = end_date
17
+ self.data = {}
18
+
19
+ def fetch_data(self) -> Dict[str, pd.DataFrame]:
20
+ """Fetch OHLCV data for all tickers"""
21
+ print(f"Fetching data for {len(self.tickers)} tickers...")
22
+ for ticker in self.tickers:
23
+ try:
24
+ df = yf.download(ticker, start=self.start_date, end=self.end_date, progress=False)
25
+ if len(df) > 100:
26
+ # Flatten multi-index columns if present
27
+ if isinstance(df.columns, pd.MultiIndex):
28
+ df.columns = df.columns.get_level_values(0)
29
+ df.columns = [c.title() if isinstance(c, str) else c for c in df.columns]
30
+ # Ensure standard column names
31
+ col_map = {}
32
+ for c in df.columns:
33
+ sc = str(c).upper()
34
+ if 'OPEN' in sc:
35
+ col_map[c] = 'Open'
36
+ elif 'HIGH' in sc:
37
+ col_map[c] = 'High'
38
+ elif 'LOW' in sc:
39
+ col_map[c] = 'Low'
40
+ elif 'CLOSE' in sc:
41
+ col_map[c] = 'Close'
42
+ elif 'VOLUME' in sc or 'VOL' in sc:
43
+ col_map[c] = 'Volume'
44
+ if col_map:
45
+ df = df.rename(columns=col_map)
46
+ for req in ['Open', 'High', 'Low', 'Close', 'Volume']:
47
+ if req not in df.columns:
48
+ df[req] = np.nan
49
+ self.data[ticker] = df
50
+ except Exception as e:
51
+ print(f"Error fetching {ticker}: {e}")
52
+ print(f"Successfully fetched {len(self.data)} tickers")
53
+ return self.data
54
+
55
+ def compute_technical_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
56
+ """Compute technical indicators"""
57
+ features = pd.DataFrame(index=df.index)
58
+ close = df['Close'].values.flatten() if hasattr(df['Close'], 'values') else np.array(df['Close']).flatten()
59
+ high = df['High'].values.flatten() if hasattr(df['High'], 'values') else np.array(df['High']).flatten()
60
+ low = df['Low'].values.flatten() if hasattr(df['Low'], 'values') else np.array(df['Low']).flatten()
61
+ volume = df['Volume'].values.flatten() if hasattr(df['Volume'], 'values') else np.array(df['Volume']).flatten()
62
+
63
+ # Returns
64
+ for d in [1, 5, 10, 21, 63]:
65
+ features[f'return_{d}d'] = np.log(close / np.roll(close, d))
66
+
67
+ # Realized volatility
68
+ log_ret = np.log(close / np.roll(close, 1))
69
+ for d in [5, 21, 63]:
70
+ rvol = pd.Series(log_ret).rolling(d).apply(lambda x: np.sqrt(252/d * np.sum(x**2)))
71
+ features[f'rvol_{d}d'] = rvol.values
72
+
73
+ # Moving averages
74
+ for d in [5, 10, 20, 50, 200]:
75
+ sma = pd.Series(close).rolling(d).mean()
76
+ features[f'sma_{d}d'] = sma.values / close - 1
77
+
78
+ # RSI
79
+ delta = pd.Series(close).diff()
80
+ gain = delta.where(delta > 0, 0)
81
+ loss = -delta.where(delta < 0, 0)
82
+ avg_gain = gain.rolling(14).mean()
83
+ avg_loss = loss.rolling(14).mean()
84
+ rs = avg_gain / avg_loss
85
+ features['rsi_14'] = (100 - 100 / (1 + rs)).values
86
+
87
+ # MACD
88
+ ema12 = pd.Series(close).ewm(span=12).mean()
89
+ ema26 = pd.Series(close).ewm(span=26).mean()
90
+ features['macd'] = (ema12 - ema26).values / close
91
+ features['macd_signal'] = pd.Series(features['macd']).ewm(span=9).mean().values
92
+
93
+ # Bollinger Bands
94
+ sma20 = pd.Series(close).rolling(20).mean()
95
+ std20 = pd.Series(close).rolling(20).std()
96
+ features['bb_position'] = ((close - sma20) / (2 * std20)).flatten() if hasattr(sma20, 'values') else (close - sma20.values) / (2 * std20.values)
97
+
98
+ # Volume indicators
99
+ features['volume_sma_ratio'] = (volume / pd.Series(volume).rolling(20).mean().values)
100
+ features['volume_change'] = np.log(volume / np.roll(volume, 1))
101
+
102
+ # Price-based
103
+ features['intraday_range'] = (high - low) / close
104
+ features['open_gap'] = (close - np.roll(close, 1)) / np.roll(close, 1)
105
+
106
+ return features.replace([np.inf, -np.inf], np.nan).fillna(0)
107
+
108
+ def compute_cross_asset_features(self) -> pd.DataFrame:
109
+ """Compute cross-asset correlation and spread features"""
110
+ returns = {}
111
+ for ticker, df in self.data.items():
112
+ close = df['Close'].values.flatten() if hasattr(df['Close'], 'values') else np.array(df['Close']).flatten()
113
+ returns[ticker] = np.log(close / np.roll(close, 1))
114
+
115
+ returns_df = pd.DataFrame(returns, index=list(self.data.values())[0].index).fillna(0)
116
+
117
+ features = pd.DataFrame(index=returns_df.index)
118
+
119
+ # Market beta (vs SPY)
120
+ if 'SPY' in returns_df.columns:
121
+ for ticker in returns_df.columns:
122
+ if ticker != 'SPY':
123
+ beta = returns_df[ticker].rolling(63).cov(returns_df['SPY']) / returns_df['SPY'].rolling(63).var()
124
+ features[f'{ticker}_beta'] = beta.values
125
+
126
+ # Average correlation
127
+ corr_window = returns_df.rolling(63).corr()
128
+ features['avg_correlation'] = corr_window.groupby(level=0).mean().mean(axis=1).values
129
+
130
+ # Sector momentum spreads
131
+ if all(x in returns_df.columns for x in ['XLF', 'XLK', 'XLE']):
132
+ features['fin_vs_tech'] = (returns_df['XLF'] - returns_df['XLK']).rolling(21).sum().values
133
+ features['energy_vs_market'] = (returns_df['XLE'] - returns_df['SPY']).rolling(21).sum().values
134
+
135
+ return features.fillna(0)
136
+
137
+ def create_feature_matrix(self) -> pd.DataFrame:
138
+ """Create unified feature matrix for all assets"""
139
+ all_features = []
140
+
141
+ for ticker, df in self.data.items():
142
+ tech_features = self.compute_technical_indicators(df)
143
+ tech_features['ticker'] = ticker
144
+ tech_features['close'] = df['Close'].values.flatten() if hasattr(df['Close'], 'values') else np.array(df['Close']).flatten()
145
+ all_features.append(tech_features)
146
+
147
+ features_df = pd.concat(all_features, ignore_index=False)
148
+
149
+ # Add cross-asset features
150
+ cross_features = self.compute_cross_asset_features()
151
+
152
+ # Merge
153
+ for col in cross_features.columns:
154
+ features_df[col] = np.nan
155
+ for idx in features_df.index.unique():
156
+ if idx in cross_features.index:
157
+ features_df.loc[idx, col] = cross_features.loc[idx, col]
158
+
159
+ features_df = features_df.fillna(0)
160
+
161
+ # Sliding window z-score normalization per ticker
162
+ numeric_cols = [c for c in features_df.columns if c not in ['ticker', 'close']]
163
+ normalized = features_df.copy()
164
+
165
+ for ticker in normalized['ticker'].unique():
166
+ mask = normalized['ticker'] == ticker
167
+ for col in numeric_cols:
168
+ series = normalized.loc[mask, col]
169
+ normalized.loc[mask, col] = (series - series.rolling(21).mean()) / series.rolling(21).std().replace(0, 1)
170
+
171
+ return normalized.replace([np.inf, -np.inf], 0).fillna(0)
172
+
173
+ def create_sequences(self, features_df: pd.DataFrame, lookback: int = 60,
174
+ forecast_horizon: int = 5) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
175
+ """Create sequences for time series models"""
176
+ X_list, y_list, tickers_list, dates_list = [], [], [], []
177
+
178
+ feature_cols = [c for c in features_df.columns if c not in ['ticker', 'close']]
179
+
180
+ for ticker in features_df['ticker'].unique():
181
+ ticker_df = features_df[features_df['ticker'] == ticker].copy()
182
+ ticker_df = ticker_df.sort_index()
183
+
184
+ if len(ticker_df) < lookback + forecast_horizon + 21:
185
+ continue
186
+
187
+ values = ticker_df[feature_cols].values
188
+ closes = ticker_df['close'].values
189
+
190
+ for i in range(lookback, len(values) - forecast_horizon):
191
+ X_list.append(values[i-lookback:i])
192
+
193
+ # Target: future return
194
+ future_return = np.log(closes[i + forecast_horizon] / closes[i])
195
+ y_list.append(future_return)
196
+ tickers_list.append(ticker)
197
+ dates_list.append(ticker_df.index[i])
198
+
199
+ return (np.array(X_list), np.array(y_list),
200
+ np.array(tickers_list), np.array(dates_list))