Premchan369
/

alphaforge-quant-system

+"""Market Data Pipeline for AlphaForge."""
+import numpy as np
+import pandas as pd
+import yfinance as yf
+from typing import Dict, List, Optional, Tuple
+import warnings
+warnings.filterwarnings('ignore')
+class MarketDataPipeline:
+    """Fetch and preprocess market data"""
+    def __init__(self, tickers: List[str], start_date: str, end_date: str):
+        self.tickers = tickers
+        self.start_date = start_date
+        self.end_date = end_date
+        self.data = {}
+    def fetch_data(self) -> Dict[str, pd.DataFrame]:
+        """Fetch OHLCV data for all tickers"""
+        print(f"Fetching data for {len(self.tickers)} tickers...")
+        for ticker in self.tickers:
+            try:
+                df = yf.download(ticker, start=self.start_date, end=self.end_date, progress=False)
+                if len(df) > 100:
+                    # Flatten multi-index columns if present
+                    if isinstance(df.columns, pd.MultiIndex):
+                        df.columns = df.columns.get_level_values(0)
+                    df.columns = [c.title() if isinstance(c, str) else c for c in df.columns]
+                    # Ensure standard column names
+                    col_map = {}
+                    for c in df.columns:
+                        sc = str(c).upper()
+                        if 'OPEN' in sc:
+                            col_map[c] = 'Open'
+                        elif 'HIGH' in sc:
+                            col_map[c] = 'High'
+                        elif 'LOW' in sc:
+                            col_map[c] = 'Low'
+                        elif 'CLOSE' in sc:
+                            col_map[c] = 'Close'
+                        elif 'VOLUME' in sc or 'VOL' in sc:
+                            col_map[c] = 'Volume'
+                    if col_map:
+                        df = df.rename(columns=col_map)
+                    for req in ['Open', 'High', 'Low', 'Close', 'Volume']:
+                        if req not in df.columns:
+                            df[req] = np.nan
+                    self.data[ticker] = df
+            except Exception as e:
+                print(f"Error fetching {ticker}: {e}")
+        print(f"Successfully fetched {len(self.data)} tickers")
+        return self.data
+    def compute_technical_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Compute technical indicators"""
+        features = pd.DataFrame(index=df.index)
+        close = df['Close'].values.flatten() if hasattr(df['Close'], 'values') else np.array(df['Close']).flatten()
+        high = df['High'].values.flatten() if hasattr(df['High'], 'values') else np.array(df['High']).flatten()
+        low = df['Low'].values.flatten() if hasattr(df['Low'], 'values') else np.array(df['Low']).flatten()
+        volume = df['Volume'].values.flatten() if hasattr(df['Volume'], 'values') else np.array(df['Volume']).flatten()
+        # Returns
+        for d in [1, 5, 10, 21, 63]:
+            features[f'return_{d}d'] = np.log(close / np.roll(close, d))
+        # Realized volatility
+        log_ret = np.log(close / np.roll(close, 1))
+        for d in [5, 21, 63]:
+            rvol = pd.Series(log_ret).rolling(d).apply(lambda x: np.sqrt(252/d * np.sum(x**2)))
+            features[f'rvol_{d}d'] = rvol.values
+        # Moving averages
+        for d in [5, 10, 20, 50, 200]:
+            sma = pd.Series(close).rolling(d).mean()
+            features[f'sma_{d}d'] = sma.values / close - 1
+        # RSI
+        delta = pd.Series(close).diff()
+        gain = delta.where(delta > 0, 0)
+        loss = -delta.where(delta < 0, 0)
+        avg_gain = gain.rolling(14).mean()
+        avg_loss = loss.rolling(14).mean()
+        rs = avg_gain / avg_loss
+        features['rsi_14'] = (100 - 100 / (1 + rs)).values
+        # MACD
+        ema12 = pd.Series(close).ewm(span=12).mean()
+        ema26 = pd.Series(close).ewm(span=26).mean()
+        features['macd'] = (ema12 - ema26).values / close
+        features['macd_signal'] = pd.Series(features['macd']).ewm(span=9).mean().values
+        # Bollinger Bands
+        sma20 = pd.Series(close).rolling(20).mean()
+        std20 = pd.Series(close).rolling(20).std()
+        features['bb_position'] = ((close - sma20) / (2 * std20)).flatten() if hasattr(sma20, 'values') else (close - sma20.values) / (2 * std20.values)
+        # Volume indicators
+        features['volume_sma_ratio'] = (volume / pd.Series(volume).rolling(20).mean().values)
+        features['volume_change'] = np.log(volume / np.roll(volume, 1))
+        # Price-based
+        features['intraday_range'] = (high - low) / close
+        features['open_gap'] = (close - np.roll(close, 1)) / np.roll(close, 1)
+        return features.replace([np.inf, -np.inf], np.nan).fillna(0)
+    def compute_cross_asset_features(self) -> pd.DataFrame:
+        """Compute cross-asset correlation and spread features"""
+        returns = {}
+        for ticker, df in self.data.items():
+            close = df['Close'].values.flatten() if hasattr(df['Close'], 'values') else np.array(df['Close']).flatten()
+            returns[ticker] = np.log(close / np.roll(close, 1))
+        returns_df = pd.DataFrame(returns, index=list(self.data.values())[0].index).fillna(0)
+        features = pd.DataFrame(index=returns_df.index)
+        # Market beta (vs SPY)
+        if 'SPY' in returns_df.columns:
+            for ticker in returns_df.columns:
+                if ticker != 'SPY':
+                    beta = returns_df[ticker].rolling(63).cov(returns_df['SPY']) / returns_df['SPY'].rolling(63).var()
+                    features[f'{ticker}_beta'] = beta.values
+        # Average correlation
+        corr_window = returns_df.rolling(63).corr()
+        features['avg_correlation'] = corr_window.groupby(level=0).mean().mean(axis=1).values
+        # Sector momentum spreads
+        if all(x in returns_df.columns for x in ['XLF', 'XLK', 'XLE']):
+            features['fin_vs_tech'] = (returns_df['XLF'] - returns_df['XLK']).rolling(21).sum().values
+            features['energy_vs_market'] = (returns_df['XLE'] - returns_df['SPY']).rolling(21).sum().values
+        return features.fillna(0)
+    def create_feature_matrix(self) -> pd.DataFrame:
+        """Create unified feature matrix for all assets"""
+        all_features = []
+        for ticker, df in self.data.items():
+            tech_features = self.compute_technical_indicators(df)
+            tech_features['ticker'] = ticker
+            tech_features['close'] = df['Close'].values.flatten() if hasattr(df['Close'], 'values') else np.array(df['Close']).flatten()
+            all_features.append(tech_features)
+        features_df = pd.concat(all_features, ignore_index=False)
+        # Add cross-asset features
+        cross_features = self.compute_cross_asset_features()
+        # Merge
+        for col in cross_features.columns:
+            features_df[col] = np.nan
+            for idx in features_df.index.unique():
+                if idx in cross_features.index:
+                    features_df.loc[idx, col] = cross_features.loc[idx, col]
+        features_df = features_df.fillna(0)
+        # Sliding window z-score normalization per ticker
+        numeric_cols = [c for c in features_df.columns if c not in ['ticker', 'close']]
+        normalized = features_df.copy()
+        for ticker in normalized['ticker'].unique():
+            mask = normalized['ticker'] == ticker
+            for col in numeric_cols:
+                series = normalized.loc[mask, col]
+                normalized.loc[mask, col] = (series - series.rolling(21).mean()) / series.rolling(21).std().replace(0, 1)
+        return normalized.replace([np.inf, -np.inf], 0).fillna(0)
+    def create_sequences(self, features_df: pd.DataFrame, lookback: int = 60,
+                         forecast_horizon: int = 5) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Create sequences for time series models"""
+        X_list, y_list, tickers_list, dates_list = [], [], [], []
+        feature_cols = [c for c in features_df.columns if c not in ['ticker', 'close']]
+        for ticker in features_df['ticker'].unique():
+            ticker_df = features_df[features_df['ticker'] == ticker].copy()
+            ticker_df = ticker_df.sort_index()
+            if len(ticker_df) < lookback + forecast_horizon + 21:
+                continue
+            values = ticker_df[feature_cols].values
+            closes = ticker_df['close'].values
+            for i in range(lookback, len(values) - forecast_horizon):
+                X_list.append(values[i-lookback:i])
+                # Target: future return
+                future_return = np.log(closes[i + forecast_horizon] / closes[i])
+                y_list.append(future_return)
+                tickers_list.append(ticker)
+                dates_list.append(ticker_df.index[i])
+        return (np.array(X_list), np.array(y_list),
+                np.array(tickers_list), np.array(dates_list))