File size: 9,201 Bytes
c522dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Market Data Pipeline for AlphaForge."""
import numpy as np
import pandas as pd
import yfinance as yf
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')


class MarketDataPipeline:
    """Fetch and preprocess market data into model-ready feature matrices.

    Pipeline stages:
        1. ``fetch_data``                   -- download daily OHLCV bars via yfinance.
        2. ``compute_technical_indicators`` -- per-asset technical features.
        3. ``compute_cross_asset_features`` -- beta / correlation / spread features.
        4. ``create_feature_matrix``        -- merged, rolling z-scored long matrix.
        5. ``create_sequences``             -- (lookback, n_features) windows plus
                                               forward log-return targets.
    """

    # Canonical OHLCV column names every downloaded frame is normalized to.
    _STANDARD_COLS = ('Open', 'High', 'Low', 'Close', 'Volume')

    def __init__(self, tickers: List[str], start_date: str, end_date: str):
        """
        Args:
            tickers: symbols to download (e.g. ['SPY', 'XLF']).
            start_date: inclusive start date, ISO format ('YYYY-MM-DD').
            end_date: end date, ISO format (passed straight to yfinance).
        """
        self.tickers = tickers
        self.start_date = start_date
        self.end_date = end_date
        # ticker -> normalized OHLCV DataFrame, populated by fetch_data()
        self.data: Dict[str, pd.DataFrame] = {}

    @staticmethod
    def _to_1d(col) -> np.ndarray:
        """Flatten a Series / single-column frame to a 1-D float array."""
        return np.asarray(col, dtype=float).flatten()

    @classmethod
    def _normalize_columns(cls, df: pd.DataFrame) -> pd.DataFrame:
        """Map vendor column names onto the canonical OHLCV names.

        Exact (case-insensitive) matches are claimed first so that a fuzzy
        match such as 'Adj Close' can never shadow the real 'Close' column.
        (The previous substring-only mapping renamed BOTH to 'Close',
        producing duplicate columns that corrupted downstream flattening.)
        Missing canonical columns are added as all-NaN.
        """
        # Flatten multi-index columns if present (yfinance group_by output).
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.get_level_values(0)

        col_map = {}
        claimed = set()
        # Pass 1: exact matches win.
        for c in df.columns:
            name = str(c).strip().title()
            if name in cls._STANDARD_COLS and name not in claimed:
                col_map[c] = name
                claimed.add(name)
        # Pass 2: substring matches fill whatever is still unclaimed.
        for c in df.columns:
            if c in col_map:
                continue
            sc = str(c).upper()
            for name in cls._STANDARD_COLS:
                # 'VOL' also catches abbreviated volume column names.
                key = 'VOL' if name == 'Volume' else name.upper()
                if key in sc and name not in claimed:
                    col_map[c] = name
                    claimed.add(name)
                    break
        if col_map:
            df = df.rename(columns=col_map)
        # Guarantee the full canonical schema for downstream code.
        for req in cls._STANDARD_COLS:
            if req not in df.columns:
                df[req] = np.nan
        return df

    def fetch_data(self) -> Dict[str, pd.DataFrame]:
        """Fetch OHLCV data for all tickers.

        Tickers that fail to download or return <= 100 rows are skipped so a
        single bad symbol cannot abort the whole batch.

        Returns:
            Mapping of ticker -> normalized OHLCV DataFrame (also stored on
            ``self.data``).
        """
        print(f"Fetching data for {len(self.tickers)} tickers...")
        for ticker in self.tickers:
            try:
                df = yf.download(ticker, start=self.start_date, end=self.end_date, progress=False)
                if df is not None and len(df) > 100:
                    self.data[ticker] = self._normalize_columns(df)
            except Exception as e:
                # Best-effort by design: log and continue with the rest.
                print(f"Error fetching {ticker}: {e}")
        print(f"Successfully fetched {len(self.data)} tickers")
        return self.data

    def compute_technical_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute per-asset technical indicator features.

        All lag-based features use pandas ``shift`` instead of ``np.roll``:
        ``np.roll`` wraps around, so the first ``d`` rows of every lagged
        feature previously leaked prices from the *end* of the series into
        the start. With ``shift`` those warm-up rows are NaN and become 0.

        Args:
            df: frame with 'Open','High','Low','Close','Volume' columns.

        Returns:
            DataFrame indexed like ``df``; infinities and NaNs replaced by 0.
        """
        features = pd.DataFrame(index=df.index)
        close = pd.Series(self._to_1d(df['Close']), index=df.index)
        high = pd.Series(self._to_1d(df['High']), index=df.index)
        low = pd.Series(self._to_1d(df['Low']), index=df.index)
        volume = pd.Series(self._to_1d(df['Volume']), index=df.index)

        # Log returns over multiple horizons.
        for d in [1, 5, 10, 21, 63]:
            features[f'return_{d}d'] = np.log(close / close.shift(d))

        # Annualized realized volatility from daily log returns.
        log_ret = np.log(close / close.shift(1))
        for d in [5, 21, 63]:
            # w=d binds the horizon eagerly (defensive against late binding).
            features[f'rvol_{d}d'] = log_ret.rolling(d).apply(
                lambda x, w=d: np.sqrt(252 / w * np.sum(x ** 2)))

        # Relative distance of price from each SMA.
        for d in [5, 10, 20, 50, 200]:
            features[f'sma_{d}d'] = close.rolling(d).mean() / close - 1

        # RSI(14), simple-moving-average variant.
        delta = close.diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)
        rs = gain.rolling(14).mean() / loss.rolling(14).mean()
        features['rsi_14'] = 100 - 100 / (1 + rs)

        # MACD normalized by price, plus its 9-span signal line.
        ema12 = close.ewm(span=12).mean()
        ema26 = close.ewm(span=26).mean()
        features['macd'] = (ema12 - ema26) / close
        features['macd_signal'] = features['macd'].ewm(span=9).mean()

        # Bollinger band position: signed distance from SMA20 in 2-sigma
        # units. (The old code called .flatten() on a Series -> AttributeError.)
        sma20 = close.rolling(20).mean()
        std20 = close.rolling(20).std()
        features['bb_position'] = (close - sma20) / (2 * std20)

        # Volume indicators.
        features['volume_sma_ratio'] = volume / volume.rolling(20).mean()
        features['volume_change'] = np.log(volume / volume.shift(1))

        # Price-based features. NOTE(review): 'open_gap' is actually the
        # close-to-close return (it never reads 'Open'); the column name is
        # kept for backward compatibility with downstream consumers.
        features['intraday_range'] = (high - low) / close
        features['open_gap'] = close.pct_change()

        return features.replace([np.inf, -np.inf], np.nan).fillna(0)

    def compute_cross_asset_features(self) -> pd.DataFrame:
        """Compute cross-asset correlation and spread features.

        Returns:
            DataFrame indexed by the union of all tickers' dates; empty
            frame if no data has been fetched yet.
        """
        if not self.data:
            return pd.DataFrame()

        # Daily log returns per ticker, kept as Series so tickers with
        # different trading histories align on the union of dates instead of
        # crashing on a length mismatch; missing dates become 0.
        returns = {}
        for ticker, df in self.data.items():
            close = pd.Series(self._to_1d(df['Close']), index=df.index)
            returns[ticker] = np.log(close / close.shift(1))
        returns_df = pd.DataFrame(returns).fillna(0)

        features = pd.DataFrame(index=returns_df.index)

        # Rolling 63-day beta of each asset vs. the SPY market proxy.
        if 'SPY' in returns_df.columns:
            spy = returns_df['SPY']
            spy_var = spy.rolling(63).var()
            for ticker in returns_df.columns:
                if ticker != 'SPY':
                    features[f'{ticker}_beta'] = (
                        returns_df[ticker].rolling(63).cov(spy) / spy_var)

        # Average pairwise rolling correlation across the universe.
        corr_window = returns_df.rolling(63).corr()
        features['avg_correlation'] = (
            corr_window.groupby(level=0).mean().mean(axis=1).values)

        # Sector momentum spreads (21-day cumulative return differences).
        if all(x in returns_df.columns for x in ['XLF', 'XLK', 'XLE']):
            features['fin_vs_tech'] = (
                (returns_df['XLF'] - returns_df['XLK']).rolling(21).sum())
            # BUG FIX: the original indexed returns_df['SPY'] here without
            # checking it exists, raising KeyError when SPY was not fetched.
            if 'SPY' in returns_df.columns:
                features['energy_vs_market'] = (
                    (returns_df['XLE'] - returns_df['SPY']).rolling(21).sum())

        return features.fillna(0)

    def create_feature_matrix(self) -> pd.DataFrame:
        """Create the unified, normalized feature matrix for all assets.

        Stacks per-ticker technical features (adding 'ticker' and 'close'
        columns), broadcasts cross-asset features onto every ticker's rows by
        date, then applies a per-ticker rolling 21-day z-score to every
        numeric column.

        Returns:
            Long-format DataFrame indexed by date (dates duplicated across
            tickers); all NaN/inf replaced by 0.
        """
        all_features = []
        for ticker, df in self.data.items():
            tech = self.compute_technical_indicators(df)
            tech['ticker'] = ticker
            tech['close'] = self._to_1d(df['Close'])
            all_features.append(tech)

        features_df = pd.concat(all_features, ignore_index=False)

        # Broadcast cross-asset features onto every row sharing the same
        # date. reindex() is O(n) and replaces the original per-date Python
        # loop (O(rows * dates)); dates absent from cross_features become
        # NaN, exactly as before, and are zero-filled below.
        cross_features = self.compute_cross_asset_features()
        for col in cross_features.columns:
            features_df[col] = cross_features[col].reindex(features_df.index).values

        features_df = features_df.fillna(0)

        # Rolling 21-day z-score per ticker; a zero std maps to 1 so constant
        # windows normalize to 0 instead of dividing by zero.
        numeric_cols = [c for c in features_df.columns if c not in ('ticker', 'close')]
        normalized = features_df.copy()
        for ticker in normalized['ticker'].unique():
            mask = normalized['ticker'] == ticker
            block = normalized.loc[mask, numeric_cols]
            mean = block.rolling(21).mean()
            std = block.rolling(21).std().replace(0, 1)
            normalized.loc[mask, numeric_cols] = (block - mean) / std

        return normalized.replace([np.inf, -np.inf], 0).fillna(0)

    def create_sequences(self, features_df: pd.DataFrame, lookback: int = 60,
                         forecast_horizon: int = 5) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Create (lookback, n_features) windows with forward-return targets.

        Args:
            features_df: output of ``create_feature_matrix``.
            lookback: number of time steps per input window.
            forecast_horizon: target is the log return over this many steps.

        Returns:
            Tuple ``(X, y, tickers, dates)``: X has shape
            (n_samples, lookback, n_features); y is the forward log return;
            tickers/dates identify each sample's asset and as-of date.
        """
        X_list, y_list, tickers_list, dates_list = [], [], [], []
        feature_cols = [c for c in features_df.columns if c not in ('ticker', 'close')]

        for ticker in features_df['ticker'].unique():
            ticker_df = features_df[features_df['ticker'] == ticker].sort_index()

            # Require enough history for one window, the target horizon, and
            # the 21-day normalization warm-up.
            if len(ticker_df) < lookback + forecast_horizon + 21:
                continue

            values = ticker_df[feature_cols].values
            closes = ticker_df['close'].values

            for i in range(lookback, len(values) - forecast_horizon):
                X_list.append(values[i - lookback:i])
                # Target: forward log return from t to t+horizon.
                y_list.append(np.log(closes[i + forecast_horizon] / closes[i]))
                tickers_list.append(ticker)
                dates_list.append(ticker_df.index[i])

        return (np.array(X_list), np.array(y_list),
                np.array(tickers_list), np.array(dates_list))