import os
from typing import Tuple, List

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import datetime as dt
import time


# -----------------------------------------------------
# 1. Data loading and preprocessing
# -----------------------------------------------------
def load_it_sector_data_from_csvs(
    infy_csv: str,
    tcs_csv: str,
    nifty_it_csv: str,
) -> Tuple[np.ndarray, np.ndarray, List[pd.Timestamp], List[str]]:
    """Load IT sector data from separate CSV files and build cleaned
    feature + target tensors.

    Methodology alignment
    ---------------------
    - Data Collection: uses OHLCV-style fields from the NSE IT sector files.
    - Preprocessing / Cleaning:
        * Parse dates and sort.
        * Filter to equity series (EQ).
        * Remove duplicates and rows with missing / invalid key values.
        * Filter out non-trading days (zero / negative volume).
        * Forward-fill remaining gaps.
    - Derived Indicators:
        * Daily returns.
        * 5-day moving average of close.
        * 20-day rolling volatility of returns.

    Returns
    -------
    features : np.ndarray
        Shape [num_dates, num_companies, num_features]. Per company per day:
        normalized close, normalized volume, raw return, normalized 5-day MA,
        normalized 20-day volatility.
    targets : np.ndarray
        Shape [num_dates, num_companies]. Daily returns per company
        (prediction target).
    dates : list of pd.Timestamp
        Trading dates.
    companies : list of str
        Company tickers (node names in the graph).
    """
    # ---------------------------------
    # Load individual CSVs
    # ---------------------------------
    infy_df = pd.read_csv(infy_csv)
    tcs_df = pd.read_csv(tcs_csv)
    index_df = pd.read_csv(nifty_it_csv)

    # Tag each frame with its ticker so the concatenated panel is keyed
    # by company.
    infy_df["Company"] = "INFY"
    tcs_df["Company"] = "TCS"
    index_df["Company"] = "NIFTY_IT"

    # Parse dates up front for all three frames (mutates the frames in place).
    for frame in (infy_df, tcs_df, index_df):
        frame["Date"] = pd.to_datetime(frame["Date"])

    # The index file has no equity "Series" column; mimic one so the
    # EQ-filter below keeps its rows.
    if "Series" not in index_df.columns:
        index_df["Series"] = "EQ"

    # Unify all three frames to one common OHLCV schema.  (The stock CSVs
    # may carry extra columns such as Symbol, Prev Close, Last, VWAP —
    # those are dropped here.)
    common_cols = [
        "Date",
        "Company",
        "Series",
        "Open",
        "High",
        "Low",
        "Close",
        "Volume",
    ]
    infy_df = infy_df[common_cols]
    tcs_df = tcs_df[common_cols]
    index_df = index_df[common_cols]

    # Concatenate all into one panel-like table.
    df = pd.concat([infy_df, tcs_df, index_df], ignore_index=True)

    # -------------------------
    # Basic cleaning steps
    # -------------------------
    # Keep only equity series.
    if "Series" in df.columns:
        df = df[df["Series"] == "EQ"]

    # Drop rows with critical missing values.
    df = df.dropna(subset=["Company", "Close", "Volume", "Open", "High", "Low"])

    # Remove zero / negative volume (non-trading or bad records).
    df = df[df["Volume"] > 0]

    # Drop exact duplicates on (Date, Company), then sort chronologically.
    df = df.drop_duplicates(subset=["Date", "Company"])
    df = df.sort_values(["Date", "Company"])

    # Canonical ticker list, sorted for a stable node ordering.
    companies = sorted(df["Company"].unique().tolist())

    # Pivot to Date x Company for close/volume and forward-fill gaps in time.
    close = df.pivot_table(index="Date", columns="Company", values="Close")
    volume = df.pivot_table(index="Date", columns="Company", values="Volume")
    close = close[companies].ffill()
    volume = volume[companies].ffill()

    # -------------------------
    # Derived indicators
    # -------------------------
    # 1-day simple returns (percentage change); the first row has no
    # previous close, so it is filled with 0.
    returns = close.pct_change().replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # 5-day moving average of closing price (trend).
    ma5 = close.rolling(window=5, min_periods=1).mean().ffill()

    # 20-day rolling volatility of returns (risk).
    vol20 = (
        returns.rolling(window=20, min_periods=1)
        .std()
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
        .ffill()
    )

    # -------------------------
    # Normalization per company
    # -------------------------
    # NOTE(review): scalers are fit on the FULL history, which leaks
    # test-period statistics into training features (look-ahead bias) —
    # confirm this is acceptable for the experiment design.
    def _standardize(frame: pd.DataFrame) -> pd.DataFrame:
        # Column-wise z-score with a fresh scaler per indicator.
        return pd.DataFrame(
            StandardScaler().fit_transform(frame.values),
            index=frame.index,
            columns=frame.columns,
        )

    close_scaled = _standardize(close)
    volume_scaled = _standardize(volume)
    ma5_scaled = _standardize(ma5)
    vol20_scaled = _standardize(vol20)

    dates = close.index.to_list()
    num_dates = len(dates)
    num_companies = len(companies)

    # Features per node per day:
    # [normalized close, normalized volume, raw return,
    #  normalized MA5, normalized VOL20]
    num_features = 5
    features = np.zeros((num_dates, num_companies, num_features), dtype=np.float32)
    for j, c in enumerate(companies):
        features[:, j, 0] = close_scaled[c].values
        features[:, j, 1] = volume_scaled[c].values
        features[:, j, 2] = returns[c].values
        features[:, j, 3] = ma5_scaled[c].values
        features[:, j, 4] = vol20_scaled[c].values

    targets = returns.values.astype(np.float32)  # predict daily returns

    return features, targets, dates, companies


# -----------------------------------------------------
# 2. Graph construction (correlation-based)
# -----------------------------------------------------
def build_correlation_graph(returns: np.ndarray, threshold: float = 0.2) -> torch.Tensor:
    """Build an undirected graph of companies based on return correlations.

    Parameters
    ----------
    returns : np.ndarray
        Array of shape [num_dates, num_companies] with daily returns.
    threshold : float
        Minimum absolute correlation to create an edge.

    Returns
    -------
    edge_index : torch.Tensor
        Tensor of shape [2, num_edges] in COO format for PyTorch Geometric.
        Both (i, j) and (j, i) are emitted, so the graph is undirected.
    """
    # Correlation matrix across companies: [num_companies, num_companies].
    # atleast_2d guards the single-company case, where corrcoef collapses
    # to a 0-d result.
    corr = np.atleast_2d(np.corrcoef(returns.T))
    num_nodes = corr.shape[-1]

    # Degenerate case: fewer than two nodes cannot form any edge.
    if num_nodes < 2:
        return torch.empty((2, 0), dtype=torch.long)

    # Edges where |corr| clears the threshold, excluding self-loops.
    # (NaN correlations compare False and are therefore skipped.)
    keep = np.abs(corr) >= threshold
    np.fill_diagonal(keep, False)

    if not keep.any():
        # Fallback: fully-connected graph (without self-loops) if the
        # threshold pruned every edge.
        keep = ~np.eye(num_nodes, dtype=bool)

    # np.argwhere iterates row-major, matching the original (i, j) order.
    edges = np.argwhere(keep)
    edge_index = torch.from_numpy(edges.T.copy()).long().contiguous()
    return edge_index


# -----------------------------------------------------
# 3. Dataset for time-windowed graph snapshots
# -----------------------------------------------------
class TimeSeriesGraphDataset(torch.utils.data.Dataset):
    """Dataset that converts time series into windowed graph snapshots for GNNs.

    Each item is a Data object with:
    - x: node feature sequences [num_nodes, window_size, num_features]
    - edge_index: static company correlation graph
    - y: target returns [num_nodes]

    The caller must ensure start_t >= window_size so every sample has a
    full look-back window.
    """

    def __init__(
        self,
        features: np.ndarray,
        targets: np.ndarray,
        edge_index: torch.Tensor,
        window_size: int,
        start_t: int,
        end_t: int,
    ) -> None:
        super().__init__()
        # Raw panel data: features [T, N, F], targets [T, N].
        self.features = features
        self.targets = targets
        # Static graph shared by every snapshot.
        self.edge_index = edge_index
        self.window_size = window_size
        # Half-open range [start_t, end_t) of prediction days.
        self.start_t = start_t
        self.end_t = end_t

    def __len__(self) -> int:
        return self.end_t - self.start_t

    def __getitem__(self, idx: int) -> Data:
        # Map the dataset index to an absolute day t; use the previous
        # `window_size` days to predict returns at day t.
        t = self.start_t + idx
        window_feats = self.features[t - self.window_size : t]  # [W, N, F]

        # Keep the temporal dimension for LSTM-based encoding.
        # Shape: [num_nodes, window, num_feat].
        x_seq = window_feats.transpose(1, 0, 2)

        y = self.targets[t]  # [num_nodes]

        return Data(
            x=torch.from_numpy(x_seq),  # [num_nodes, window, num_feat]
            edge_index=self.edge_index,
            y=torch.from_numpy(y),
        )


# -----------------------------------------------------
# 4. GNN model definition (GCN for regression)
# -----------------------------------------------------
class GNNTimeSeriesModel(nn.Module):
    """LSTM + GCN hybrid for multi-node time-series regression.

    Methodology alignment
    ---------------------
    - Temporal Feature Extraction: shared LSTM encodes each stock's past W days.
    - GNN Application: GCN layers propagate information over the inter-stock graph.
    - Prediction: per-node regression head outputs next-day return.
    """

    def __init__(
        self,
        window_size: int,
        num_features: int,
        hidden_lstm: int = 64,
        hidden_gnn: int = 64,
        dropout: float = 0.2,
    ) -> None:
        super().__init__()
        self.window_size = window_size
        self.num_features = num_features

        # Temporal encoder: LSTM over W x F for each stock.
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_lstm,
            num_layers=1,
            batch_first=False,  # we will feed [W, N, F]
        )

        # Graph convolution layers operating on LSTM embeddings.
        self.conv1 = GCNConv(hidden_lstm, hidden_gnn)
        self.conv2 = GCNConv(hidden_gnn, hidden_gnn)
        self.lin = nn.Linear(hidden_gnn, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape [num_nodes_total_in_batch, window, num_features].
        edge_index : torch.Tensor
            Graph edges for the batched graph.

        Raises
        ------
        ValueError
            If the window / feature dimensions do not match the model
            configuration.
        """
        # -----------------------------
        # Temporal feature extraction
        # -----------------------------
        num_nodes_total, window, num_feat = x.shape
        # Validate with a real exception; a bare assert is stripped
        # under `python -O`.
        if window != self.window_size or num_feat != self.num_features:
            raise ValueError(
                "Input window/feature dims do not match model configuration."
            )

        # LSTM expects [seq_len, batch, input_size].
        x_seq = x.permute(1, 0, 2)  # [window, num_nodes_total, num_features]
        _, (h_n, _) = self.lstm(x_seq)

        # Last layer hidden state: [num_nodes_total, hidden_lstm].
        h_last = h_n[-1]

        # -----------------------------
        # Graph convolution over stocks
        # -----------------------------
        x_g = self.conv1(h_last, edge_index)
        x_g = torch.relu(x_g)
        x_g = self.dropout(x_g)
        x_g = self.conv2(x_g, edge_index)
        x_g = torch.relu(x_g)
        x_g = self.dropout(x_g)

        out = self.lin(x_g).squeeze(-1)  # [num_nodes_total]
        return out
# -----------------------------------------------------
# 5. Training and evaluation utilities
# -----------------------------------------------------
def train_one_epoch(
    model: nn.Module,
    loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
) -> float:
    """Run one optimization pass over `loader` and return the mean MSE loss.

    The per-batch loss is weighted by `batch.num_graphs` so the returned
    value is an average over graph snapshots, not over batches.
    """
    model.train()
    criterion = nn.MSELoss()
    total_loss = 0.0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch.num_graphs
    # Guard against an empty dataset (avoids ZeroDivisionError).
    return total_loss / max(len(loader.dataset), 1)


def evaluate(
    model: nn.Module,
    loader: DataLoader,
    device: torch.device,
):
    """Evaluate `model` on `loader`.

    Returns
    -------
    tuple
        (avg_loss, mse, mae, directional_accuracy, y_true, y_pred).
        Metrics are NaN (and the arrays may be empty) when there is no
        finite prediction/target pair to score.
    """
    model.eval()
    criterion = nn.MSELoss()
    total_loss = 0.0
    all_y_true = []
    all_y_pred = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            out = model(batch.x, batch.edge_index)
            loss = criterion(out, batch.y)
            total_loss += loss.item() * batch.num_graphs
            all_y_true.append(batch.y.cpu().numpy())
            all_y_pred.append(out.cpu().numpy())

    # Single consistent denominator guard (the original divided by
    # len(loader.dataset) unguarded on the success path).
    avg_loss = total_loss / max(len(loader.dataset), 1)

    # Empty loader: np.concatenate would raise on an empty list.
    if not all_y_true:
        nan = float("nan")
        return avg_loss, nan, nan, nan, np.array([]), np.array([])

    y_true = np.concatenate(all_y_true)
    y_pred = np.concatenate(all_y_pred)

    # -------------------------------------------------
    # Guard against NaN/Inf in predictions or targets
    # -------------------------------------------------
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() == 0:
        # Fallback: avoid crashing; metrics will be NaN but training
        # can continue.
        nan = float("nan")
        return avg_loss, nan, nan, nan, y_true, y_pred

    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]

    mse = mean_squared_error(y_true_clean, y_pred_clean)
    mae = mean_absolute_error(y_true_clean, y_pred_clean)

    # Directional accuracy: how often the sign of the return is
    # predicted correctly.
    directional_accuracy = float((np.sign(y_true_clean) == np.sign(y_pred_clean)).mean())

    return avg_loss, mse, mae, directional_accuracy, y_true_clean, y_pred_clean
# -----------------------------------------------------
# 6. Baseline (before GNN) and real-time helpers
# -----------------------------------------------------
def compute_naive_baseline_metrics(
    targets: np.ndarray,
    train_start: int,
    train_end: int,
    val_start: int,
    val_end: int,
    test_start: int,
    test_end: int,
):
    """Compute a simple baseline: predict zero return (no change) and plot vs actual.

    This represents a "before GNN" naive model where we assume
    next-day return = 0.  Saves a test-set scatter plot to
    baseline_pred_vs_actual.png and prints the train/val/test MSE.
    """
    # Flatten across all nodes: each (day, company) cell is one sample.
    y_train = targets[train_start:train_end].reshape(-1)
    y_val = targets[val_start:val_end].reshape(-1)
    y_test = targets[test_start:test_end].reshape(-1)

    # Baseline predictions are all zeros.
    y_train_pred = np.zeros_like(y_train)
    y_val_pred = np.zeros_like(y_val)
    y_test_pred = np.zeros_like(y_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Plot for the test set.
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_test_pred, alpha=0.3, s=10)
    plt.xlabel("Actual returns")
    plt.ylabel("Predicted returns (baseline: 0)")
    plt.title("Baseline (No GNN) Predicted vs Actual Returns")
    lims = [min(y_test.min(), y_test_pred.min()), max(y_test.max(), y_test_pred.max())]
    plt.plot(lims, lims, "r--", linewidth=1)
    plt.tight_layout()
    plt.savefig("baseline_pred_vs_actual.png", dpi=200)
    plt.close()

    print(f"Baseline Train MSE: {train_mse:.6f}, Val MSE: {val_mse:.6f}, Test MSE: {test_mse:.6f}")
    print("Saved baseline scatter plot to baseline_pred_vs_actual.png")


def realtime_predict_last_window(
    model: nn.Module,
    features: np.ndarray,
    edge_index: torch.Tensor,
    window_size: int,
    device: torch.device,
):
    """Generate a real-time style prediction for the latest available day.

    This uses the most recent `window_size` days in `features` as if it
    were "live" data.

    Raises
    ------
    ValueError
        If fewer than `window_size` days of features are available.
    """
    model.eval()
    num_dates, num_nodes, num_feat = features.shape
    if num_dates < window_size:
        raise ValueError("Not enough data points for real-time window prediction.")

    # Latest look-back window, reshaped for the model: [W, N, F] -> [N, W, F].
    window_feats = features[num_dates - window_size : num_dates]
    x_seq = window_feats.transpose(1, 0, 2)

    data = Data(
        x=torch.from_numpy(x_seq).to(device),
        edge_index=edge_index.to(device),
    )
    with torch.no_grad():
        out = model(data.x, data.edge_index).cpu().numpy()
    return out  # [num_nodes]


# -----------------------------------------------------
# 7. Main experiment pipeline
# -----------------------------------------------------
def main():
    """Run the full experiment: load data, baseline, train the GNN, evaluate."""
    infy_csv = "infy_stock.csv"
    tcs_csv = "tcs_stock.csv"
    nifty_it_csv = "nifty_it_index.csv"

    # Fail fast with a clear message if any input file is missing.
    for p in [infy_csv, tcs_csv, nifty_it_csv]:
        if not os.path.exists(p):
            raise FileNotFoundError(f"Could not find required CSV file: {p}")

    print("Loading and preprocessing data from CSVs...")
    features, targets, dates, companies = load_it_sector_data_from_csvs(
        infy_csv=infy_csv,
        tcs_csv=tcs_csv,
        nifty_it_csv=nifty_it_csv,
    )
    num_dates, num_companies, num_features = features.shape
    print(f"Num dates: {num_dates}, Num companies (nodes): {num_companies}, Num features: {num_features}")

    window_size = 20
    if num_dates <= window_size + 1:
        raise ValueError("Not enough dates to create time windows. Reduce window_size or use more data.")

    # The first predictable day needs a full look-back window.
    first_t = window_size
    last_t = num_dates - 1
    total_samples = last_t - first_t + 1

    # Chronological 70/15/15 split (no shuffling across time).
    train_samples = int(total_samples * 0.7)
    val_samples = int(total_samples * 0.15)
    test_samples = total_samples - train_samples - val_samples

    # Guard against degenerate splits on very small datasets; an empty
    # val/test split would crash evaluation and plotting further down.
    if min(train_samples, val_samples, test_samples) <= 0:
        raise ValueError("Dataset too small to form non-empty train/val/test splits.")

    train_start_t = first_t
    train_end_t = train_start_t + train_samples
    val_start_t = train_end_t
    val_end_t = val_start_t + val_samples
    test_start_t = val_end_t
    test_end_t = last_t + 1

    print(f"Total usable samples: {total_samples}")
    print(f"Train: {train_samples}, Val: {val_samples}, Test: {test_samples}")

    # -----------------------------
    # Baseline (before GNN)
    # -----------------------------
    compute_naive_baseline_metrics(
        targets,
        train_start=train_start_t,
        train_end=train_end_t,
        val_start=val_start_t,
        val_end=val_end_t,
        test_start=test_start_t,
        test_end=test_end_t,
    )

    # Build the graph from training-period correlations only
    # (avoids look-ahead bias in the graph structure).
    train_returns = targets[train_start_t:train_end_t]
    edge_index = build_correlation_graph(train_returns, threshold=0.2)
    print("Edge index shape:", edge_index.shape)

    # Create datasets over the three chronological segments.
    train_dataset = TimeSeriesGraphDataset(
        features=features,
        targets=targets,
        edge_index=edge_index,
        window_size=window_size,
        start_t=train_start_t,
        end_t=train_end_t,
    )
    val_dataset = TimeSeriesGraphDataset(
        features=features,
        targets=targets,
        edge_index=edge_index,
        window_size=window_size,
        start_t=val_start_t,
        end_t=val_end_t,
    )
    test_dataset = TimeSeriesGraphDataset(
        features=features,
        targets=targets,
        edge_index=edge_index,
        window_size=window_size,
        start_t=test_start_t,
        end_t=test_end_t,
    )

    # Only the training loader shuffles; val/test keep time order.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    model = GNNTimeSeriesModel(
        window_size=window_size,
        num_features=num_features,
        hidden_lstm=64,
        hidden_gnn=64,
        dropout=0.2,
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

    num_epochs = 30
    best_val_loss = float("inf")
    best_state_dict = None

    print("Starting training...")
    for epoch in range(1, num_epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        val_loss, val_mse, val_mae, val_dir_acc, _, _ = evaluate(model, val_loader, device)

        # Keep a CPU copy of the best-validation weights for final testing.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state_dict = {k: v.cpu().clone() for k, v in model.state_dict().items()}

        print(
            f"Epoch {epoch:03d} | "
            f"Train Loss: {train_loss:.6f} | "
            f"Val Loss: {val_loss:.6f}, Val MSE: {val_mse:.6f}, Val MAE: {val_mae:.6f}, "
            f"Val DirAcc: {val_dir_acc:.4f}"
        )

    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    print("Evaluating on test set...")
    test_loss, test_mse, test_mae, test_dir_acc, y_true, y_pred = evaluate(model, test_loader, device)
    print(
        f"Test Loss: {test_loss:.6f}, Test MSE: {test_mse:.6f}, "
        f"Test MAE: {test_mae:.6f}, Test DirAcc: {test_dir_acc:.4f}"
    )

    # -------------------------------------------------
    # Simple visualization: predicted vs actual returns
    # -------------------------------------------------
    plt.figure(figsize=(6, 6))
    plt.scatter(y_true, y_pred, alpha=0.3, s=10)
    plt.xlabel("Actual returns")
    plt.ylabel("Predicted returns")
    plt.title("GNN Predicted vs Actual Daily Returns (All IT Stocks)")
    lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
    plt.plot(lims, lims, "r--", linewidth=1)
    plt.tight_layout()
    plt.savefig("gnn_it_sector_pred_vs_actual.png", dpi=200)
    plt.close()
    print("Saved scatter plot to gnn_it_sector_pred_vs_actual.png")

    # -------------------------------------------------
    # Real-time style prediction using latest window
    # -------------------------------------------------
    latest_pred = realtime_predict_last_window(
        model=model,
        features=features,
        edge_index=edge_index,
        window_size=window_size,
        device=device,
    )
    print("Real-time style next-day return prediction per node (order of companies):")
    for comp, val in zip(companies, latest_pred):
        print(f" {comp}: {val:.6f}")


if __name__ == "__main__":
    main()