import os
from typing import Tuple, List

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import datetime as dt
import time


# -----------------------------------------------------
# 1. Data loading and preprocessing
# -----------------------------------------------------
def load_it_sector_data_from_csvs(
    infy_csv: str,
    tcs_csv: str,
    nifty_it_csv: str,
) -> Tuple[np.ndarray, np.ndarray, List[pd.Timestamp], List[str]]:
    """Load IT sector data from separate CSV files and build cleaned
    feature + target tensors.

    Methodology alignment
    ---------------------
    - Data Collection: uses OHLCV-style fields from the NSE IT sector files.
    - Preprocessing / Cleaning:
        * Parse dates and sort.
        * Filter to equity series (EQ).
        * Remove duplicates and rows with missing / invalid key values.
        * Filter out non-trading days (zero / negative volume).
        * Forward-fill remaining gaps.
    - Derived Indicators:
        * Daily returns.
        * 5-day moving average of close.
        * 20-day rolling volatility of returns.

    Returns
    -------
    features : np.ndarray
        Shape [num_dates, num_companies, num_features]. Per company per day:
        normalized close, normalized volume, raw return, normalized 5-day MA,
        normalized 20-day volatility.
    targets : np.ndarray
        Shape [num_dates, num_companies]. Daily returns per company
        (prediction target).
    dates : list of pd.Timestamp
        Trading dates.
    companies : list of str
        Company tickers (node names in the graph).
    """
    # ---------------------------------
    # Load individual CSVs
    # ---------------------------------
    infy_df = pd.read_csv(infy_csv)
    tcs_df = pd.read_csv(tcs_csv)
    index_df = pd.read_csv(nifty_it_csv)

    # Tag each frame with its ticker so the concatenated panel is keyed
    # by company.
    infy_df["Company"] = "INFY"
    tcs_df["Company"] = "TCS"
    index_df["Company"] = "NIFTY_IT"

    # Parse dates up front for all three frames (mutates the frames in place).
    for frame in (infy_df, tcs_df, index_df):
        frame["Date"] = pd.to_datetime(frame["Date"])

    # The index file has no equity "Series" column; mimic one so the
    # EQ-filter below keeps its rows.
    if "Series" not in index_df.columns:
        index_df["Series"] = "EQ"

    # Unify all three frames to one common OHLCV schema.  (The stock CSVs
    # may carry extra columns such as Symbol, Prev Close, Last, VWAP —
    # those are dropped here.)
    common_cols = [
        "Date",
        "Company",
        "Series",
        "Open",
        "High",
        "Low",
        "Close",
        "Volume",
    ]
    infy_df = infy_df[common_cols]
    tcs_df = tcs_df[common_cols]
    index_df = index_df[common_cols]

    # Concatenate all into one panel-like table.
    df = pd.concat([infy_df, tcs_df, index_df], ignore_index=True)

    # -------------------------
    # Basic cleaning steps
    # -------------------------
    # Keep only equity series.
    if "Series" in df.columns:
        df = df[df["Series"] == "EQ"]

    # Drop rows with critical missing values.
    df = df.dropna(subset=["Company", "Close", "Volume", "Open", "High", "Low"])

    # Remove zero / negative volume (non-trading or bad records).
    df = df[df["Volume"] > 0]

    # Drop exact duplicates on (Date, Company), then sort chronologically.
    df = df.drop_duplicates(subset=["Date", "Company"])
    df = df.sort_values(["Date", "Company"])

    # Canonical ticker list, sorted for a stable node ordering.
    companies = sorted(df["Company"].unique().tolist())

    # Pivot to Date x Company for close/volume and forward-fill gaps in time.
    close = df.pivot_table(index="Date", columns="Company", values="Close")
    volume = df.pivot_table(index="Date", columns="Company", values="Volume")
    close = close[companies].ffill()
    volume = volume[companies].ffill()

    # -------------------------
    # Derived indicators
    # -------------------------
    # 1-day simple returns (percentage change); the first row has no
    # previous close, so it is filled with 0.
    returns = close.pct_change().replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # 5-day moving average of closing price (trend).
    ma5 = close.rolling(window=5, min_periods=1).mean().ffill()

    # 20-day rolling volatility of returns (risk).
    vol20 = (
        returns.rolling(window=20, min_periods=1)
        .std()
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
        .ffill()
    )

    # -------------------------
    # Normalization per company
    # -------------------------
    # NOTE(review): scalers are fit on the FULL history, which leaks
    # test-period statistics into training features (look-ahead bias) —
    # confirm this is acceptable for the experiment design.
    def _standardize(frame: pd.DataFrame) -> pd.DataFrame:
        # Column-wise z-score with a fresh scaler per indicator.
        return pd.DataFrame(
            StandardScaler().fit_transform(frame.values),
            index=frame.index,
            columns=frame.columns,
        )

    close_scaled = _standardize(close)
    volume_scaled = _standardize(volume)
    ma5_scaled = _standardize(ma5)
    vol20_scaled = _standardize(vol20)

    dates = close.index.to_list()
    num_dates = len(dates)
    num_companies = len(companies)

    # Features per node per day:
    # [normalized close, normalized volume, raw return,
    #  normalized MA5, normalized VOL20]
    num_features = 5
    features = np.zeros((num_dates, num_companies, num_features), dtype=np.float32)
    for j, c in enumerate(companies):
        features[:, j, 0] = close_scaled[c].values
        features[:, j, 1] = volume_scaled[c].values
        features[:, j, 2] = returns[c].values
        features[:, j, 3] = ma5_scaled[c].values
        features[:, j, 4] = vol20_scaled[c].values

    targets = returns.values.astype(np.float32)  # predict daily returns

    return features, targets, dates, companies


# -----------------------------------------------------
# 2. Graph construction (correlation-based)
# -----------------------------------------------------
def build_correlation_graph(returns: np.ndarray, threshold: float = 0.2) -> torch.Tensor:
    """Build an undirected graph of companies based on return correlations.

    Parameters
    ----------
    returns : np.ndarray
        Array of shape [num_dates, num_companies] with daily returns.
    threshold : float
        Minimum absolute correlation to create an edge.

    Returns
    -------
    edge_index : torch.Tensor
        Tensor of shape [2, num_edges] in COO format for PyTorch Geometric.
        Both (i, j) and (j, i) are emitted, so the graph is undirected.
    """
    # Correlation matrix across companies: [num_companies, num_companies].
    # atleast_2d guards the single-company case, where corrcoef collapses
    # to a 0-d result.
    corr = np.atleast_2d(np.corrcoef(returns.T))
    num_nodes = corr.shape[-1]

    # Degenerate case: fewer than two nodes cannot form any edge.
    if num_nodes < 2:
        return torch.empty((2, 0), dtype=torch.long)

    # Edges where |corr| clears the threshold, excluding self-loops.
    # (NaN correlations compare False and are therefore skipped.)
    keep = np.abs(corr) >= threshold
    np.fill_diagonal(keep, False)

    if not keep.any():
        # Fallback: fully-connected graph (without self-loops) if the
        # threshold pruned every edge.
        keep = ~np.eye(num_nodes, dtype=bool)

    # np.argwhere iterates row-major, matching the original (i, j) order.
    edges = np.argwhere(keep)
    edge_index = torch.from_numpy(edges.T.copy()).long().contiguous()
    return edge_index


# -----------------------------------------------------
# 3. Dataset for time-windowed graph snapshots
# -----------------------------------------------------
class TimeSeriesGraphDataset(torch.utils.data.Dataset):
    """Dataset that converts time series into windowed graph snapshots for GNNs.

    Each item is a Data object with:
    - x: node feature sequences [num_nodes, window_size, num_features]
    - edge_index: static company correlation graph
    - y: target returns [num_nodes]

    The caller must ensure start_t >= window_size so every sample has a
    full look-back window.
    """

    def __init__(
        self,
        features: np.ndarray,
        targets: np.ndarray,
        edge_index: torch.Tensor,
        window_size: int,
        start_t: int,
        end_t: int,
    ) -> None:
        super().__init__()
        # Raw panel data: features [T, N, F], targets [T, N].
        self.features = features
        self.targets = targets
        # Static graph shared by every snapshot.
        self.edge_index = edge_index
        self.window_size = window_size
        # Half-open range [start_t, end_t) of prediction days.
        self.start_t = start_t
        self.end_t = end_t

    def __len__(self) -> int:
        return self.end_t - self.start_t

    def __getitem__(self, idx: int) -> Data:
        # Map the dataset index to an absolute day t; use the previous
        # `window_size` days to predict returns at day t.
        t = self.start_t + idx
        window_feats = self.features[t - self.window_size : t]  # [W, N, F]

        # Keep the temporal dimension for LSTM-based encoding.
        # Shape: [num_nodes, window, num_feat].
        x_seq = window_feats.transpose(1, 0, 2)

        y = self.targets[t]  # [num_nodes]

        return Data(
            x=torch.from_numpy(x_seq),  # [num_nodes, window, num_feat]
            edge_index=self.edge_index,
            y=torch.from_numpy(y),
        )


# -----------------------------------------------------
# 4. GNN model definition (GCN for regression)
# -----------------------------------------------------
class GNNTimeSeriesModel(nn.Module):
    """LSTM + GCN hybrid for multi-node time-series regression.

    Methodology alignment
    ---------------------
    - Temporal Feature Extraction: shared LSTM encodes each stock's past W days.
    - GNN Application: GCN layers propagate information over the inter-stock graph.
    - Prediction: per-node regression head outputs next-day return.
    """

    def __init__(
        self,
        window_size: int,
        num_features: int,
        hidden_lstm: int = 64,
        hidden_gnn: int = 64,
        dropout: float = 0.2,
    ) -> None:
        super().__init__()
        self.window_size = window_size
        self.num_features = num_features

        # Temporal encoder: LSTM over W x F for each stock.
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_lstm,
            num_layers=1,
            batch_first=False,  # we will feed [W, N, F]
        )

        # Graph convolution layers operating on LSTM embeddings.
        self.conv1 = GCNConv(hidden_lstm, hidden_gnn)
        self.conv2 = GCNConv(hidden_gnn, hidden_gnn)
        self.lin = nn.Linear(hidden_gnn, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape [num_nodes_total_in_batch, window, num_features].
        edge_index : torch.Tensor
            Graph edges for the batched graph.

        Raises
        ------
        ValueError
            If the window / feature dimensions do not match the model
            configuration.
        """
        # -----------------------------
        # Temporal feature extraction
        # -----------------------------
        num_nodes_total, window, num_feat = x.shape
        # Validate with a real exception; a bare assert is stripped
        # under `python -O`.
        if window != self.window_size or num_feat != self.num_features:
            raise ValueError(
                "Input window/feature dims do not match model configuration."
            )

        # LSTM expects [seq_len, batch, input_size].
        x_seq = x.permute(1, 0, 2)  # [window, num_nodes_total, num_features]
        _, (h_n, _) = self.lstm(x_seq)

        # Last layer hidden state: [num_nodes_total, hidden_lstm].
        h_last = h_n[-1]

        # -----------------------------
        # Graph convolution over stocks
        # -----------------------------
        x_g = self.conv1(h_last, edge_index)
        x_g = torch.relu(x_g)
        x_g = self.dropout(x_g)
        x_g = self.conv2(x_g, edge_index)
        x_g = torch.relu(x_g)
        x_g = self.dropout(x_g)

        out = self.lin(x_g).squeeze(-1)  # [num_nodes_total]
        return out
# -----------------------------------------------------
# 5. Training and evaluation utilities
# -----------------------------------------------------
def train_one_epoch(
    model: nn.Module,
    loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
) -> float:
    """Run one optimization pass over `loader` and return the mean MSE loss.

    The per-batch loss is weighted by `batch.num_graphs` so the returned
    value is an average over graph snapshots, not over batches.
    """
    model.train()
    criterion = nn.MSELoss()
    total_loss = 0.0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch.num_graphs
    # Guard against an empty dataset (avoids ZeroDivisionError).
    return total_loss / max(len(loader.dataset), 1)


def evaluate(
    model: nn.Module,
    loader: DataLoader,
    device: torch.device,
):
    """Evaluate `model` on `loader`.

    Returns
    -------
    tuple
        (avg_loss, mse, mae, directional_accuracy, y_true, y_pred).
        Metrics are NaN (and the arrays may be empty) when there is no
        finite prediction/target pair to score.
    """
    model.eval()
    criterion = nn.MSELoss()
    total_loss = 0.0
    all_y_true = []
    all_y_pred = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            out = model(batch.x, batch.edge_index)
            loss = criterion(out, batch.y)
            total_loss += loss.item() * batch.num_graphs
            all_y_true.append(batch.y.cpu().numpy())
            all_y_pred.append(out.cpu().numpy())

    # Single consistent denominator guard (the original divided by
    # len(loader.dataset) unguarded on the success path).
    avg_loss = total_loss / max(len(loader.dataset), 1)

    # Empty loader: np.concatenate would raise on an empty list.
    if not all_y_true:
        nan = float("nan")
        return avg_loss, nan, nan, nan, np.array([]), np.array([])

    y_true = np.concatenate(all_y_true)
    y_pred = np.concatenate(all_y_pred)

    # -------------------------------------------------
    # Guard against NaN/Inf in predictions or targets
    # -------------------------------------------------
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() == 0:
        # Fallback: avoid crashing; metrics will be NaN but training
        # can continue.
        nan = float("nan")
        return avg_loss, nan, nan, nan, y_true, y_pred

    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]

    mse = mean_squared_error(y_true_clean, y_pred_clean)
    mae = mean_absolute_error(y_true_clean, y_pred_clean)

    # Directional accuracy: how often the sign of the return is
    # predicted correctly.
    directional_accuracy = float((np.sign(y_true_clean) == np.sign(y_pred_clean)).mean())

    return avg_loss, mse, mae, directional_accuracy, y_true_clean, y_pred_clean
# -----------------------------------------------------
# 6. Baseline (before GNN) and real-time helpers
# -----------------------------------------------------
def compute_naive_baseline_metrics(
    targets: np.ndarray,
    train_start: int,
    train_end: int,
    val_start: int,
    val_end: int,
    test_start: int,
    test_end: int,
):
    """Compute a simple baseline: predict zero return (no change) and plot vs actual.

    This represents a "before GNN" naive model where we assume
    next-day return = 0.  Saves a test-set scatter plot to
    baseline_pred_vs_actual.png and prints the train/val/test MSE.
    """
    # Flatten across all nodes: each (day, company) cell is one sample.
    y_train = targets[train_start:train_end].reshape(-1)
    y_val = targets[val_start:val_end].reshape(-1)
    y_test = targets[test_start:test_end].reshape(-1)

    # Baseline predictions are all zeros.
    y_train_pred = np.zeros_like(y_train)
    y_val_pred = np.zeros_like(y_val)
    y_test_pred = np.zeros_like(y_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Plot for the test set.
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_test_pred, alpha=0.3, s=10)
    plt.xlabel("Actual returns")
    plt.ylabel("Predicted returns (baseline: 0)")
    plt.title("Baseline (No GNN) Predicted vs Actual Returns")
    lims = [min(y_test.min(), y_test_pred.min()), max(y_test.max(), y_test_pred.max())]
    plt.plot(lims, lims, "r--", linewidth=1)
    plt.tight_layout()
    plt.savefig("baseline_pred_vs_actual.png", dpi=200)
    plt.close()

    print(f"Baseline Train MSE: {train_mse:.6f}, Val MSE: {val_mse:.6f}, Test MSE: {test_mse:.6f}")
    print("Saved baseline scatter plot to baseline_pred_vs_actual.png")


def realtime_predict_last_window(
    model: nn.Module,
    features: np.ndarray,
    edge_index: torch.Tensor,
    window_size: int,
    device: torch.device,
):
    """Generate a real-time style prediction for the latest available day.

    This uses the most recent `window_size` days in `features` as if it
    were "live" data.

    Raises
    ------
    ValueError
        If fewer than `window_size` days of features are available.
    """
    model.eval()
    num_dates, num_nodes, num_feat = features.shape
    if num_dates < window_size:
        raise ValueError("Not enough data points for real-time window prediction.")

    # Latest look-back window, reshaped for the model: [W, N, F] -> [N, W, F].
    window_feats = features[num_dates - window_size : num_dates]
    x_seq = window_feats.transpose(1, 0, 2)

    data = Data(
        x=torch.from_numpy(x_seq).to(device),
        edge_index=edge_index.to(device),
    )
    with torch.no_grad():
        out = model(data.x, data.edge_index).cpu().numpy()
    return out  # [num_nodes]


# -----------------------------------------------------
# 7. Main experiment pipeline
# -----------------------------------------------------
def main():
    """Run the full experiment: load data, baseline, train the GNN, evaluate."""
    infy_csv = "infy_stock.csv"
    tcs_csv = "tcs_stock.csv"
    nifty_it_csv = "nifty_it_index.csv"

    # Fail fast with a clear message if any input file is missing.
    for p in [infy_csv, tcs_csv, nifty_it_csv]:
        if not os.path.exists(p):
            raise FileNotFoundError(f"Could not find required CSV file: {p}")

    print("Loading and preprocessing data from CSVs...")
    features, targets, dates, companies = load_it_sector_data_from_csvs(
        infy_csv=infy_csv,
        tcs_csv=tcs_csv,
        nifty_it_csv=nifty_it_csv,
    )
    num_dates, num_companies, num_features = features.shape
    print(f"Num dates: {num_dates}, Num companies (nodes): {num_companies}, Num features: {num_features}")

    window_size = 20
    if num_dates <= window_size + 1:
        raise ValueError("Not enough dates to create time windows. Reduce window_size or use more data.")

    # The first predictable day needs a full look-back window.
    first_t = window_size
    last_t = num_dates - 1
    total_samples = last_t - first_t + 1

    # Chronological 70/15/15 split (no shuffling across time).
    train_samples = int(total_samples * 0.7)
    val_samples = int(total_samples * 0.15)
    test_samples = total_samples - train_samples - val_samples

    # Guard against degenerate splits on very small datasets; an empty
    # val/test split would crash evaluation and plotting further down.
    if min(train_samples, val_samples, test_samples) <= 0:
        raise ValueError("Dataset too small to form non-empty train/val/test splits.")

    train_start_t = first_t
    train_end_t = train_start_t + train_samples
    val_start_t = train_end_t
    val_end_t = val_start_t + val_samples
    test_start_t = val_end_t
    test_end_t = last_t + 1

    print(f"Total usable samples: {total_samples}")
    print(f"Train: {train_samples}, Val: {val_samples}, Test: {test_samples}")

    # -----------------------------
    # Baseline (before GNN)
    # -----------------------------
    compute_naive_baseline_metrics(
        targets,
        train_start=train_start_t,
        train_end=train_end_t,
        val_start=val_start_t,
        val_end=val_end_t,
        test_start=test_start_t,
        test_end=test_end_t,
    )

    # Build the graph from training-period correlations only
    # (avoids look-ahead bias in the graph structure).
    train_returns = targets[train_start_t:train_end_t]
    edge_index = build_correlation_graph(train_returns, threshold=0.2)
    print("Edge index shape:", edge_index.shape)

    # Create datasets over the three chronological segments.
    train_dataset = TimeSeriesGraphDataset(
        features=features,
        targets=targets,
        edge_index=edge_index,
        window_size=window_size,
        start_t=train_start_t,
        end_t=train_end_t,
    )
    val_dataset = TimeSeriesGraphDataset(
        features=features,
        targets=targets,
        edge_index=edge_index,
        window_size=window_size,
        start_t=val_start_t,
        end_t=val_end_t,
    )
    test_dataset = TimeSeriesGraphDataset(
        features=features,
        targets=targets,
        edge_index=edge_index,
        window_size=window_size,
        start_t=test_start_t,
        end_t=test_end_t,
    )

    # Only the training loader shuffles; val/test keep time order.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    model = GNNTimeSeriesModel(
        window_size=window_size,
        num_features=num_features,
        hidden_lstm=64,
        hidden_gnn=64,
        dropout=0.2,
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

    num_epochs = 30
    best_val_loss = float("inf")
    best_state_dict = None

    print("Starting training...")
    for epoch in range(1, num_epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        val_loss, val_mse, val_mae, val_dir_acc, _, _ = evaluate(model, val_loader, device)

        # Keep a CPU copy of the best-validation weights for final testing.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state_dict = {k: v.cpu().clone() for k, v in model.state_dict().items()}

        print(
            f"Epoch {epoch:03d} | "
            f"Train Loss: {train_loss:.6f} | "
            f"Val Loss: {val_loss:.6f}, Val MSE: {val_mse:.6f}, Val MAE: {val_mae:.6f}, "
            f"Val DirAcc: {val_dir_acc:.4f}"
        )

    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    print("Evaluating on test set...")
    test_loss, test_mse, test_mae, test_dir_acc, y_true, y_pred = evaluate(model, test_loader, device)
    print(
        f"Test Loss: {test_loss:.6f}, Test MSE: {test_mse:.6f}, "
        f"Test MAE: {test_mae:.6f}, Test DirAcc: {test_dir_acc:.4f}"
    )

    # -------------------------------------------------
    # Simple visualization: predicted vs actual returns
    # -------------------------------------------------
    plt.figure(figsize=(6, 6))
    plt.scatter(y_true, y_pred, alpha=0.3, s=10)
    plt.xlabel("Actual returns")
    plt.ylabel("Predicted returns")
    plt.title("GNN Predicted vs Actual Daily Returns (All IT Stocks)")
    lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
    plt.plot(lims, lims, "r--", linewidth=1)
    plt.tight_layout()
    plt.savefig("gnn_it_sector_pred_vs_actual.png", dpi=200)
    plt.close()
    print("Saved scatter plot to gnn_it_sector_pred_vs_actual.png")

    # -------------------------------------------------
    # Real-time style prediction using latest window
    # -------------------------------------------------
    latest_pred = realtime_predict_last_window(
        model=model,
        features=features,
        edge_index=edge_index,
        window_size=window_size,
        device=device,
    )
    print("Real-time style next-day return prediction per node (order of companies):")
    for comp, val in zip(companies, latest_pred):
        print(f" {comp}: {val:.6f}")


if __name__ == "__main__":
    main()