import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import statsmodels.api as sm
from tqdm import tqdm

# Small positive constant used by the functions below to keep denominators
# away from zero: it is either applied as a floor (torch.clamp(..., min=eps))
# or added to the denominator before dividing.
eps = 1e-6

# --- Objective function components ---
def calculate_sortino(
        returns: torch.Tensor,
        min_acceptable_return: torch.Tensor
    ):
    """Calculate the Sortino ratio of a return series along dim 0.

    The ratio is the mean excess return over the target divided by the
    downside deviation. The downside deviation is the root mean square of the
    shortfalls below the target; observations at or above the target
    contribute zero to it.

    Args:
        returns: Tensor of periodic returns. Statistics are taken over dim 0.
        min_acceptable_return: Target return, subtracted from ``returns``
            (so it must be broadcastable against it). ``None`` is accepted
            and means a target of 0.

    Returns:
        torch.Tensor: The Sortino ratio, clamped to [-100, 100]. The downside
        deviation is floored at ``eps``, so a window without any shortfall
        produces a large but finite value instead of a division by zero.
    """
    if min_acceptable_return is not None:
        excess_returns = returns - min_acceptable_return
    else:
        # If no MAR provided, treat 0 as the target
        excess_returns = returns

    # Keep only the shortfall below the target; everything else counts as 0.
    shortfall = torch.where(
        excess_returns < 0,
        excess_returns,
        torch.zeros_like(excess_returns),
    )

    # Downside deviation is the root MEAN SQUARE of the shortfall (measured
    # from the target), not its standard deviation around its own mean.
    mean_squared_shortfall = torch.mean(shortfall ** 2, dim=0)

    # Floor *before* the square root: sqrt has an infinite derivative at 0,
    # which would turn the gradient into NaN for windows without shortfall.
    downside_deviation = torch.sqrt(
        torch.clamp(mean_squared_shortfall, min=eps ** 2)
    )

    sortino = torch.mean(excess_returns, dim=0) / downside_deviation

    # Clip extreme values to prevent propagation of extreme gradients
    sortino = torch.clamp(sortino, min=-100.0, max=100.0)

    return sortino

def calculate_max_drawdown(
        returns: torch.Tensor
    ):
    """Return the largest peak-to-trough decline of the compounded returns.

    NaN entries are replaced by 0 before the returns are compounded along
    dim 0. The drawdown at each step is measured relative to the running
    peak of the compounded path, and the maximum over all entries is
    returned as a non-negative value clamped to [0, 1]. An empty input
    yields 0.
    """
    is_empty = returns.numel() == 0
    if is_empty:
        # Nothing to compound: report no drawdown on the input's device.
        return torch.tensor(0.0, device=returns.device)

    sanitized = torch.nan_to_num(returns, nan=0.0)

    # Compounded growth path and its running maximum.
    wealth = torch.cumprod(sanitized + 1, dim=0)
    running_peak = torch.cummax(wealth, dim=0).values

    # Only the denominator is floored, so tiny peaks cannot blow up the ratio.
    denominator = torch.clamp(running_peak, min=eps)
    relative_decline = (running_peak - wealth) / denominator

    worst_decline = torch.max(relative_decline)
    return torch.clamp(worst_decline, min=0.0, max=1.0)

def calculate_turnover(
        new_weights: torch.Tensor,
        prev_weights: torch.Tensor
    ):
    """Return half the L1 distance between two weight vectors.

    NaN entries in either vector are replaced by the equal weight
    ``1 / size(0)`` of that vector before the comparison. The result is
    clamped to [0, 1].
    """
    # Equal-weight substitutes for NaN entries, one per input vector.
    fill_new = 1.0 / new_weights.size(0)
    fill_prev = 1.0 / prev_weights.size(0)

    current = torch.nan_to_num(new_weights, nan=fill_new)
    previous = torch.nan_to_num(prev_weights, nan=fill_prev)

    half_l1_distance = (current - previous).abs().sum() / 2.0

    return half_l1_distance.clamp(min=0.0, max=1.0)

def calculate_hhi(weights: torch.Tensor):
    """Return the Herfindahl-Hirschman Index: the sum of squared weights.

    Larger values mean the weights are more concentrated in few entries.
    """
    squared_weights = weights.pow(2)
    return squared_weights.sum()

def concentration_penalty(
        weights: torch.Tensor,
        enp_min: float = 5.0,
        enp_max: float = 20.0
    ):
    """Penalize an effective number of positions (ENP) outside a target band.

    ENP is computed as ``1 / (HHI + eps)``. The penalty is the distance by
    which ENP falls below ``enp_min`` plus the distance by which it exceeds
    ``enp_max``; it is zero while ENP lies inside ``[enp_min, enp_max]``.
    """
    concentration = calculate_hhi(weights)
    effective_positions = 1.0 / (concentration + eps)

    # One-sided hinge terms: at most one of them is non-zero when
    # enp_min <= enp_max.
    too_few = torch.relu(enp_min - effective_positions)
    too_many = torch.relu(effective_positions - enp_max)

    return too_few + too_many

def calculate_objective_func(
        returns: torch.Tensor,
        risk_free_rate: torch.Tensor,
        new_weights: torch.Tensor,
        prev_weights: torch.Tensor,
        alphas = None,  # Defaults to [1.0, 1.0, 0.1, 0.25]: [Sortino, MaxDrawdown, Turnover, Concentration]
        enp_min: float = 5.0,
        enp_max: float = 20.0
    ):
    """Calculate the weighted objective function to be MINIMIZED.

    The objective combines four terms: the Sortino ratio (negated, because it
    is to be maximized), the maximum drawdown, the turnover between
    ``prev_weights`` and ``new_weights``, and the concentration penalty of
    ``new_weights``.

    Args:
        returns: Portfolio returns of the lookback window.
        risk_free_rate: Target return passed to the Sortino calculation.
        new_weights: Candidate portfolio weights.
        prev_weights: Previously held portfolio weights.
        alphas: Four coefficients for [Sortino, MaxDrawdown, Turnover,
            Concentration]. ``None`` selects ``[1.0, 1.0, 0.1, 0.25]``.
        enp_min: Lower bound of the target effective number of positions.
        enp_max: Upper bound of the target effective number of positions.

    Returns:
        torch.Tensor: The objective value. If the combination is NaN, a
        warning is printed and a fresh zero tensor (``requires_grad=True``,
        not connected to the inputs) is returned instead.
    """
    # Build the default per call so no mutable object is shared across calls.
    if alphas is None:
        alphas = [1.0, 1.0, 0.1, 0.25]

    sortino = calculate_sortino(returns, risk_free_rate)
    max_drawdown = calculate_max_drawdown(returns)
    turnover = calculate_turnover(new_weights, prev_weights)
    conc_penalty = concentration_penalty(new_weights, enp_min, enp_max)

    # Bound every component so that no single term can dominate the sum.
    sortino_scaled = torch.clamp(sortino, min=-10.0, max=10.0)
    max_drawdown_scaled = torch.clamp(max_drawdown, min=0.0, max=1.0)
    turnover_scaled = torch.clamp(turnover, min=0.0, max=1.0)
    conc_penalty_scaled = torch.clamp(conc_penalty, min=0.0, max=10.0)

    # Objective: Maximize Sortino, Minimize MaxDrawdown, Minimize Turnover, Control Concentration
    # We negate Sortino because the optimizer minimizes the objective.
    objective = (
        -alphas[0] * sortino_scaled +
        alphas[1] * max_drawdown_scaled +
        alphas[2] * turnover_scaled +
        alphas[3] * conc_penalty_scaled
    )

    # Ensure objective is not NaN
    if torch.isnan(objective).any():
        print("Warning: NaN objective detected, using default value")
        # Keep the fallback on the same device/dtype as the real objective so
        # callers can treat both cases uniformly.
        objective = torch.zeros(
            (),
            dtype=objective.dtype,
            device=objective.device,
            requires_grad=True,
        )

    return objective

# --- Main OGD Optimization Function ---
def run_ogd(
        data_df: pd.DataFrame,
        window_size: int = 20,
        learning_rate: float = 0.01,
        alphas: list[float] = None,  # Defaults to [1.0, 1.0, 0.1, 0.25]
        enp_min: float = 5.0,
        enp_max: float = 20.0,
        use_tqdm: bool = True,
        factor_data: pd.DataFrame = None
    ):
    """Runs the Online Gradient Descent (OGD) portfolio optimization.

    For every row of ``data_df`` the current logits are clamped to [-5, 5] and
    turned into weights with a softmax, the resulting portfolio return is
    recorded, and - once more than ``window_size`` returns have accumulated -
    one Adam step is taken on the objective evaluated over the trailing
    window. Only the most recent return in the window carries gradient; the
    older ones are detached.

    Args:
        data_df (pd.DataFrame): DataFrame with dates as index, ticker returns as columns,
                                and a column named 'rf' for the risk-free rate.
        window_size (int): Lookback window for objective calculation.
        learning_rate (float): Learning rate for the Adam optimizer.
        alphas (list[float]): Weights for [Sortino, MaxDrawdown, Turnover, Concentration]
                              in the objective. ``None`` selects [1.0, 1.0, 0.1, 0.25].
        enp_min (float): Minimum effective number of positions target.
        enp_max (float): Maximum effective number of positions target.
        use_tqdm (bool): Whether to use tqdm progress bar. When True, the
                         periodic per-step log output is suppressed.
        factor_data (pd.DataFrame, optional): DataFrame with factors for CAPM/FF3 analysis.
                                              Not used by this function.

    Returns:
        tuple[pd.DataFrame, pd.Series]:
            - weights_df: DataFrame of daily portfolio weights (dates index, tickers columns).
            - returns_series: Series of daily portfolio returns (dates index).
            An empty DataFrame and an empty Series are returned when
            ``data_df`` is empty or has at most ``window_size`` rows.
    """
    # Build the default per call so no mutable object is shared across calls.
    if alphas is None:
        alphas = [1.0, 1.0, 0.1, 0.25]

    if data_df.empty or len(data_df) <= window_size:
        print("Warning: Dataframe too small for OGD with the given window size.")
        return pd.DataFrame(), pd.Series(dtype=float)

    # --- Add data validation ---
    # Check for NaN values in the input data
    num_nan_values = data_df.isna().sum().sum()
    if num_nan_values > 0:
        print(f"WARNING: Input data contains {num_nan_values} NaN values. Filling with 0.")
        data_df = data_df.fillna(0)

    # --- Print diagnostic info ---
    print(f"Data shape: {data_df.shape}")
    print("Sample data (first few rows):")
    print(data_df.iloc[:3, :5])  # Show first 3 rows, first 5 columns

    # Check for any columns that consist only of zeros
    zero_cols = (data_df == 0).all()
    if zero_cols.any():
        zero_count = zero_cols.sum()
        print(f"WARNING: {zero_count} columns contain all zeros.")

    # Separate stock returns and risk-free rate
    returns = data_df.drop(columns=['rf'])
    rf = data_df['rf']
    tickers = returns.columns.tolist()
    num_assets = len(tickers)
    num_days = len(data_df)

    # Convert to PyTorch tensors, replacing any remaining NaN values with 0
    returns_tensor = torch.tensor(returns.fillna(0).values, dtype=torch.float32)
    rf_tensor = torch.tensor(rf.fillna(0).values, dtype=torch.float32)

    # Check if returns_tensor contains any NaN values (after conversion)
    if torch.isnan(returns_tensor).any():
        print("WARNING: returns_tensor contains NaN values after conversion. Replacing with zeros.")
        returns_tensor = torch.nan_to_num(returns_tensor, nan=0.0)

    # Initialize weights as logits (will be converted to probabilities via softmax)
    # Starting with zeros gives equal weights after softmax
    weights = torch.zeros((num_assets,), requires_grad=True)

    optimizer = torch.optim.Adam([weights], lr=learning_rate)

    # Logging structures
    weights_log = torch.zeros((num_days, num_assets), dtype=torch.float32)
    portfolio_returns_log = torch.zeros((num_days,), dtype=torch.float32)
    rolling_portfolio_returns = []  # Store recent portfolio returns for objective calc

    print(f"Starting OGD optimization for {num_days} days, {num_assets} assets...")

    # Equal weights, used as a fallback when the softmax output is invalid
    initial_weights = torch.full((num_assets,), 1.0/num_assets)

    # Detailed step logs are emitted every `log_interval` days and on the
    # last five days, but only when no progress bar is shown.
    log_interval = 50

    # Use tqdm for progress tracking if requested
    day_iterator = tqdm(range(num_days)) if use_tqdm else range(num_days)

    for i in day_iterator:
        # Check for NaN in weights and reset if needed
        if torch.isnan(weights).any():
            print(f"WARNING: NaN detected in weights at day {i}, resetting to uniform weights")
            with torch.no_grad():
                weights.copy_(torch.zeros((num_assets,)))

        # Clamp the logits for numerical stability before the softmax
        clamped_weights = torch.clamp(weights, min=-5, max=5)
        normalized_weights = torch.nn.functional.softmax(clamped_weights, dim=0)

        # Verify normalized weights are valid probabilities
        if torch.isnan(normalized_weights).any() or torch.sum(normalized_weights) < 0.99:
            print(f"WARNING: Invalid normalized weights at day {i}, using uniform weights")
            normalized_weights = initial_weights.clone()

        # Get daily asset returns and check for NaN values
        daily_asset_returns = returns_tensor[i, :]
        if torch.isnan(daily_asset_returns).any():
            print(f"WARNING: NaN detected in asset returns at day {i}, replacing with zeros")
            daily_asset_returns = torch.nan_to_num(daily_asset_returns, nan=0.0)

        # Calculate portfolio return for the current day
        daily_portfolio_return = torch.dot(normalized_weights, daily_asset_returns)

        # Check for NaN in portfolio return
        if torch.isnan(daily_portfolio_return):
            print(f"WARNING: NaN detected in portfolio return at day {i}, using zero")
            daily_portfolio_return = torch.tensor(0.0)

            # Debug information - print sample weights and returns to diagnose the issue
            if i < 5 or i % 50 == 0:  # Print for first few days and then occasionally
                print(f"  Debug info for day {i}:")
                print(f"  Sample weights: {normalized_weights[:5].tolist()}")
                print(f"  Sample returns: {daily_asset_returns[:5].tolist()}")
                print(f"  Sum of weights: {torch.sum(normalized_weights).item()}")
                nan_count = torch.isnan(daily_asset_returns).sum().item()
                print(f"  NaN count in returns: {nan_count}/{len(daily_asset_returns)}")

        # Log weights and returns (use detach() to prevent tracking history)
        weights_log[i, :] = normalized_weights.detach()
        portfolio_returns_log[i] = daily_portfolio_return.detach()

        # Add current return to rolling list for objective calculation
        # Detach returns when storing to break gradient history
        rolling_portfolio_returns.append(daily_portfolio_return.detach())

        # --- Objective Calculation and Optimization Step ---
        # Wait until we have enough data for the lookback window
        if len(rolling_portfolio_returns) <= window_size:
            continue
        rolling_portfolio_returns.pop(0)  # Remove oldest return

        # Verify we don't have all zeros in our portfolio returns
        if all(r.item() == 0 for r in rolling_portfolio_returns):
            print(f"WARNING: All portfolio returns are zero at day {i}, skipping optimization")
            continue

        # Get corresponding risk-free rates for the window
        start_idx = max(0, i - window_size + 1)
        past_rf = rf_tensor[start_idx : i + 1]

        # Get previous day's weights for turnover calculation
        prev_weights = weights_log[i-1, :] if i > 0 else normalized_weights.detach()

        # Zero out gradients before computation
        optimizer.zero_grad()

        try:
            # Recompute normalized weights for fresh gradient computation
            clamped_weights = torch.clamp(weights, min=-5, max=5)
            current_norm_weights = torch.nn.functional.softmax(clamped_weights, dim=0)

            # Recalculate today's return for gradient computation
            current_return = torch.dot(current_norm_weights, daily_asset_returns)

            # Detached historical returns + current gradient-connected return
            past_portfolio_returns = torch.stack(
                rolling_portfolio_returns[:-1] + [current_return]
            )

            objective = calculate_objective_func(
                past_portfolio_returns,
                past_rf,
                current_norm_weights,
                prev_weights,
                alphas,
                enp_min,
                enp_max
            )

            if torch.isnan(objective):
                if not use_tqdm:
                    print(f"  WARNING: NaN objective at day {i}, skipping gradient update")
            elif objective.item() == 0.0 and i % 50 != 0:
                # An exact zero is treated as the fallback value of the
                # objective; every 50th day is still let through.
                if not use_tqdm:
                    print(f"  WARNING: Zero objective at day {i}, skipping gradient update")
            else:
                objective.backward()

                grad = weights.grad
                if grad is not None and not torch.isfinite(grad).all():
                    # Reject the update BEFORE stepping: a NaN/inf gradient
                    # would otherwise contaminate both the weights and the
                    # optimizer's running moment estimates.
                    print(f"  WARNING: Non-finite gradient detected at day {i}, skipping gradient update")
                    optimizer.zero_grad()
                else:
                    is_log_day = (i + 1) % log_interval == 0 or num_days - (i + 1) < 5
                    verbose_step = is_log_day and not use_tqdm

                    if verbose_step:
                        print(f"\n--- Step {i+1}/{num_days} Log ---")
                        print(f"  Objective: {objective.item():.6f}")

                        # Log average gradient magnitude rather than all gradients
                        if grad is not None:
                            avg_grad = torch.mean(torch.abs(grad)).item()
                            print(f"  Average Gradient Magnitude: {avg_grad:.6f}")

                        weights_before = weights.detach().clone()

                    # The update is applied on every accepted step; logging
                    # must never decide whether the optimizer advances.
                    optimizer.step()

                    if verbose_step:
                        weights_after = weights.detach().clone()
                        weight_change = torch.sum(torch.abs(weights_after - weights_before)).item()
                        print(f"  Weight Change (Sum Abs): {weight_change:.6f}")

                        # Display a few normalized weights as a sample
                        print(f"  Sample Normalized Weights: {[f'{w:.4f}' for w in normalized_weights[:5].tolist()]}")

        except Exception as e:
            # Best effort: skip this day rather than aborting the whole run
            print(f"  Optimization error at day {i}: {e}")

    print("OGD optimization finished.")

    # Final check for validity of results
    if torch.isnan(weights_log).any():
        print("WARNING: Final weights contain NaN values")
        weights_log = torch.nan_to_num(weights_log, nan=1.0/num_assets)

    if torch.isnan(portfolio_returns_log).any():
        print("WARNING: Final portfolio returns contain NaN values")
        portfolio_returns_log = torch.nan_to_num(portfolio_returns_log, nan=0.0)

    # Convert logs back to pandas DataFrames/Series with original index
    weights_df = pd.DataFrame(weights_log.numpy(), index=data_df.index, columns=tickers)
    returns_series = pd.Series(portfolio_returns_log.numpy(), index=data_df.index, name="PortfolioReturn")

    return weights_df, returns_series

# --- Analysis Functions ---
def compute_sharpe(returns_series, rf_series, annualization_factor=252):
    """Return the annualized Sharpe ratio of returns over the risk-free rate.

    The mean excess return is scaled by ``annualization_factor`` and the
    excess-return volatility by its square root; ``eps`` is added to the
    volatility so the division is defined for a constant series.
    """
    excess_returns = returns_series - rf_series

    yearly_mean = annualization_factor * np.mean(excess_returns)
    yearly_volatility = np.sqrt(annualization_factor) * np.std(excess_returns)

    return yearly_mean / (yearly_volatility + eps)

def compute_max_drawdown(returns_series):
    """Return the largest relative drop of the compounded path below its running peak.

    ``eps`` is added to the peak in the denominator to avoid dividing by zero.
    """
    growth_path = np.cumprod(1 + returns_series)
    high_water_mark = np.maximum.accumulate(growth_path)
    relative_drops = (high_water_mark - growth_path) / (high_water_mark + eps)
    return np.max(relative_drops)

def compute_alpha(returns_series, rf_series, factor_data, model="CAPM"):
    """Estimate alpha by regressing excess returns on factor returns.

    Args:
        returns_series: Portfolio returns series.
        rf_series: Risk-free rate series, subtracted from ``returns_series``.
        factor_data: DataFrame with factor returns. The column 'mktrf' is
                     read for CAPM; 'mktrf', 'smb' and 'hml' are read for FF3.
        model: 'CAPM' or 'FF3'.

    Returns:
        tuple: (alpha, regression_result), where alpha is the coefficient at
        index 0 of the OLS fit on the factors with a constant added via
        ``sm.add_constant``.

    Raises:
        ValueError: If ``model`` is neither 'CAPM' nor 'FF3'.
    """
    # Both inputs are reduced to plain arrays, so rows are matched by
    # position, not by index label.
    excess_returns = np.asarray(returns_series - rf_series)

    if model == "CAPM":
        factor_columns = ["mktrf"]
    elif model == "FF3":
        factor_columns = ["mktrf", "smb", "hml"]
    else:
        raise ValueError("Model must be 'CAPM' or 'FF3'")

    design_matrix = sm.add_constant(np.asarray(factor_data[factor_columns]))
    fitted = sm.OLS(excess_returns, design_matrix).fit()

    intercept = fitted.params[0]
    return intercept, fitted

# --- Visualization Functions ---
def plot_optimization_results(
        opt_returns_series, 
        weights_df, 
        benchmark_returns=None, 
        top_n=5, 
        title_suffix=""
    ):
    """Show three figures summarizing an optimization run.

    Figure 1 holds the return histogram and the compounded returns of the
    optimized portfolio, with every benchmark overlaid. Figure 2 holds the
    weight paths of all assets (the ``top_n`` largest final-day weights are
    emphasized and labelled) and a log-scaled histogram of the final-day
    weights. Figure 3 holds the effective number of positions over time.

    Args:
        opt_returns_series: Series of optimized portfolio returns; its index
                            is used as the x-axis of all time plots.
        weights_df: DataFrame of weights over time (rows = dates, columns = assets).
        benchmark_returns: Dict of benchmark return series {name: series}.
        top_n: Number of top assets to highlight in the weights plot.
        title_suffix: Additional text appended to the first figure's title.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    def _use_yearly_ticks(axis):
        # One major tick per year, labelled with the four-digit year.
        axis.xaxis.set_major_locator(mdates.YearLocator())
        axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

    timeline = opt_returns_series.index
    portfolio_returns = opt_returns_series.values
    weight_matrix = weights_df.values

    # --- Figure 1: return distribution and cumulative returns ---
    performance_fig, (distribution_ax, growth_ax) = plt.subplots(2, 1, figsize=(12, 10))

    distribution_ax.hist(portfolio_returns, bins=50, alpha=0.5, label='Optimized', color='red')
    growth_ax.plot(timeline, np.cumprod(portfolio_returns + 1), label='Optimized', color='red')

    # Overlay each benchmark on both panels (nothing to do when none given)
    for bench_name, bench_returns in (benchmark_returns or {}).items():
        distribution_ax.hist(bench_returns, bins=50, alpha=0.5, label=bench_name)
        growth_ax.plot(timeline, np.cumprod(bench_returns + 1), label=bench_name)

    distribution_ax.set_title('Return Distribution')
    distribution_ax.legend()

    growth_ax.set_title('Cumulative Returns')
    growth_ax.legend()
    _use_yearly_ticks(growth_ax)

    performance_fig.suptitle(f"Performance Comparison {title_suffix}", fontsize=16)
    plt.tight_layout()
    plt.show()

    # --- Figure 2: weight evolution and final-day weight distribution ---
    _, (evolution_ax, final_day_ax) = plt.subplots(2, 1, figsize=(12, 10))

    final_day_weights = weight_matrix[-1]
    highlighted = np.argsort(final_day_weights)[-top_n:]
    for column, ticker in enumerate(weights_df.columns):
        if column in highlighted:
            line_style = dict(label=ticker, linewidth=2, alpha=0.8)
        else:
            # Remaining assets are drawn thin, faint and without legend entry
            line_style = dict(label=None, linewidth=0.3, alpha=0.3)
        evolution_ax.plot(timeline, weight_matrix[:, column], **line_style)

    _use_yearly_ticks(evolution_ax)
    evolution_ax.set_title("Weights Over Time")
    evolution_ax.legend()

    final_day_ax.hist(final_day_weights, bins=100, log=True, color='blue', alpha=0.7)
    final_day_ax.set_title("Final Day Weight Distribution")

    plt.tight_layout()
    plt.show()

    # --- Figure 3: effective number of positions (inverse HHI) per row ---
    effective_positions = 1.0 / np.sum(weight_matrix ** 2, axis=1)
    _, enp_ax = plt.subplots(figsize=(12, 5))
    enp_ax.plot(timeline, effective_positions)
    enp_ax.set_title("Effective Number of Positions Over Time")
    enp_ax.set_ylabel("ENP")
    _use_yearly_ticks(enp_ax)
    plt.tight_layout()
    plt.show()

    return None