"""
Wavelet Denoising for Financial Time Series
=============================================
DWT denoising with Daubechies db4 wavelet, soft-thresholding (Donoho-Johnstone).
Reference: arxiv:2408.12408 Section III-C

Usage:
    from wavelet_denoise import wavelet_denoise, preprocess_ohlcv
    
    # Single signal
    denoised_close = wavelet_denoise(df['Close'].values)
    
    # Full OHLCV DataFrame
    df_denoised, scaler = preprocess_ohlcv(df)
"""

import numpy as np
import pywt


def wavelet_denoise(signal: np.ndarray, wavelet: str = 'db4', level: int = None) -> np.ndarray:
    """
    Denoise a 1D signal with DWT soft-thresholding (Donoho-Johnstone universal threshold).

    From: arxiv:2408.12408 Section III-C (best configuration for S&P 500).

    Algorithm:
        1. Right-pad the signal (edge mode) to the next power-of-2 length
        2. Decompose with DWT (db4 wavelet by default)
        3. Estimate noise σ as median(|d|) / 0.6745 over the finest-level detail
           coefficients that overlap the original (unpadded) samples
        4. Compute universal threshold (Donoho-Johnstone): σ * √(2·log(N)),
           where N is the padded length
        5. Soft-threshold all detail coefficients (keep approximation untouched)
        6. Reconstruct and trim to original length

    Args:
        signal: 1D array-like of time series values
        wavelet: Wavelet name understood by PyWavelets (default: 'db4' — Daubechies 4)
        level: Decomposition level (default: maximum possible for the padded length)

    Returns:
        Denoised signal of same length as input. An empty input yields an
        empty float array.
    """
    data = np.asarray(signal)
    n = len(data)

    # An empty series has nothing to denoise; the power-of-2 computation below
    # is undefined for n == 0, so return early.
    if n == 0:
        return data.astype(float)

    # Next power of two >= n, computed with exact integer arithmetic.
    padded_len = 1 << (n - 1).bit_length()
    pad_len = padded_len - n
    padded = np.pad(data, (0, pad_len), mode='edge')

    # Decompose
    if level is None:
        level = pywt.dwt_max_level(padded_len, wavelet)
    coeffs = pywt.wavedec(padded, wavelet, level=level)

    details = coeffs[1:]
    if not details:
        # Level 0 (signal shorter than the filter): there are no detail
        # coefficients to threshold, so reconstruction is the input itself.
        return pywt.waverec(coeffs, wavelet)[:n]

    # Noise estimate from the finest detail level. Coefficients that lie
    # entirely inside the constant edge padding are exactly zero; including
    # them drags the median (and hence the threshold) towards zero, so only
    # the coefficients whose filter support overlaps the original samples
    # are used.
    finest = details[-1]
    if pad_len:
        wavelet_obj = wavelet if isinstance(wavelet, pywt.Wavelet) else pywt.Wavelet(wavelet)
        n_valid = (n + wavelet_obj.dec_len - 1) // 2
        finest = finest[:n_valid]
    sigma = np.median(np.abs(finest)) / 0.6745

    # Universal threshold (Donoho-Johnstone)
    threshold = sigma * np.sqrt(2 * np.log(padded_len))

    # Soft-threshold every detail level; the approximation passes through as-is.
    coeffs_thresh = [coeffs[0]]
    coeffs_thresh.extend(pywt.threshold(detail, threshold, mode='soft') for detail in details)

    # Reconstruct and trim padding
    denoised = pywt.waverec(coeffs_thresh, wavelet)
    return denoised[:n]


def preprocess_ohlcv(df, columns=None):
    """
    Wavelet-denoise the selected columns of a DataFrame, then MinMax-scale them.

    The input DataFrame is not modified; a copy is returned. Columns not listed
    in `columns` are carried over unchanged. The scaler is fitted on the
    denoised values of all rows passed in.

    Args:
        df: pandas DataFrame containing the columns to process
        columns: list of column names to process (default: ['Open','High','Low','Close','Volume'])

    Returns:
        denoised_df: copy of `df` whose selected columns are denoised and scaled
        scaler: fitted MinMaxScaler (for inverse transform during evaluation)
    """
    from sklearn.preprocessing import MinMaxScaler

    target_cols = ['Open', 'High', 'Low', 'Close', 'Volume'] if columns is None else columns

    result = df.copy()

    # Pass 1: denoise each selected column on its own, reading from the original frame.
    for name in target_cols:
        result[name] = wavelet_denoise(df[name].values)

    # Pass 2: fit one scaler across the selected columns and write the scaled values back.
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(result[target_cols])
    result[target_cols] = scaled_values

    return result, scaler


if __name__ == "__main__":
    # Smoke test: denoise a noisy two-tone sinusoid and report the MSE
    # against the clean reference before and after denoising.
    np.random.seed(42)
    t = np.linspace(0, 4 * np.pi, 1000)
    clean = np.sin(t) + 0.5 * np.sin(3 * t)
    noise = 0.3 * np.random.randn(len(t))
    noisy = clean + noise

    denoised = wavelet_denoise(noisy)

    residual_before = noisy - clean
    residual_after = denoised - clean
    mse_noisy = np.mean(residual_before ** 2)
    mse_denoised = np.mean(residual_after ** 2)
    reduction_pct = (1 - mse_denoised / mse_noisy) * 100

    print(f"MSE (noisy vs clean):    {mse_noisy:.6f}")
    print(f"MSE (denoised vs clean): {mse_denoised:.6f}")
    print(f"Noise reduction:         {reduction_pct:.1f}%")