"""
Central configuration for the TFT-ASRO deep learning pipeline.

All hyperparameters, feature dimensions, and training settings live here
so every module draws from a single source of truth.

Model paths honour the MODEL_DIR environment variable so they work both
locally (e.g. ``MODEL_DIR=data/models``) and inside the HF Space container
(``/data/models``, which is also the fallback when MODEL_DIR is unset).
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional


def _model_dir() -> str:
    """Return the base model directory taken from the environment.

    Reads the ``MODEL_DIR`` environment variable and falls back to
    ``/data/models`` when the variable is not set. A variable that is set
    but empty is returned unchanged (as an empty string).

    NOTE(review): intended to mirror the lookup done in ``app.settings``,
    which is not visible from this module — confirm the two stay in sync.
    """
    fallback = "/data/models"
    return os.getenv("MODEL_DIR", fallback)


@dataclass(frozen=True)
class EmbeddingConfig:
    """Immutable settings for the text-embedding stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    ``pca_model_path`` defaults to an empty string; ``get_tft_config`` in
    this module fills it in with a file under the resolved model directory.

    NOTE(review): the per-field notes below are inferred from the field
    names and defaults only — confirm against the code that consumes them.
    """

    # Identifier of the embedding model (looks like a model-hub id).
    model_name: str = "ProsusAI/finbert"
    # Presumably the width of the raw embedding vector.
    full_dim: int = 768
    # Presumably the width kept after PCA reduction.
    pca_dim: int = 32
    # Presumably the token cap applied to each input text.
    max_token_length: int = 512
    # Number of texts embedded per batch (assumed).
    batch_size: int = 64
    # Location of the persisted PCA model; empty until a factory sets it.
    pca_model_path: str = ""


@dataclass(frozen=True)
class SentimentFeatureConfig:
    """Immutable settings for sentiment-derived features.

    Instances are frozen, so fields cannot be reassigned after construction.

    NOTE(review): units (e.g. whether windows are in days) and the exact
    use of each value are not visible in this module — confirm against the
    feature-building code.
    """

    # Window lengths for the sentiment momentum features.
    momentum_windows: tuple[int, ...] = (5, 10, 30)
    # Look-back length and threshold for the "surprise" feature; the
    # threshold is presumably expressed in standard deviations — TODO confirm.
    surprise_lookback: int = 30
    surprise_threshold: float = 2.0
    # Closed set of event labels; order is part of the default value.
    event_types: tuple[str, ...] = (
        # supply side
        "supply_disruption",
        "supply_expansion",
        # demand side
        "demand_increase",
        "demand_decrease",
        # inventories
        "inventory_draw",
        "inventory_build",
        # policy
        "policy_support",
        "policy_drag",
        # macro / USD
        "macro_usd_up",
        "macro_usd_down",
        # costs
        "cost_push",
    )


@dataclass(frozen=True)
class LMEConfig:
    """Immutable settings for LME / futures market data features.

    Instances are frozen, so fields cannot be reassigned after construction.

    NOTE(review): the per-field notes below are inferred from names and
    defaults only — confirm against the data-ingestion code.
    """

    # Name of the environment variable expected to hold the API key
    # (the key itself is never stored here).
    nasdaq_api_key_env: str = "NASDAQ_DATA_LINK_API_KEY"
    # Dataset code requested from the data provider.
    quandl_dataset: str = "LME/PR_CU"
    # Window lengths for stock-change features.
    stock_change_windows: tuple[int, ...] = (1, 5, 10, 20)
    # Window length for the depletion feature.
    depletion_window: int = 20
    # Futures tickers and the forward horizons (in months) of interest.
    futures_symbols: tuple[str, ...] = ("HG=F",)
    futures_months_ahead: tuple[int, ...] = (3, 6, 12)
    # Presumably the longest gap, in days, bridged by forward-fill.
    max_ffill_days: int = 5


@dataclass(frozen=True)
class TFTModelConfig:
    """Immutable architecture and optimiser hyperparameters for the TFT model.

    Instances are frozen, so fields cannot be reassigned after construction.
    Several defaults were changed from earlier values to suit a small
    training set; the rationale recorded at the time is kept next to each
    affected field.
    """

    max_encoder_length: int = 60
    max_prediction_length: int = 5

    # Halved from 64. The VSN encoder weighed in at 3.2M parameters against
    # only 313 training samples (344 features × hidden_size ×
    # hidden_continuous_size); shrinking this halves the dominant layer
    # while keeping expressiveness.
    hidden_size: int = 32

    # Reduced from 4: a small, single-series dataset needs fewer heads.
    attention_head_size: int = 2

    # Raised from 0.1: 313 samples against ~900K parameters still demands
    # heavy regularisation.
    dropout: float = 0.3

    # Was 32; lowered together with hidden_size.
    hidden_continuous_size: int = 16

    quantiles: tuple[float, ...] = (0.02, 0.10, 0.25, 0.50, 0.75, 0.90, 0.98)

    # Lowered from 1e-3: smaller batches give noisier gradients, so a
    # conservative rate reduces the risk of overshooting the narrow loss
    # landscape.
    learning_rate: float = 3e-4
    reduce_on_plateau_patience: int = 4

    # Relaxed from 0.5: tanh-based Sharpe gradients are inherently bounded,
    # so a looser clip lets the model escape flat regions more aggressively.
    gradient_clip_val: float = 1.0


@dataclass(frozen=True)
class ASROConfig:
    """Immutable weights and settings for the ASRO composite loss.

    Total loss = lambda_quantile * calibration + (1 - lambda_quantile) * sharpe
    with calibration = q_loss + lambda_vol * vol_loss.

    Because the Sharpe weight is the complement of ``lambda_quantile``, a
    value outside [0, 1] would give one of the two components a negative
    weight; likewise a negative ``lambda_vol`` would reward volatility
    miscalibration. Both are rejected at construction time.

    Raises:
        ValueError: if ``lambda_quantile`` is not within [0, 1] or
            ``lambda_vol`` is negative (NaN is rejected for both).
    """

    # lambda_quantile is the EXPLICIT weight of the quantile calibration bundle:
    #   calibration = q_loss + lambda_vol * vol_loss
    # w_sharpe = 1 - lambda_quantile (the complementary directional weight)
    #
    # This normalised (sum-to-1) formulation makes both components interpretable
    # and prevents either from silently dominating across loss-magnitude regimes.
    #
    # 0.4 / 0.6 split: 40% calibration (keeps TFT probabilistic),
    #                   60% Sharpe     (drives directional / amplitude learning)
    lambda_quantile: float = 0.4   # w_quantile; was 0.3 (unnormalised old formula)
    # lambda_vol is a sub-weight within the calibration bundle only.
    # It controls how much the Q90-Q10 spread tracks 2× actual σ.
    # Two independent Optuna runs (20 trials each) both converged on 0.35 —
    # updating default to match confirmed optimal value.
    lambda_vol: float = 0.35
    risk_free_rate: float = 0.0
    sharpe_window: int = 20

    def __post_init__(self) -> None:
        """Fail fast on weights that would flip the sign of a loss term."""
        # Written as "not (lo <= x <= hi)" so that NaN is rejected as well.
        if not (0.0 <= self.lambda_quantile <= 1.0):
            raise ValueError(
                "lambda_quantile must be within [0, 1] so that the Sharpe "
                f"weight (1 - lambda_quantile) stays non-negative; got {self.lambda_quantile!r}"
            )
        if not (self.lambda_vol >= 0.0):
            raise ValueError(
                f"lambda_vol must be non-negative; got {self.lambda_vol!r}"
            )


@dataclass(frozen=True)
class TrainingConfig:
    """Immutable settings for the training loop and its artefact locations.

    Instances are frozen, so fields cannot be reassigned after construction.
    ``checkpoint_dir`` and ``best_model_path`` default to empty strings;
    ``get_tft_config`` in this module fills them in with locations under the
    resolved model directory.
    """

    max_epochs: int = 100

    # Raised from 10: with 19 batches per epoch (previously 4) every epoch
    # carries more information, so the model is given more time to converge.
    early_stopping_patience: int = 15

    # Lowered from 64: 313 samples / 64 = 4 batches per epoch, which made
    # gradients noisy. 313 / 16 ≈ 19 batches per epoch gives stable,
    # consistent gradient estimates.
    batch_size: int = 16

    val_ratio: float = 0.15
    test_ratio: float = 0.10
    lookback_days: int = 730
    seed: int = 42
    num_workers: int = 0
    optuna_n_trials: int = 50

    # Artefact locations; empty until a factory resolves them.
    checkpoint_dir: str = ""
    best_model_path: str = ""

    hf_model_repo: str = "ifieryarrows/copper-mind-tft"


@dataclass(frozen=True)
class FeatureStoreConfig:
    """Immutable settings for assembling the feature store.

    Instances are frozen, so fields cannot be reassigned after construction.

    NOTE(review): the per-field notes below are inferred from names and
    defaults only — confirm against the feature-store builder.
    """

    # Ticker of the series being forecast.
    target_symbol: str = "HG=F"
    # Presumably the longest gap bridged by forward-fill.
    max_ffill: int = 3
    # Switches for optional feature groups.
    calendar_features: bool = True
    macro_event_features: bool = True


@dataclass
class TFTASROConfig:
    """Top-level container bundling every sub-config of the pipeline.

    Each attribute defaults to a freshly constructed sub-config. Unlike the
    sub-configs, this container is not frozen, so its attributes can be
    rebound to different sub-config instances.
    """

    # Text embedding and sentiment features
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    sentiment: SentimentFeatureConfig = field(default_factory=SentimentFeatureConfig)
    # Market data
    lme: LMEConfig = field(default_factory=LMEConfig)
    # Model, loss and training
    model: TFTModelConfig = field(default_factory=TFTModelConfig)
    asro: ASROConfig = field(default_factory=ASROConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)
    # Feature assembly
    feature_store: FeatureStoreConfig = field(default_factory=FeatureStoreConfig)

    @property
    def model_root(self) -> Path:
        """Return the directory that contains the checkpoint directory.

        Computed as the parent of ``training.checkpoint_dir``. When that
        field is still the empty-string default, the result is ``Path(".")``.
        """
        checkpoints = Path(self.training.checkpoint_dir)
        return checkpoints.parent


def get_tft_config() -> TFTASROConfig:
    """
    Build the default TFT-ASRO configuration with artefact paths resolved.

    All paths live under ``<model dir>/tft``, where the model directory comes
    from ``_model_dir()`` (the MODEL_DIR environment variable, falling back
    to ``/data/models``). Every other setting keeps its dataclass default.
    """
    tft_root = Path(_model_dir(), "tft")

    embedding_cfg = EmbeddingConfig(
        pca_model_path=str(tft_root / "pca_finbert.joblib"),
    )
    training_cfg = TrainingConfig(
        checkpoint_dir=str(tft_root / "checkpoints"),
        best_model_path=str(tft_root / "best_tft_asro.ckpt"),
    )
    return TFTASROConfig(embedding=embedding_cfg, training=training_cfg)