"""
Attention Models
Various attention mechanisms for football prediction.

Part of the complete blueprint implementation.
"""

import numpy as np
from typing import Dict, Optional, Tuple
import logging

logger = logging.getLogger(__name__)

try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False


if TORCH_AVAILABLE:
    class MultiHeadSelfAttention(nn.Module):
        """Self-attention block with a residual connection and layer norm.

        Wraps ``nn.MultiheadAttention`` (batch-first) so that a sequence
        attends to itself; the attended values are added back onto the input
        and the sum is layer-normalised.
        """

        def __init__(
            self,
            embed_dim: int = 64,
            num_heads: int = 4,
            dropout: float = 0.1
        ):
            """
            Args:
                embed_dim: Size of the last (feature) axis of the input.
                num_heads: Number of attention heads.
                dropout: Dropout rate handed to ``nn.MultiheadAttention``.
            """
            super().__init__()
            self.attention = nn.MultiheadAttention(
                embed_dim=embed_dim,
                num_heads=num_heads,
                dropout=dropout,
                batch_first=True,
            )
            self.norm = nn.LayerNorm(embed_dim)

        def forward(
            self,
            x: torch.Tensor,
            mask: torch.Tensor = None
        ) -> Tuple[torch.Tensor, torch.Tensor]:
            """
            Attend ``x`` to itself.

            Args:
                x: Input sequence; batch-first, with ``embed_dim`` features.
                mask: Optional key padding mask, forwarded unchanged as
                    ``key_padding_mask``. ``None`` disables masking.

            Returns:
                A pair ``(normalised_output, attention_weights)``, where the
                weights are exactly what ``nn.MultiheadAttention`` returns.
            """
            attended, weights = self.attention(x, x, x, key_padding_mask=mask)
            residual = x + attended
            return self.norm(residual), weights
    
    
    class CrossAttention(nn.Module):
        """Cross-attention block: one sequence queries another.

        Intended for letting one team's features attend to the other team's.
        The attended values are added to the query (residual connection) and
        the sum is layer-normalised.
        """

        def __init__(self, embed_dim: int = 64, num_heads: int = 4):
            """
            Args:
                embed_dim: Size of the feature axis of both inputs.
                num_heads: Number of attention heads.
            """
            super().__init__()
            self.cross_attn = nn.MultiheadAttention(
                embed_dim=embed_dim,
                num_heads=num_heads,
                batch_first=True,
            )
            self.norm = nn.LayerNorm(embed_dim)

        def forward(
            self,
            query: torch.Tensor,
            key_value: torch.Tensor
        ) -> torch.Tensor:
            """
            Let ``query`` attend over ``key_value``.

            Args:
                query: Sequence that asks the questions (batch-first).
                key_value: Sequence used as both keys and values.

            Returns:
                Layer-normalised ``query + attention(query, key_value)``;
                the attention weights are discarded.
            """
            attended = self.cross_attn(query, key_value, key_value)[0]
            residual = query + attended
            return self.norm(residual)
    
    
    class AttentionModel(nn.Module):
        """Attention-based match outcome model.

        Pipeline: a shared linear projection embeds each side's per-match
        features, a stack of self-attention layers (shared by both sides)
        refines each sequence, a single cross-attention layer lets each side
        attend to the other, and the mean-pooled results are concatenated and
        mapped to ``output_dim`` raw scores.
        """

        def __init__(
            self,
            input_dim: int = 32,
            embed_dim: int = 64,
            num_heads: int = 4,
            num_layers: int = 2,
            output_dim: int = 3,
            dropout: float = 0.2
        ):
            """
            Args:
                input_dim: Number of features per match in the input.
                embed_dim: Width of the internal embedding.
                num_heads: Attention heads in every attention layer.
                num_layers: Number of stacked self-attention layers.
                output_dim: Number of output scores.
                dropout: Dropout rate for the self-attention layers and the
                    output head.
            """
            super().__init__()

            # One projection serves both the home and the away sequence.
            self.input_proj = nn.Linear(input_dim, embed_dim)

            # Self-attention stack, applied to each side in turn.
            encoder_layers = []
            for _ in range(num_layers):
                encoder_layers.append(
                    MultiHeadSelfAttention(embed_dim, num_heads, dropout)
                )
            self.self_attention = nn.ModuleList(encoder_layers)

            # Single cross-attention layer used in both directions.
            self.cross_attention = CrossAttention(embed_dim, num_heads)

            # Head over the concatenated (home, away) summaries.
            head_layers = [
                nn.Linear(2 * embed_dim, embed_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(embed_dim, output_dim),
            ]
            self.output = nn.Sequential(*head_layers)

        def forward(
            self,
            home_features: torch.Tensor,
            away_features: torch.Tensor
        ) -> Tuple[torch.Tensor, Dict]:
            """
            Run both sides through the attention pipeline.

            Args:
                home_features: (batch, seq_len, input_dim)
                away_features: (batch, seq_len, input_dim)

            Returns:
                ``(output, attention_weights)``. ``output`` is the result of
                the output head; ``attention_weights`` maps
                ``'self_layer_<i>'`` to a dict holding the detached ``'home'``
                and ``'away'`` self-attention weights of layer ``i``.
                Cross-attention weights are not included.
            """
            home_seq = self.input_proj(home_features)
            away_seq = self.input_proj(away_features)

            attention_weights = {}

            # Home is always processed before away within each layer.
            for depth, block in enumerate(self.self_attention):
                home_seq, home_w = block(home_seq)
                away_seq, away_w = block(away_seq)
                attention_weights[f'self_layer_{depth}'] = {
                    'home': home_w.detach(),
                    'away': away_w.detach()
                }

            # Each side queries the other through the same layer.
            home_ctx = self.cross_attention(home_seq, away_seq)
            away_ctx = self.cross_attention(away_seq, home_seq)

            # Average over the sequence axis, then join the two summaries.
            summary = torch.cat(
                [home_ctx.mean(dim=1), away_ctx.mean(dim=1)],
                dim=-1,
            )

            return self.output(summary), attention_weights


class AttentionPredictor:
    """Wrapper for attention-based prediction.

    Encodes each team's recent match history as a fixed-length feature
    sequence and feeds both sequences to an ``AttentionModel``. When PyTorch
    is not importable no model is built: ``predict`` then returns a fixed
    fallback distribution and ``get_attention_explanation`` returns ``{}``.
    """

    def __init__(
        self,
        input_dim: int = 32,
        embed_dim: int = 64,
        seq_len: int = 10
    ):
        """
        Args:
            input_dim: Number of feature columns per encoded match. The
                encoder writes columns 0-4, so this must be at least 5 for
                ``encode_team_history`` to succeed on a non-empty history.
            embed_dim: Embedding width passed to ``AttentionModel``.
            seq_len: Number of most recent matches kept per team.
        """
        self.input_dim = input_dim
        self.embed_dim = embed_dim
        self.seq_len = seq_len
        self.model = None
        # ``torch`` is only bound when the import succeeded, hence the guard.
        self.device = 'cuda' if TORCH_AVAILABLE and torch.cuda.is_available() else 'cpu'

        if TORCH_AVAILABLE:
            self.model = AttentionModel(input_dim, embed_dim).to(self.device)

    def encode_team_history(self, matches: list) -> np.ndarray:
        """Encode team match history.

        Only the last ``seq_len`` matches are used. They are right-aligned in
        the output so the most recent match is always the final row; unused
        leading rows and unused feature columns stay zero.

        Args:
            matches: Match dicts, oldest first. Keys read: ``goals_for``,
                ``goals_against``, ``xg``, ``possession`` (divided by 100,
                default 50) and ``shots`` (divided by 20).

        Returns:
            Array of shape ``(seq_len, input_dim)``.
        """
        sequence = np.zeros((self.seq_len, self.input_dim))

        # ``matches[-0:]`` is the WHOLE list, so an empty window has to be
        # special-cased; the slice is also taken once instead of per row.
        recent = matches[-self.seq_len:] if self.seq_len > 0 else []
        first_row = self.seq_len - len(recent)

        for row, match in enumerate(recent, start=first_row):
            sequence[row, 0] = match.get('goals_for', 0)
            sequence[row, 1] = match.get('goals_against', 0)
            sequence[row, 2] = match.get('xg', 0)
            sequence[row, 3] = match.get('possession', 50) / 100
            sequence[row, 4] = match.get('shots', 0) / 20

        return sequence

    def _forward(
        self,
        home_history: list,
        away_history: list
    ) -> Tuple["torch.Tensor", Dict]:
        """Encode both histories and run the model once without gradients.

        Callers must first check that PyTorch and ``self.model`` are
        available. Returns the model's ``(output, attention_weights)`` pair
        for a batch of size one.
        """
        home_enc = self.encode_team_history(home_history)
        away_enc = self.encode_team_history(away_history)

        self.model.eval()
        with torch.no_grad():
            home_t = torch.tensor(home_enc, dtype=torch.float32).unsqueeze(0).to(self.device)
            away_t = torch.tensor(away_enc, dtype=torch.float32).unsqueeze(0).to(self.device)
            return self.model(home_t, away_t)

    def predict(
        self,
        home_history: list,
        away_history: list
    ) -> Dict:
        """Predict match outcome probabilities.

        Args:
            home_history: Home team's past matches (see
                ``encode_team_history``).
            away_history: Away team's past matches.

        Returns:
            Dict with ``'home'``, ``'draw'`` and ``'away'`` probabilities
            (softmax over the model output) plus ``'model'`` and
            ``'attention_available'``. Without PyTorch or a model, a fixed
            fallback dict with only the three probability keys is returned.
        """
        if not TORCH_AVAILABLE or self.model is None:
            return {'home': 0.4, 'draw': 0.25, 'away': 0.35}

        logits, _ = self._forward(home_history, away_history)
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

        return {
            'home': float(probs[0]),
            'draw': float(probs[1]),
            'away': float(probs[2]),
            'model': 'attention',
            'attention_available': True
        }

    def get_attention_explanation(
        self,
        home_history: list,
        away_history: list
    ) -> Dict:
        """Get attention weights for interpretation.

        Averages the first self-attention layer's weights over the first two
        axes and reports the position with the highest mean weight.

        Returns:
            ``{}`` when PyTorch or the model is unavailable; otherwise a dict
            with ``'most_important_home_match'`` and
            ``'most_important_away_match'``. The values index the encoded
            window (``seq_len`` rows, most recent match last), not the
            original history list, so with a short history they may refer to
            a zero-padded row.
        """
        if not TORCH_AVAILABLE or self.model is None:
            return {}

        _, attn_weights = self._forward(home_history, away_history)

        # Defaults apply when the model has no self-attention layers.
        explanation = {
            'most_important_home_match': 0,
            'most_important_away_match': 0,
        }

        if 'self_layer_0' in attn_weights:
            home_attn = attn_weights['self_layer_0']['home'].cpu().numpy()
            away_attn = attn_weights['self_layer_0']['away'].cpu().numpy()

            explanation['most_important_home_match'] = int(np.argmax(home_attn.mean(axis=(0, 1))))
            explanation['most_important_away_match'] = int(np.argmax(away_attn.mean(axis=(0, 1))))

        return explanation


# Module-level cache for the shared predictor; filled on first request.
_model: Optional[AttentionPredictor] = None


def get_model() -> AttentionPredictor:
    """Return the shared ``AttentionPredictor``, creating it on first use.

    The instance is built with default constructor arguments and kept in the
    module-level ``_model`` variable, so later calls return the same object.
    """
    global _model
    if _model is not None:
        return _model
    _model = AttentionPredictor()
    return _model