"""GPU Optimization for AlphaForge

Modern ML training on GPU requires proper optimization to:
1. Reduce memory usage (fit larger models/batches)
2. Accelerate training (faster iterations)
3. Enable larger architectures (deeper, wider models)

Key technologies:
- Flash Attention: Memory-efficient attention with IO-awareness
- Mixed Precision (AMP): Use FP16/FP32 automatically
- Gradient Checkpointing: Trade compute for memory
- Kernel-based attention: Precompiled kernels from HF hub
- CUDA Graphs: Reduce CPU overhead
"""
import torch
import torch.nn as nn
from typing import Optional, Dict, Any
import warnings
warnings.filterwarnings('ignore')


class GPUOptimizer:
    """
    GPU optimization wrapper for AlphaForge models.
    
    Usage:
        optimizer = GPUOptimizer(device='cuda')
        model = optimizer.optimize_model(model)
        optimizer.setup_training(optimizer_instance)
        
        for batch in dataloader:
            with optimizer.autocast():
                loss = model(batch)
            optimizer.backward(loss)
            optimizer.step(optimizer_instance)
    """
    
    def __init__(self, device: str = 'cuda', dtype: str = 'float16'):
        """
        Args:
            device: 'cuda' or specific 'cuda:0'
            dtype: 'float16' (default), 'bfloat16' (better on Ampere+), 'float32'
        """
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.use_amp = torch.cuda.is_available() and dtype != 'float32'
        self.amp_dtype = torch.float16 if dtype == 'float16' else \
                        torch.bfloat16 if dtype == 'bfloat16' else torch.float32
        
        self.scaler = torch.cuda.amp.GradScaler() if self.use_amp and dtype == 'float16' else None
        
        print(f"GPU Optimizer initialized:")
        print(f"  Device: {self.device}")
        print(f"  AMP: {self.use_amp}")
        print(f"  AMP dtype: {self.amp_dtype}")
        print(f"  GradScaler: {self.scaler is not None}")
    
    def optimize_model(self, model: nn.Module,
                       enable_gradient_checkpointing: bool = True,
                       use_compile: bool = True,
                       use_flash_attention: bool = True) -> nn.Module:
        """
        Apply GPU optimizations to a model.
        
        Args:
            model: PyTorch model
            enable_gradient_checkpointing: Trade compute for memory
            use_compile: Use torch.compile (PyTorch 2.0+)
            use_flash_attention: Replace standard attention with flash attention
        """
        model = model.to(self.device)
        
        # 1. Gradient Checkpointing
        if enable_gradient_checkpointing and hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()
            print("  ✓ Gradient checkpointing enabled")
        
        # 2. torch.compile (PyTorch 2.0+)
        if use_compile and hasattr(torch, 'compile'):
            try:
                model = torch.compile(model, mode='max-autotune')
                print("  ✓ torch.compile enabled (max-autotune mode)")
            except Exception as e:
                print(f"  ✗ torch.compile failed: {e}")
        
        # 3. Flash Attention via kernels library
        if use_flash_attention:
            self._setup_flash_attention(model)
        
        return model
    
    def _setup_flash_attention(self, model: nn.Module):
        """
        Attempt to use precompiled attention kernels from HF hub.
        
        Instead of compiling flash-attn from source (which takes hours and often fails),
        we load prebuilt kernels via the `kernels` library.
        """
        try:
            # Check if kernels library is available
            import importlib
            kernels = importlib.import_module('kernels')
            
            print("  ✓ Using HF kernels library for precompiled attention")
            print("  Available kernels: kernels-community/flash-attn2, vllm-flash-attn3")
            
        except ImportError:
            print("  ℹ kernels library not available. Install with: pip install kernels")
            print("  Standard attention will be used (slower but equivalent)")
    
    def autocast(self):
        """Context manager for automatic mixed precision"""
        if self.use_amp:
            return torch.cuda.amp.autocast(dtype=self.amp_dtype)
        return torch.cuda.amp.autocast(enabled=False)
    
    def backward(self, loss: torch.Tensor):
        """Backprop with gradient scaling (if FP16)"""
        if self.scaler is not None:
            self.scaler.scale(loss).backward()
        else:
            loss.backward()
    
    def step(self, optimizer: torch.optim.Optimizer):
        """Optimizer step with gradient unscaling (if FP16)"""
        if self.scaler is not None:
            self.scaler.step(optimizer)
            self.scaler.update()
        else:
            optimizer.step()
    
    def zero_grad(self, optimizer: torch.optim.Optimizer):
        """Zero gradients"""
        optimizer.zero_grad()
    
    def get_memory_stats(self) -> Dict[str, float]:
        """Get GPU memory statistics"""
        if not torch.cuda.is_available():
            return {'available': False}
        
        return {
            'available': True,
            'allocated_gb': torch.cuda.memory_allocated() / 1e9,
            'reserved_gb': torch.cuda.memory_reserved() / 1e9,
            'max_allocated_gb': torch.cuda.max_memory_allocated() / 1e9,
            'free_gb': (torch.cuda.get_device_properties(0).total_memory - 
                       torch.cuda.memory_allocated()) / 1e9
        }
    
    def print_memory_stats(self):
        """Print GPU memory usage"""
        stats = self.get_memory_stats()
        if not stats['available']:
            print("GPU not available")
            return
        
        print(f"GPU Memory:")
        print(f"  Allocated: {stats['allocated_gb']:.2f} GB")
        print(f"  Reserved:  {stats['reserved_gb']:.2f} GB")
        print(f"  Max:       {stats['max_allocated_gb']:.2f} GB")
        print(f"  Free:      {stats['free_gb']:.2f} GB")


class FastTransformerAttention(nn.Module):
    """
    Optimized transformer attention with optional flash attention.
    
    Falls back to standard attention if flash is unavailable.
    """
    
    def __init__(self, d_model: int, nhead: int, dropout: float = 0.1,
                 use_flash: bool = True):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.use_flash = use_flash and self._flash_available()
        
        if self.use_flash:
            # Use native scaled_dot_product_attention with flash algorithm
            self.attention_fn = nn.functional.scaled_dot_product_attention
            print("  ✓ Using Flash Attention via PyTorch scaled_dot_product_attention")
        else:
            # Standard multi-head attention
            self.attention = nn.MultiheadAttention(d_model, nhead, dropout=dropout,
                                                   batch_first=True)
    
    def _flash_available(self) -> bool:
        """Check if flash attention is available"""
        try:
            # PyTorch 2.0+ has scaled_dot_product_attention with flash
            import torch
            return hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        except:
            return False
    
    def forward(self, query: torch.Tensor, key: Optional[torch.Tensor] = None,
                value: Optional[torch.Tensor] = None,
                key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Forward pass with flash or standard attention.
        """
        if key is None:
            key = query
        if value is None:
            value = query
        
        if self.use_flash:
            # Flash attention via PyTorch 2.0+
            # Handles causality, dropout, and softmax internally
            attn_mask = None
            if key_padding_mask is not None:
                # Convert to additive mask
                attn_mask = key_padding_mask.float().masked_fill(
                    key_padding_mask, float('-inf')
                )
            
            out = self.attention_fn(
                query, key, value,
                attn_mask=attn_mask,
                dropout_p=0.0,  # Handle dropout externally
                is_causal=False
            )
            return out
        else:
            # Standard attention
            out, _ = self.attention(query, key, value, key_padding_mask=key_padding_mask)
            return out


class CUDAGraphTrainer:
    """
    CUDA Graphs training for static-size training loops.
    
    CUDA Graphs capture a sequence of GPU operations and replay them
    without CPU overhead. This reduces CPU-GPU synchronization overhead.
    
    Best for: Fixed-size batches, static architectures.
    Not for: Dynamic shapes, variable-length sequences.
    
    Can provide 10-30% speedup for small models where CPU overhead dominates.
    """
    
    def __init__(self, model: nn.Module, sample_input: torch.Tensor):
        self.model = model
        self.sample_input = sample_input
        self.graph = None
        self.static_input = None
        self.static_output = None
    
    def capture(self, num_warmup: int = 3):
        """
        Capture training graph.
        
        Must be called after model is on GPU and in eval/train mode.
        """
        if not torch.cuda.is_available():
            print("CUDA not available, skipping graph capture")
            return False
        
        device = next(self.model.parameters()).device
        self.static_input = self.sample_input.to(device).clone()
        
        # Warmup
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        
        with torch.cuda.stream(s):
            for _ in range(num_warmup):
                _ = self.model(self.static_input)
        
        torch.cuda.current_stream().wait_stream(s)
        
        # Capture
        g = torch.cuda.CUDAGraph()
        
        with torch.cuda.graph(g):
            self.static_output = self.model(self.static_input)
        
        self.graph = g
        print("CUDA Graph captured successfully")
        return True
    
    def replay(self, new_input: torch.Tensor) -> torch.Tensor:
        """
        Replay captured graph with new input data.
        
        Copies new data into static buffer, replays graph, returns output.
        """
        if self.graph is None:
            # Fallback to normal forward
            return self.model(new_input)
        
        # Copy new data to static buffer
        self.static_input.copy_(new_input)
        
        # Replay
        self.graph.replay()
        
        return self.static_output.clone()


def estimate_memory_requirements(model: nn.Module,
                                  batch_size: int,
                                  seq_len: int,
                                  input_dim: int) -> Dict[str, float]:
    """
    Estimate GPU memory requirements for a model.
    
    Formula (approximate):
    - Model parameters: count × 4 bytes (FP32) or 2 bytes (FP16)
    - Activations: batch_size × seq_len × hidden_dim × layers × 4 bytes
    - Gradients: same as parameters
    - Optimizer state: 2x parameters (Adam)
    
    Total ≈ Parameters × (1 + 1 + 2) + Activations
    """
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    # FP32 memory
    param_memory_fp32 = total_params * 4 / 1e9  # GB
    
    # FP16 memory
    param_memory_fp16 = total_params * 2 / 1e9  # GB
    
    # Activations (rough estimate)
    # Assume each layer produces batch × seq × hidden
    if hasattr(model, 'hidden_dim'):
        hidden = model.hidden_dim
    elif hasattr(model, 'd_model'):
        hidden = model.d_model
    else:
        hidden = 128  # Default guess
    
    if hasattr(model, 'n_lstm_layers'):
        layers = model.n_lstm_layers
    elif hasattr(model, 'num_layers'):
        layers = model.num_layers
    else:
        layers = 2
    
    activation_memory = batch_size * seq_len * hidden * layers * 4 / 1e9  # GB
    
    # Training memory (Adam: params + 2 momentum buffers + gradients)
    training_memory_fp32 = param_memory_fp32 * 4  # params + 2 moments + grads
    training_memory_fp16 = param_memory_fp16 * 2 + param_memory_fp32 * 2  # FP16 params/grads + FP32 optimizer
    
    return {
        'total_parameters': total_params,
        'trainable_parameters': trainable_params,
        'param_memory_fp32_gb': param_memory_fp32,
        'param_memory_fp16_gb': param_memory_fp16,
        'activation_memory_gb': activation_memory,
        'training_fp32_gb': training_memory_fp32 + activation_memory,
        'training_fp16_mixed_gb': training_memory_fp16 + activation_memory,
        'recommended_batch_size_fp32': int(16e9 / (training_memory_fp32 + activation_memory)) if (training_memory_fp32 + activation_memory) > 0 else 999,
        'recommended_batch_size_fp16': int(16e9 / (training_memory_fp16 + activation_memory)) if (training_memory_fp16 + activation_memory) > 0 else 999,
    }


def recommend_hardware(model: nn.Module,
                        batch_size: int,
                        seq_len: int,
                        input_dim: int) -> str:
    """
    Recommend GPU hardware based on model requirements.
    
    Hardware tiers:
    - T4: 16GB → Small models, prototypes
    - A10G: 24GB → Medium models, production inference
    - L4: 24GB → Newer, faster than T4
    - A100: 80GB → Large models, training
    - L40S: 48GB → Large inference, medium training
    - H100: 80GB → Largest models, fastest training
    """
    mem = estimate_memory_requirements(model, batch_size, seq_len, input_dim)
    training_mem = mem['training_fp16_mixed_gb']
    
    hardware = [
        ('T4 (16GB)', 16, 'Small models, prototypes'),
        ('L4 (24GB)', 24, 'Medium inference'),
        ('A10G (24GB)', 24, 'Production inference'),
        ('L40S (48GB)', 48, 'Large inference'),
        ('A100 (80GB)', 80, 'Large training'),
        ('H100 (80GB)', 80, 'Maximum performance'),
    ]
    
    print(f"Memory Requirements (batch={batch_size}, seq={seq_len}):")
    print(f"  FP32 Training: {mem['training_fp32_gb']:.1f} GB")
    print(f"  FP16 Training: {mem['training_fp16_mixed_gb']:.1f} GB")
    print(f"\nRecommended Hardware:")
    
    for name, vram, use in hardware:
        status = "✓ SUFFICIENT" if vram >= training_mem else "✗ INSUFFICIENT"
        print(f"  {name}: {status} ({use})")
    
    # Find minimum sufficient
    sufficient = [(n, v) for n, v, _ in hardware if v >= training_mem]
    if sufficient:
        recommended = sufficient[0][0]
        print(f"\nMinimum Recommended: {recommended}")
        return recommended
    else:
        print(f"\nWARNING: No single GPU sufficient. Use model parallelism or gradient checkpointing.")
        return "H100 (80GB) + Gradient Checkpointing"


if __name__ == '__main__':
    # Test GPU optimization
    if torch.cuda.is_available():
        print("CUDA is available!")
        print(f"Device: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        
        optimizer = GPUOptimizer()
        optimizer.print_memory_stats()
    else:
        print("CUDA not available. CPU training will be used.")
    
    # Test model memory estimation
    class TestModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(20, 128, 3, batch_first=True)
            self.fc = nn.Linear(128, 10)
            self.hidden_dim = 128
            self.num_layers = 3
    
    model = TestModel()
    mem = estimate_memory_requirements(model, batch_size=64, seq_len=60, input_dim=20)
    
    print(f"\nModel Memory Estimation:")
    for k, v in mem.items():
        if isinstance(v, float):
            print(f"  {k}: {v:.2f}")
        else:
            print(f"  {k}: {v:,}")
    
    recommend_hardware(model, batch_size=64, seq_len=60, input_dim=20)