# pipelines/single_gpu_eval.py
"""Helpers for running evaluation of a (possibly multi-GPU-trained) model on a
single device.

The main entry point is :func:`_prepare_model_for_single_gpu_eval`, which
unwraps DDP/DataParallel, consolidates the model onto one device, verifies
device consistency, and (for LoRA/QLoRA recipes) re-attaches PEFT adapters.
"""

import os
from pathlib import Path
from typing import Any, Dict

import torch
from rich.console import Console

console = Console()


def _prepare_model_for_single_gpu_eval(model, config: Dict[str, Any]) -> torch.nn.Module:
    """
    Prepare model for single GPU evaluation by removing multi-GPU wrappers
    and ensuring all components are on a single device (cuda:0 or first visible GPU).

    This fixes the cuda:0 vs cuda:1 device mismatch issue by ensuring
    the model is completely on one device before evaluation.

    Args:
        model: The trained model (may be wrapped with DDP/FSDP/DataParallel)
        config: Configuration dictionary

    Returns:
        Clean model on single GPU (cuda:0 if available, otherwise cpu)
    """
    console.print("[blue]🔄 Preparing model for evaluation on single GPU...[/blue]")

    # Determine target device. When CUDA is visible at all, device index 0 is
    # always the first *visible* GPU (CUDA_VISIBLE_DEVICES remaps indices), so
    # "cuda:0" is correct regardless of which physical GPUs are exposed.
    if torch.cuda.is_available():
        target_device = "cuda:0"
        console.print(f"[blue]   🎯 Target device: {target_device}[/blue]")
    else:
        target_device = "cpu"
        console.print("[blue]   🎯 Target device: cpu (no CUDA available)[/blue]")

    # Keep a handle to the untouched model *before* entering the try block so
    # the fallback path in the except handler can never raise a NameError.
    original_model = model

    try:
        # Step 1: Remove any multi-GPU wrappers (DDP/DataParallel expose the
        # underlying model via the ``.module`` attribute).
        if hasattr(model, 'module'):
            model = model.module
            console.print("[blue]   ✅ Removed DDP/DataParallel wrapper[/blue]")

        # Step 2: Move model to CPU first to clear any device state
        model = model.cpu()
        console.print("[blue]   ✅ Model moved to CPU[/blue]")

        # Step 3: Clear any cached states or buffers (model-specific hook)
        if hasattr(model, 'clear_cache'):
            model.clear_cache()
            console.print("[blue]   ✅ Model cache cleared[/blue]")

        # Step 4: Move to target device
        model = model.to(target_device)
        console.print(f"[blue]   ✅ Model moved to {target_device}[/blue]")

        # Step 5: Verify all components are on the same device (raises on
        # mismatch, which is caught below and triggers the fallback)
        _verify_model_device_consistency(model, target_device)

        # Step 6: Re-attach LoRA adapters if needed
        if config.get("training_recipe", "").lower() in ["lora", "qlora"]:
            model = _reattach_lora_adapters(model, target_device, config)

        console.print(f"[green]✅ Model prepared for single GPU evaluation on {target_device}[/green]")
        return model

    except Exception as e:
        console.print(f"[red]❌ Failed to prepare model for single GPU evaluation: {e}[/red]")
        # Fallback: return original model (may still be wrapped / on the
        # wrong device, but evaluation gets a best-effort chance to run)
        console.print("[yellow]⚠️ Falling back to original model[/yellow]")
        return original_model


def _verify_model_device_consistency(model: torch.nn.Module, target_device: str) -> None:
    """
    Verify that all model components are on the target device.

    Args:
        model: The model to verify
        target_device: Expected device (e.g., "cuda:0", "cpu")

    Raises:
        RuntimeError: If any parameter or buffer lives on a different device.
    """
    target_device = torch.device(target_device)
    issues = []

    # Check parameters
    for name, param in model.named_parameters():
        if param.device != target_device:
            issues.append(f"Parameter {name} on {param.device}, expected {target_device}")

    # Check buffers (e.g. BatchNorm running stats) — these are easy to miss
    # and a common source of cross-device errors at eval time.
    for name, buffer in model.named_buffers():
        if buffer.device != target_device:
            issues.append(f"Buffer {name} on {buffer.device}, expected {target_device}")

    if issues:
        console.print("[red]❌ Device consistency issues found:[/red]")
        for issue in issues:
            console.print(f"[red]   - {issue}[/red]")
        raise RuntimeError(f"Model device consistency issues: {issues}")
    else:
        console.print(f"[green]   ✅ All model components on {target_device}[/green]")


def _reattach_lora_adapters(model: torch.nn.Module, target_device: str, config: Dict[str, Any]) -> torch.nn.Module:
    """
    Re-attach LoRA adapters if they were used during training.

    Args:
        model: The base model
        target_device: Target device for the model
        config: Configuration dictionary

    Returns:
        Model with LoRA adapters re-attached (or the unmodified model if no
        adapters are found, PEFT is unavailable, or loading fails).
    """
    try:
        # Check if LoRA adapters exist under <output_dir>/final_model
        adapter_path = Path(config.get("output_dir", "runs/humigence")) / "final_model"

        if adapter_path.exists() and (adapter_path / "adapter_config.json").exists():
            console.print("[blue]   🔧 Re-attaching LoRA adapters...[/blue]")

            # Import PEFT lazily so this module works without it installed
            from peft import PeftModel

            # Load the model with LoRA adapters and place it on the target device
            model = PeftModel.from_pretrained(model, str(adapter_path))
            model = model.to(target_device)
            console.print("[blue]   ✅ LoRA adapters re-attached[/blue]")

    except ImportError:
        console.print("[yellow]   ⚠️ PEFT not available, skipping LoRA re-attachment[/yellow]")
    except Exception as e:
        # Best-effort: adapter loading failures are reported but never fatal
        console.print(f"[yellow]   ⚠️ Failed to re-attach LoRA adapters: {e}[/yellow]")

    return model


def _move_batch_to_device(batch: Dict[str, torch.Tensor], target_device: str) -> Dict[str, torch.Tensor]:
    """
    Move all tensors in a batch to the target device.

    Args:
        batch: Dictionary of tensors
        target_device: Target device (e.g., "cuda:0", "cpu")

    Returns:
        Batch with all tensors on target device (non-tensor values are
        passed through unchanged).
    """
    target_device = torch.device(target_device)
    moved_batch = {}

    for key, value in batch.items():
        # Duck-type on ``.to`` so non-tensor entries (lists, ints, strings)
        # survive untouched.
        if hasattr(value, "to"):
            moved_batch[key] = value.to(target_device)
        else:
            moved_batch[key] = value

    return moved_batch


def _move_tensors_to_cpu(*tensors) -> tuple:
    """
    Move all tensors to CPU and detach them.

    Args:
        *tensors: Variable number of tensors (non-tensors are passed through)

    Returns:
        Tuple of CPU tensors
    """
    cpu_tensors = []
    for tensor in tensors:
        if hasattr(tensor, "detach"):
            cpu_tensors.append(tensor.detach().cpu())
        else:
            cpu_tensors.append(tensor)
    return tuple(cpu_tensors)