File size: 100,195 Bytes

67cdd6e

"""
Training Loop for DETR
"""
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
from torch.utils.data import DataLoader
from typing import Dict, Optional

# TensorBoard (optional)
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    SummaryWriter = None
import os
import signal
import sys
from pathlib import Path
import time
import gc
import sys  # Ensure sys is imported for flush()
from src.training.evaluator import Evaluator
from src.training.adaptive_optimizer import AdaptiveOptimizer

# Mixed precision training
from torch.amp import autocast, GradScaler

# Memory monitoring
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

# MLflow tracking
try:
    import mlflow
    import mlflow.pytorch
    MLFLOW_AVAILABLE = True
except ImportError:
    MLFLOW_AVAILABLE = False
    mlflow = None

# MLflow debugging flag
MLFLOW_DEBUG = True  # Set to True to enable detailed MLflow error logging


class MLflowDebugger:
    """Comprehensive MLflow operation debugging"""
    
    def __init__(self, tracking_uri, run_id=None, experiment_id=None):
        self.tracking_uri = tracking_uri
        self.run_id = run_id
        self.experiment_id = experiment_id
        self.operation_count = 0
        
    def _get_timestamp(self):
        """Get formatted timestamp"""
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + f".{int(time.time() * 1000) % 1000:03d}"
    
    def _format_size(self, size_bytes):
        """Format bytes to human readable"""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.2f}{unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.2f}PB"
    
    def check_filesystem_health(self, path=None):
        """Check disk space, permissions, path validity"""
        health = {
            'disk_total': 0,
            'disk_free': 0,
            'disk_used': 0,
            'disk_percent': 0,
            'path_exists': False,
            'path_writable': False,
            'path_readable': False,
            'fs_type': None,
            'mount_point': None
        }
        
        try:
            import shutil
            import stat
            
            # Get disk usage for the path or current directory
            check_path = path if path else os.getcwd()
            statvfs = os.statvfs(check_path)
            
            # Calculate disk space
            total_bytes = statvfs.f_frsize * statvfs.f_blocks
            free_bytes = statvfs.f_frsize * statvfs.f_bavail
            used_bytes = total_bytes - free_bytes
            percent_used = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0
            
            health['disk_total'] = total_bytes
            health['disk_free'] = free_bytes
            health['disk_used'] = used_bytes
            health['disk_percent'] = percent_used
            
            # Check path permissions
            if path and os.path.exists(path):
                health['path_exists'] = True
                health['path_readable'] = os.access(path, os.R_OK)
                health['path_writable'] = os.access(path, os.W_OK)
            
            # Try to get filesystem info
            try:
                if PSUTIL_AVAILABLE:
                    disk = psutil.disk_usage(check_path)
                    health['disk_total'] = disk.total
                    health['disk_free'] = disk.free
                    health['disk_used'] = disk.used
                    health['disk_percent'] = disk.percent
            except:
                pass
                
        except Exception as e:
            if MLFLOW_DEBUG:
                print(f"[MLFLOW DEBUG] Error checking filesystem health: {e}")
        
        return health
    
    def check_mlflow_backend_health(self):
        """Verify MLflow backend is accessible"""
        health = {
            'backend_accessible': False,
            'experiment_accessible': False,
            'run_accessible': False,
            'run_active': False,
            'tracking_uri': self.tracking_uri,
            'error': None
        }
        
        if not MLFLOW_AVAILABLE:
            health['error'] = "MLflow not available"
            return health
        
        try:
            # Check tracking URI
            current_uri = mlflow.get_tracking_uri()
            health['backend_accessible'] = (current_uri == self.tracking_uri or self.tracking_uri in current_uri)
            
            # Check experiment
            if self.experiment_id:
                try:
                    experiment = mlflow.get_experiment(self.experiment_id)
                    health['experiment_accessible'] = experiment is not None
                except:
                    health['experiment_accessible'] = False
            
            # Check run
            if self.run_id:
                try:
                    run = mlflow.get_run(self.run_id)
                    health['run_accessible'] = run is not None
                    health['run_active'] = run.info.status == "RUNNING"
                except:
                    health['run_accessible'] = False
                    health['run_active'] = False
                    
        except Exception as e:
            health['error'] = str(e)
            if MLFLOW_DEBUG:
                print(f"[MLFLOW DEBUG] Backend health check error: {e}")
        
        return health
    
    def log_operation_start(self, operation_name, **kwargs):
        """Log before MLflow operation"""
        self.operation_count += 1
        timestamp = self._get_timestamp()
        
        # Get filesystem health
        fs_health = self.check_filesystem_health()
        
        # Build log message
        details = []
        if 'file_path' in kwargs:
            file_path = kwargs['file_path']
            details.append(f"file={file_path}")
            if os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                details.append(f"size={self._format_size(file_size)}")
        if 'artifact_path' in kwargs:
            details.append(f"artifact_path={kwargs['artifact_path']}")
        if 'metric_name' in kwargs:
            details.append(f"metric={kwargs['metric_name']}")
            details.append(f"value={kwargs.get('value', 'N/A')}")
        if 'step' in kwargs:
            details.append(f"step={kwargs['step']}")
        
        details.append(f"disk_free={self._format_size(fs_health['disk_free'])}")
        details.append(f"disk_used={fs_health['disk_percent']:.1f}%")
        
        log_msg = f"[MLFLOW DEBUG] [{timestamp}] [{operation_name}] [START] {' '.join(details)}"
        print(log_msg, flush=True)
        
        return {
            'start_time': time.time(),
            'operation_name': operation_name,
            'fs_health': fs_health,
            'kwargs': kwargs
        }
    
    def log_operation_success(self, operation_name, duration, context=None, **kwargs):
        """Log after successful operation"""
        timestamp = self._get_timestamp()
        
        # Get filesystem health after operation
        fs_health = self.check_filesystem_health()
        
        details = []
        details.append(f"duration={duration:.3f}s")
        
        if context and 'file_path' in context.get('kwargs', {}):
            file_path = context['kwargs']['file_path']
            if os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                details.append(f"file_size={self._format_size(file_size)}")
        
        details.append(f"disk_free={self._format_size(fs_health['disk_free'])}")
        details.append(f"disk_used={fs_health['disk_percent']:.1f}%")
        
        log_msg = f"[MLFLOW DEBUG] [{timestamp}] [{operation_name}] [SUCCESS] {' '.join(details)}"
        print(log_msg, flush=True)
    
    def log_operation_error(self, operation_name, error, context=None, **kwargs):
        """Log detailed error information"""
        timestamp = self._get_timestamp()
        
        # Get filesystem health
        fs_health = self.check_filesystem_health()
        
        # Get backend health
        backend_health = self.check_mlflow_backend_health()
        
        # Extract error details
        error_type = type(error).__name__
        error_msg = str(error)
        
        # Check for MLflow-specific error codes
        mlflow_error_code = None
        if hasattr(error, 'error_code'):
            mlflow_error_code = error.error_code
        elif 'INTERNAL_ERROR' in error_msg or 'INTERNAL_ERROR' in error_type:
            mlflow_error_code = 'INTERNAL_ERROR'
        
        # Get full traceback
        import traceback
        tb_str = traceback.format_exc()
        
        # Build log message
        details = []
        details.append(f"error_type={error_type}")
        if mlflow_error_code:
            details.append(f"mlflow_error_code={mlflow_error_code}")
        details.append(f"error_msg={error_msg}")
        
        if context:
            if 'file_path' in context.get('kwargs', {}):
                details.append(f"file={context['kwargs']['file_path']}")
            if 'artifact_path' in context.get('kwargs', {}):
                details.append(f"artifact_path={context['kwargs']['artifact_path']}")
            if 'start_time' in context:
                elapsed = time.time() - context['start_time']
                details.append(f"elapsed={elapsed:.3f}s")
        
        details.append(f"disk_free={self._format_size(fs_health['disk_free'])}")
        details.append(f"disk_used={fs_health['disk_percent']:.1f}%")
        details.append(f"backend_accessible={backend_health['backend_accessible']}")
        details.append(f"run_active={backend_health['run_active']}")
        
        # OSError specific details
        if isinstance(error, (OSError, IOError)):
            if hasattr(error, 'errno'):
                details.append(f"errno={error.errno}")
            if hasattr(error, 'strerror'):
                details.append(f"strerror={error.strerror}")
        
        log_msg = f"[MLFLOW DEBUG] [{timestamp}] [{operation_name}] [ERROR] {' '.join(details)}"
        print(log_msg, flush=True)
        
        # Print traceback if debug enabled
        if MLFLOW_DEBUG:
            print(f"[MLFLOW DEBUG] Full traceback:\n{tb_str}", flush=True)
        
        # Print filesystem details
        if MLFLOW_DEBUG:
            print(f"[MLFLOW DEBUG] Filesystem state: total={self._format_size(fs_health['disk_total'])}, "
                  f"free={self._format_size(fs_health['disk_free'])}, "
                  f"used={fs_health['disk_percent']:.1f}%", flush=True)
        
        # Print backend state
        if MLFLOW_DEBUG:
            print(f"[MLFLOW DEBUG] Backend state: uri={backend_health['tracking_uri']}, "
                  f"experiment_accessible={backend_health['experiment_accessible']}, "
                  f"run_accessible={backend_health['run_accessible']}, "
                  f"run_active={backend_health['run_active']}", flush=True)
            if backend_health['error']:
                print(f"[MLFLOW DEBUG] Backend error: {backend_health['error']}", flush=True)


class Trainer:
    """DETR Trainer"""
    
    def __init__(self, model: nn.Module, train_loader: DataLoader,
                 val_loader: DataLoader, config: Dict, device: torch.device,
                 writer: Optional = None, mlflow_run=None, real_val_path: Optional[str] = None):
        """
        Initialize trainer
        
        Args:
            model: DETR model
            train_loader: Training data loader
            val_loader: Validation data loader
            config: Training configuration
            device: Device to train on
            writer: TensorBoard writer (optional)
            mlflow_run: MLflow run object (optional)
            real_val_path: Path to real validation set for generalization testing (optional)
        """
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config
        self.device = device
        self.writer = writer
        self.mlflow_run = mlflow_run
        self.use_mlflow = mlflow_run is not None and MLFLOW_AVAILABLE
        self.real_val_path = real_val_path
        self.mlflow_failure_count = 0  # Track consecutive MLflow failures
        self.mlflow_max_failures = 5  # Disable MLflow after this many consecutive failures
        # Disable model logging by default (causes INTERNAL_ERROR) - can be enabled via config
        self.mlflow_model_logging_enabled = config.get('logging', {}).get('mlflow_log_models', False)
        if not self.mlflow_model_logging_enabled:
            print("[MLFLOW] Model artifact logging disabled (set mlflow_log_models: true in config to enable)")
        
        # Initialize MLflow debugger (always initialize, even if MLflow is disabled)
        tracking_uri = config.get('logging', {}).get('mlflow_tracking_uri', 'file:./mlruns')
        if self.use_mlflow and mlflow_run:
            run_id = mlflow_run.info.run_id if hasattr(mlflow_run, 'info') else None
            experiment_id = mlflow_run.info.experiment_id if hasattr(mlflow_run, 'info') else None
            self.mlflow_debugger = MLflowDebugger(tracking_uri, run_id, experiment_id)
        else:
            # Initialize with None values if MLflow is not available
            self.mlflow_debugger = MLflowDebugger(tracking_uri, None, None)
        
        # Setup mixed precision training (AMP)
        self.use_amp = config['training'].get('mixed_precision', False)
        self.scaler = GradScaler('cuda') if self.use_amp else None
        if self.use_amp:
            print("Mixed precision training (AMP) enabled")
        
        # Setup gradient accumulation
        self.gradient_accumulation_steps = config['training'].get('gradient_accumulation_steps', 1)
        if self.gradient_accumulation_steps > 1:
            print(f"Gradient accumulation enabled: {self.gradient_accumulation_steps} steps")
        
        # Memory cleanup frequency
        self.memory_cleanup_frequency = config['training'].get('memory_cleanup_frequency', 10)
        
        # Setup adaptive optimizer (if enabled)
        self.adaptive_optimizer = None
        if config['training'].get('adaptive_optimization', False):
            self.adaptive_optimizer = AdaptiveOptimizer(
                initial_num_workers=config['dataset']['num_workers'],
                initial_prefetch_factor=config['dataset'].get('prefetch_factor', 2),
                target_gpu_utilization=config['training'].get('target_gpu_utilization', 0.85),
                max_ram_usage=config['training'].get('max_ram_usage', 0.80),
                adjustment_interval=config['training'].get('adaptive_adjustment_interval', 50)
            )
            print("Adaptive optimization enabled - monitoring resource usage")
        
        # Setup optimizer
        self.optimizer = AdamW(
            model.parameters(),
            lr=config['optimizer']['lr'],
            betas=config['optimizer']['betas'],
            weight_decay=config['optimizer']['weight_decay']
        )
        
        # Setup learning rate scheduler
        warmup_epochs = config['lr_schedule']['warmup_epochs']
        num_epochs = config['training']['num_epochs']
        
        if warmup_epochs > 0:
            warmup_scheduler = LinearLR(
                self.optimizer,
                start_factor=0.1,
                end_factor=1.0,
                total_iters=warmup_epochs * len(train_loader)
            )
            cosine_scheduler = CosineAnnealingLR(
                self.optimizer,
                T_max=(num_epochs - warmup_epochs) * len(train_loader),
                eta_min=config['lr_schedule'].get('min_lr', 1e-6)
            )
            self.scheduler = SequentialLR(
                self.optimizer,
                schedulers=[warmup_scheduler, cosine_scheduler],
                milestones=[warmup_epochs * len(train_loader)]
            )
        else:
            self.scheduler = CosineAnnealingLR(
                self.optimizer,
                T_max=num_epochs * len(train_loader),
                eta_min=config['lr_schedule'].get('min_lr', 1e-6)
            )
        
        # Setup evaluator
        self.evaluator = Evaluator(config['evaluation'])
        
        # Training state
        self.best_map = 0.0
        self.global_step = 0
        self.interrupted = False
        
        # Setup signal handlers for graceful shutdown
        def signal_handler(signum, frame):
            print(f"\n\nReceived signal {signum}. Saving checkpoint before exit...")
            self.interrupted = True
        
        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)
    
    def _get_memory_usage(self) -> Dict[str, float]:
        """Get current memory usage in GB"""
        memory_info = {}
        if PSUTIL_AVAILABLE:
            process = psutil.Process(os.getpid())
            memory_info['ram_gb'] = process.memory_info().rss / (1024 ** 3)
        if torch.cuda.is_available():
            memory_info['gpu_gb'] = torch.cuda.memory_allocated() / (1024 ** 3)
            memory_info['gpu_reserved_gb'] = torch.cuda.memory_reserved() / (1024 ** 3)
        return memory_info
    
    def train_epoch(self, epoch: int):
        """Train for one epoch"""
        print(f"[VERBOSE] train_epoch({epoch}) called", flush=True)
        
        self.model.train()
        print(f"[VERBOSE] Model set to train mode", flush=True)
        
        total_loss = 0.0
        num_batches = 0
        
        print_freq = self.config['logging']['print_frequency']
        
        # Initialize gradient accumulation
        accumulation_loss = 0.0
        accumulation_loss_components = {}  # Track individual loss components
        accumulation_count = 0
        
        # Zero gradients at start
        self.optimizer.zero_grad()
        print(f"[VERBOSE] Gradients zeroed, about to iterate over train_loader (len={len(self.train_loader)})", flush=True)
        print(f"[VERBOSE] Getting iterator from train_loader...", flush=True)
        
        for batch_idx, (images, targets) in enumerate(self.train_loader):
            if batch_idx == 0:
                print(f"[VERBOSE] Starting batch 0 at {time.strftime('%Y-%m-%d %H:%M:%S')}", flush=True)
            elif batch_idx % 10 == 0:
                print(f"[VERBOSE] Processing batch {batch_idx}/{len(self.train_loader)}", flush=True)
            
            # Track data loading time (approximate)
            data_load_start = time.time()
            if batch_idx == 0:
                print(f"[VERBOSE] Batch {batch_idx}: Data loaded, moving to device...", flush=True)
            
            # Move to device
            # DETR expects list of images, not batched tensor
            if isinstance(images, torch.Tensor):
                images = [img.to(self.device) for img in images]
            else:
                images = [img.to(self.device) for img in images]
            
            targets = [{k: v.to(self.device) if isinstance(v, torch.Tensor) else v 
                       for k, v in t.items()} for t in targets]
            
            data_load_time = time.time() - data_load_start
            if batch_idx == 0:
                print(f"[VERBOSE] Batch {batch_idx}: Moved to device in {data_load_time:.3f}s, about to forward pass...", flush=True)
            
            # Convert images to channels-last if enabled
            if hasattr(self.model, 'memory_format') and self.model.memory_format == torch.channels_last:
                images = [img.to(memory_format=torch.channels_last) if isinstance(img, torch.Tensor) else img 
                         for img in images]
            
            # Track GPU processing time
            gpu_start = time.time()
            if batch_idx == 0:
                print(f"[VERBOSE] Batch {batch_idx}: Starting forward pass (AMP={self.use_amp})...", flush=True)
            
            # Forward pass with mixed precision
            if self.use_amp:
                with autocast('cuda'):
                    if batch_idx == 0:
                        print(f"[VERBOSE] Batch {batch_idx}: Calling model.forward()...", flush=True)
                    loss_dict = self.model(images, targets)
                    if batch_idx == 0:
                        print(f"[VERBOSE] Batch {batch_idx}: model.forward() completed", flush=True)
                    losses = sum(loss for loss in loss_dict.values())
                
                # Scale loss by accumulation steps
                scaled_loss = losses / self.gradient_accumulation_steps
                
                # Backward pass with scaling
                self.scaler.scale(scaled_loss).backward()
            else:
                # Standard precision
                if batch_idx == 0:
                    print(f"[VERBOSE] Batch {batch_idx}: Calling model.forward() (standard precision)...", flush=True)
                loss_dict = self.model(images, targets)
                if batch_idx == 0:
                    print(f"[VERBOSE] Batch {batch_idx}: model.forward() completed", flush=True)
                
                # Log loss components for diagnostics
                if batch_idx % self.config['logging'].get('print_frequency', 20) == 0:
                    loss_components_str = ", ".join([f"{k}: {v.item():.4f}" for k, v in loss_dict.items()])
                    print(f"[DIAGNOSTIC] Epoch {epoch}, Batch {batch_idx}: {loss_components_str}", flush=True)
                
                losses = sum(loss for loss in loss_dict.values())
                
                # Scale loss by accumulation steps
                scaled_loss = losses / self.gradient_accumulation_steps
                
                # Backward pass
                scaled_loss.backward()
            
            gpu_processing_time = time.time() - gpu_start
            
            # Record timing for adaptive optimizer
            if self.adaptive_optimizer:
                self.adaptive_optimizer.record_batch_timing(data_load_time, gpu_processing_time)
            
            # Accumulate loss and individual components
            accumulation_loss += losses.item()
            for loss_name, loss_value in loss_dict.items():
                if loss_name not in accumulation_loss_components:
                    accumulation_loss_components[loss_name] = 0.0
                accumulation_loss_components[loss_name] += loss_value.item()
            accumulation_count += 1
            
            # Log label distribution for diagnostics (first batch of epoch, every 5 epochs)
            if batch_idx == 0 and epoch % 5 == 0:
                label_counts = {}
                for target in targets:
                    labels = target.get('labels', torch.tensor([]))
                    for label in labels.cpu().numpy():
                        label_counts[label] = label_counts.get(label, 0) + 1
                if label_counts:
                    label_dist_str = ", ".join([f"label_{k}: {v}" for k, v in sorted(label_counts.items())])
                    print(f"[DIAGNOSTIC] Epoch {epoch}, Batch 0 label distribution: {label_dist_str}", flush=True)
            
            # Only step optimizer after accumulation steps
            if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
                # Gradient clipping
                if self.config['training'].get('gradient_clip'):
                    if self.use_amp:
                        self.scaler.unscale_(self.optimizer)
                    
                    # Calculate gradient norm before clipping for diagnostics
                    total_norm = torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(),
                        self.config['training']['gradient_clip']
                    )
                    
                    # Log gradient norm periodically
                    if batch_idx % (self.config['logging'].get('print_frequency', 20) * 2) == 0:
                        print(f"[DIAGNOSTIC] Epoch {epoch}, Batch {batch_idx}: Gradient norm: {total_norm.item():.4f}", flush=True)
                
                # Optimizer step
                if self.use_amp:
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                else:
                    self.optimizer.step()
                
                # Zero gradients after step
                self.optimizer.zero_grad()
            
            # Scheduler step (every batch, not every accumulation step)
            self.scheduler.step()
            
            # Adaptive optimization check
            if self.adaptive_optimizer:
                adjustment = self.adaptive_optimizer.adjust_parameters(batch_idx)
                if adjustment:
                    print(f"\n🔧 Adaptive Optimization Adjustment (batch {batch_idx}):")
                    for key, value in adjustment.items():
                        if key != 'metrics':
                            print(f"   {key}: {value}")
                    if 'metrics' in adjustment:
                        m = adjustment['metrics']
                        print(f"   GPU util: {m['avg_gpu_utilization']:.1%}, RAM: {m['avg_ram_usage']:.1%}")
                        print(f"   Data load: {m['avg_data_loading_time']:.3f}s, GPU process: {m['avg_gpu_processing_time']:.3f}s")
                    
                    # Log to MLflow (with error handling and debugging)
                    if self.use_mlflow:
                        try:
                            for key, value in adjustment.items():
                                if key != 'metrics':
                                    if self.mlflow_debugger and self.use_mlflow:
                                        context = self.mlflow_debugger.log_operation_start(
                                            'log_metric',
                                            metric_name=f'adaptive_{key}',
                                            value=value,
                                            step=self.global_step
                                        )
                                    start_time = time.time()
                                    mlflow.log_metric(f'adaptive_{key}', value, step=self.global_step)
                                    if self.mlflow_debugger and self.use_mlflow:
                                        self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
                        except Exception as mlflow_error:
                            # Log detailed error information
                            if self.mlflow_debugger and self.use_mlflow:
                                self.mlflow_debugger.log_operation_error(
                                    'log_metric',
                                    mlflow_error,
                                    context={'operation': 'adaptive_optimization', 'step': self.global_step}
                                )
                            # Don't let MLflow errors stop training
                            if MLFLOW_DEBUG:
                                print(f"[MLFLOW DEBUG] Adaptive MLflow logging failed (non-blocking): {type(mlflow_error).__name__}: {mlflow_error}")
                            pass  # Silently ignore adaptive optimization MLflow errors
            
            # Memory cleanup
            del images, targets, loss_dict, losses
            if (batch_idx + 1) % self.memory_cleanup_frequency == 0:
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                gc.collect()
            
            # Logging (only on accumulation boundaries or last batch)
            if (batch_idx + 1) % self.gradient_accumulation_steps == 0 or (batch_idx + 1) == len(self.train_loader):
                avg_loss = accumulation_loss / accumulation_count if accumulation_count > 0 else 0.0
                total_loss += avg_loss * accumulation_count
                num_batches += accumulation_count
                self.global_step += 1
                
                # Memory monitoring
                memory_info = {}
                if batch_idx % (print_freq * self.gradient_accumulation_steps) == 0:
                    memory_info = self._get_memory_usage()
                    if memory_info:
                        mem_str = ", ".join([f"{k}: {v:.2f}GB" for k, v in memory_info.items()])
                        print(f"Memory: {mem_str}")
                
                # Reduced logging frequency for less I/O overhead
                log_every_n_steps = self.config['logging'].get('log_every_n_steps', 10)
                
                if batch_idx % (print_freq * self.gradient_accumulation_steps) == 0:
                    current_lr = self.optimizer.param_groups[0]['lr']
                    print(f"Epoch [{epoch}] Batch [{batch_idx}/{len(self.train_loader)}] "
                          f"Loss: {avg_loss:.4f} LR: {current_lr:.6f}")
                
                # Less frequent TensorBoard logging
                if self.writer and self.global_step % log_every_n_steps == 0:
                    current_lr = self.optimizer.param_groups[0]['lr']
                    self.writer.add_scalar('Train/Loss', avg_loss, self.global_step)
                    self.writer.add_scalar('Train/LearningRate', current_lr, self.global_step)
                    if memory_info:
                        for key, value in memory_info.items():
                            self.writer.add_scalar(f'Memory/{key}', value, self.global_step)
                
                # MLflow logging (with error handling and retry logic to prevent training interruption)
                if self.use_mlflow and self.global_step % log_every_n_steps == 0:
                    # Circuit breaker: disable MLflow if too many consecutive failures
                    if self.mlflow_failure_count >= self.mlflow_max_failures:
                        if self.global_step % (log_every_n_steps * 50) == 0:  # Only warn occasionally
                            print(f"WARNING: MLflow disabled due to {self.mlflow_failure_count} consecutive failures")
                        self.use_mlflow = False
                        continue
                    
                    max_retries = 2
                    retry_count = 0
                    logged_successfully = False
                    
                    while retry_count < max_retries and not logged_successfully:
                        context = None
                        try:
                            current_lr = self.optimizer.param_groups[0]['lr']
                            
                            # Pre-operation validation
                            if self.mlflow_debugger and self.use_mlflow:
                                # Validate metrics before logging
                                if not (isinstance(avg_loss, (int, float)) and not (torch.isnan(torch.tensor(avg_loss)) or torch.isinf(torch.tensor(avg_loss)))):
                                    print(f"[MLFLOW DEBUG] WARNING: Invalid loss value: {avg_loss}")
                                if not (isinstance(current_lr, (int, float)) and not (torch.isnan(torch.tensor(current_lr)) or torch.isinf(torch.tensor(current_lr)))):
                                    print(f"[MLFLOW DEBUG] WARNING: Invalid LR value: {current_lr}")
                            
                            # Log train_loss metric
                            if self.mlflow_debugger and self.use_mlflow:
                                context = self.mlflow_debugger.log_operation_start(
                                    'log_metric',
                                    metric_name='train_loss',
                                    value=avg_loss,
                                    step=self.global_step
                                )
                            start_time = time.time()
                            mlflow.log_metric('train_loss', avg_loss, step=self.global_step)
                            if self.mlflow_debugger and self.use_mlflow:
                                self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
                            
                            # Log learning_rate metric
                            if self.mlflow_debugger and self.use_mlflow:
                                context = self.mlflow_debugger.log_operation_start(
                                    'log_metric',
                                    metric_name='learning_rate',
                                    value=current_lr,
                                    step=self.global_step
                                )
                            start_time = time.time()
                            mlflow.log_metric('learning_rate', current_lr, step=self.global_step)
                            if self.mlflow_debugger and self.use_mlflow:
                                self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
                            
                            # Log individual loss components
                            if accumulation_loss_components:
                                for loss_name, loss_sum in accumulation_loss_components.items():
                                    avg_component_loss = loss_sum / accumulation_count if accumulation_count > 0 else 0.0
                                    if self.mlflow_debugger and self.use_mlflow:
                                        context = self.mlflow_debugger.log_operation_start(
                                            'log_metric',
                                            metric_name=f'train_{loss_name}',
                                            value=avg_component_loss,
                                            step=self.global_step
                                        )
                                    start_time = time.time()
                                    mlflow.log_metric(f'train_{loss_name}', avg_component_loss, step=self.global_step)
                                    if self.mlflow_debugger and self.use_mlflow:
                                        self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
                            
                            if memory_info:
                                for key, value in memory_info.items():
                                    if self.mlflow_debugger and self.use_mlflow:
                                        context = self.mlflow_debugger.log_operation_start(
                                            'log_metric',
                                            metric_name=f'memory_{key}',
                                            value=value,
                                            step=self.global_step
                                        )
                                    start_time = time.time()
                                    mlflow.log_metric(f'memory_{key}', value, step=self.global_step)
                                    if self.mlflow_debugger and self.use_mlflow:
                                        self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
                            
                            logged_successfully = True
                            self.mlflow_failure_count = 0  # Reset failure count on success
                            
                        except Exception as mlflow_error:
                            retry_count += 1
                            
                            # Log detailed error information
                            if self.mlflow_debugger and self.use_mlflow:
                                self.mlflow_debugger.log_operation_error(
                                    'log_metric',
                                    mlflow_error,
                                    context=context,
                                    retry_count=retry_count,
                                    max_retries=max_retries
                                )
                            
                            error_type = type(mlflow_error).__name__
                            error_msg = f"MLflow logging failed (non-blocking) at step {self.global_step}, attempt {retry_count}/{max_retries}: {error_type}: {mlflow_error}"
                            
                            # Increment failure count
                            self.mlflow_failure_count += 1
                            
                            # Check if it's a recoverable error
                            recoverable_errors = ('RestException', 'ConnectionError', 'Timeout', 'INTERNAL_ERROR')
                            is_recoverable = any(err in str(mlflow_error) or err in error_type for err in recoverable_errors)
                            
                            if retry_count < max_retries and is_recoverable:
                                if MLFLOW_DEBUG:
                                    print(f"[MLFLOW DEBUG] {error_msg} - Retrying...")
                                time.sleep(0.5)  # Brief delay before retry
                            else:
                                # Don't let MLflow errors stop training
                                if self.global_step % (log_every_n_steps * 10) == 0:  # Only print every 10th failure
                                    print(f"WARNING: {error_msg} (failure count: {self.mlflow_failure_count}/{self.mlflow_max_failures})")
                                # Disable MLflow if too many failures
                                if self.mlflow_failure_count >= self.mlflow_max_failures:
                                    print(f"WARNING: MLflow disabled after {self.mlflow_failure_count} consecutive failures. Training continues without MLflow.")
                                    self.use_mlflow = False
                                break
                
                # Reset accumulation
                accumulation_loss = 0.0
                accumulation_loss_components = {}
                accumulation_count = 0
        
        # Handle remaining gradients if last batch didn't complete accumulation cycle
        if accumulation_count > 0:
            # Gradient clipping
            if self.config['training'].get('gradient_clip'):
                if self.use_amp:
                    self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(),
                    self.config['training']['gradient_clip']
                )
            
            # Optimizer step with remaining accumulated gradients
            if self.use_amp:
                self.scaler.step(self.optimizer)
                self.scaler.update()
            else:
                self.optimizer.step()
            
            # Zero gradients
            self.optimizer.zero_grad()
            
            # Log final accumulation
            avg_loss = accumulation_loss / accumulation_count
            total_loss += avg_loss * accumulation_count
            num_batches += accumulation_count
            self.global_step += 1
            
            # Reset accumulation components for next epoch
            accumulation_loss_components = {}
        
        avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
        print(f"[VERBOSE] train_epoch({epoch}) finished: {num_batches} batches, avg_loss={avg_loss:.4f}", flush=True)
        return avg_loss
    
    def validate(self, epoch: int) -> float:
        """Validate model"""
        self.model.eval()
        
        all_predictions = []
        all_targets = []
        
        with torch.no_grad():
            for batch_idx, (images, targets) in enumerate(self.val_loader):
                # DETR expects list of images
                if isinstance(images, torch.Tensor):
                    images = [img.to(self.device) for img in images]
                else:
                    images = [img.to(self.device) for img in images]
                
                # Get predictions (in eval mode, model returns predictions)
                # Use autocast for validation too if AMP is enabled
                if self.use_amp:
                    with autocast('cuda'):
                        outputs = self.model(images)
                else:
                    outputs = self.model(images)
                
                # Store for evaluation
                # Convert target labels from 1-based (1=player, 2=ball) to 0-based (0=player, 1=ball)
                for i, (output, target) in enumerate(zip(outputs, targets)):
                    # Convert target labels: 1→0 (player), 2→1 (ball)
                    target_0based = target.copy()
                    if 'labels' in target_0based and len(target_0based['labels']) > 0:
                        target_0based['labels'] = target_0based['labels'] - 1
                    all_predictions.append(output)
                    all_targets.append(target_0based)
                
                # Memory cleanup during validation
                del images, targets, outputs
                if (batch_idx + 1) % self.memory_cleanup_frequency == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    gc.collect()
        
        # Evaluate
        eval_metrics = self.evaluator.evaluate(all_predictions, all_targets)
        map_score = eval_metrics['map']
        
        print(f"Validation mAP: {map_score:.4f}")
        print(f"Validation Precision: {eval_metrics['precision']:.4f}")
        print(f"Validation Recall: {eval_metrics['recall']:.4f}")
        print(f"Validation F1: {eval_metrics['f1']:.4f}")
        print(f"\nPer-Class Metrics (IoU 0.5):")
        print(f"  Player - mAP@0.5: {eval_metrics.get('player_map_05', eval_metrics.get('player_map', 0.0)):.4f}, Precision: {eval_metrics.get('player_precision_05', eval_metrics.get('player_precision', 0.0)):.4f}, Recall: {eval_metrics.get('player_recall_05', eval_metrics.get('player_recall', 0.0)):.4f}, F1: {eval_metrics.get('player_f1', 0.0):.4f}")
        print(f"  Ball   - mAP@0.5: {eval_metrics.get('ball_map_05', eval_metrics.get('ball_map', 0.0)):.4f}, Precision: {eval_metrics.get('ball_precision_05', eval_metrics.get('ball_precision', 0.0)):.4f}, Recall: {eval_metrics.get('ball_recall_05', eval_metrics.get('ball_recall', 0.0)):.4f}, F1: {eval_metrics.get('ball_f1', 0.0):.4f}")
        print(f"\nPer-Class Metrics (IoU 0.75):")
        print(f"  Player - mAP@0.75: {eval_metrics.get('player_map_75', 0.0):.4f}")
        print(f"  Ball   - mAP@0.75: {eval_metrics.get('ball_map_75', 0.0):.4f}")
        if 'ball_avg_predictions_per_image' in eval_metrics:
            print(f"\nBall Detection:")
            print(f"  Avg predictions per image with balls: {eval_metrics['ball_avg_predictions_per_image']:.2f}")
            print(f"  Images with balls: {eval_metrics.get('images_with_balls', 0)}")
        
        if self.writer:
            self.writer.add_scalar('Val/mAP', map_score, epoch)
            self.writer.add_scalar('Val/Precision', eval_metrics['precision'], epoch)
            self.writer.add_scalar('Val/Recall', eval_metrics['recall'], epoch)
            self.writer.add_scalar('Val/F1', eval_metrics['f1'], epoch)
            # Per-class metrics
            self.writer.add_scalar('Val/Player_mAP', eval_metrics['player_map'], epoch)
            self.writer.add_scalar('Val/Player_Precision', eval_metrics['player_precision'], epoch)
            self.writer.add_scalar('Val/Player_Recall', eval_metrics['player_recall'], epoch)
            self.writer.add_scalar('Val/Player_F1', eval_metrics['player_f1'], epoch)
            self.writer.add_scalar('Val/Ball_mAP', eval_metrics['ball_map'], epoch)
            self.writer.add_scalar('Val/Ball_Precision', eval_metrics['ball_precision'], epoch)
            self.writer.add_scalar('Val/Ball_Recall', eval_metrics['ball_recall'], epoch)
            self.writer.add_scalar('Val/Ball_F1', eval_metrics['ball_f1'], epoch)
        
        # MLflow logging for validation (with error handling and retry logic)
        if self.use_mlflow:
            # Circuit breaker: skip if too many failures
            if self.mlflow_failure_count >= self.mlflow_max_failures:
                return map_score
            
            max_retries = 2
            retry_count = 0
            logged_successfully = False
            
            while retry_count < max_retries and not logged_successfully:
                context = None
                try:
                    # Pre-operation validation - check backend health
                    if self.mlflow_debugger and self.use_mlflow:
                        backend_health = self.mlflow_debugger.check_mlflow_backend_health()
                        if not backend_health['backend_accessible']:
                            print(f"[MLFLOW DEBUG] WARNING: Backend not accessible before validation logging")
                        if not backend_health['run_active']:
                            print(f"[MLFLOW DEBUG] WARNING: Run not active before validation logging")
                    
                    # Overall metrics - log with debugging
                    metrics_to_log = [
                        ('val_map', map_score),
                        ('val_precision', eval_metrics['precision']),
                        ('val_recall', eval_metrics['recall']),
                        ('val_f1', eval_metrics['f1']),
                        ('val_player_map_05', eval_metrics['player_map_05']),
                        ('val_player_precision_05', eval_metrics['player_precision_05']),
                        ('val_player_recall_05', eval_metrics['player_recall_05']),
                        ('val_player_f1', eval_metrics['player_f1']),
                        ('val_player_map_75', eval_metrics['player_map_75']),
                        ('val_ball_map_05', eval_metrics['ball_map_05']),
                        ('val_ball_precision_05', eval_metrics['ball_precision_05']),
                        ('val_ball_recall_05', eval_metrics['ball_recall_05']),
                        ('val_ball_f1', eval_metrics['ball_f1']),
                        ('val_ball_map_75', eval_metrics['ball_map_75']),
                        ('val_ball_avg_predictions_per_image', eval_metrics['ball_avg_predictions_per_image']),
                        ('val_images_with_balls', eval_metrics['images_with_balls']),
                        ('val_player_map', eval_metrics.get('player_map', eval_metrics['player_map_05'])),
                        ('val_player_precision', eval_metrics.get('player_precision', eval_metrics['player_precision_05'])),
                        ('val_player_recall', eval_metrics.get('player_recall', eval_metrics['player_recall_05'])),
                        ('val_ball_map', eval_metrics.get('ball_map', eval_metrics['ball_map_05'])),
                        ('val_ball_precision', eval_metrics.get('ball_precision', eval_metrics['ball_precision_05'])),
                        ('val_ball_recall', eval_metrics.get('ball_recall', eval_metrics['ball_recall_05'])),
                    ]
                    
                    for metric_name, metric_value in metrics_to_log:
                        # Validate metric value
                        if self.mlflow_debugger and self.use_mlflow:
                            if not isinstance(metric_value, (int, float)) or (isinstance(metric_value, float) and (torch.isnan(torch.tensor(metric_value)) or torch.isinf(torch.tensor(metric_value)))):
                                print(f"[MLFLOW DEBUG] WARNING: Invalid metric value for {metric_name}: {metric_value}")
                        
                        if self.mlflow_debugger and self.use_mlflow:
                            context = self.mlflow_debugger.log_operation_start(
                                'log_metric',
                                metric_name=metric_name,
                                value=metric_value,
                                step=epoch
                            )
                        start_time = time.time()
                        mlflow.log_metric(metric_name, metric_value, step=epoch)
                        if self.mlflow_debugger and self.use_mlflow:
                            self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
                    
                    # Goal tracking - log goal achievement status
                    self._log_goals_to_mlflow(eval_metrics, epoch)
                    
                    logged_successfully = True
                    self.mlflow_failure_count = 0  # Reset failure count on success
                    
                except Exception as mlflow_error:
                    retry_count += 1
                    
                    # Log detailed error information
                    if self.mlflow_debugger and self.use_mlflow:
                        self.mlflow_debugger.log_operation_error(
                            'log_metric',
                            mlflow_error,
                            context=context,
                            retry_count=retry_count,
                            max_retries=max_retries,
                            epoch=epoch
                        )
                    
                    error_type = type(mlflow_error).__name__
                    error_msg = f"MLflow validation logging failed (non-blocking) at epoch {epoch}, attempt {retry_count}/{max_retries}: {error_type}: {mlflow_error}"
                    
                    # Increment failure count
                    self.mlflow_failure_count += 1
                    
                    # Check if recoverable
                    recoverable = any(err in str(mlflow_error) or err in error_type for err in ('RestException', 'ConnectionError', 'Timeout', 'INTERNAL_ERROR'))
                    
                    if retry_count < max_retries and recoverable:
                        if MLFLOW_DEBUG:
                            print(f"[MLFLOW DEBUG] {error_msg} - Retrying...")
                        time.sleep(1)
                    else:
                        print(f"WARNING: {error_msg} (failure count: {self.mlflow_failure_count}/{self.mlflow_max_failures})")
                        # Disable MLflow if too many failures
                        if self.mlflow_failure_count >= self.mlflow_max_failures:
                            print(f"WARNING: MLflow disabled after {self.mlflow_failure_count} consecutive failures. Training continues without MLflow.")
                            self.use_mlflow = False
                        break
        
        # Final cleanup after validation
        del all_predictions, all_targets
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        
        return map_score

    
    def test_real_validation(self, epoch: int):
        """
        Test on real validation set to track generalization
        Logs results to MLflow
        """
        import json
        from PIL import Image
        import numpy as np
        from scipy.optimize import linear_sum_assignment
        from src.training.augmentation import get_val_transforms
        
        if not self.real_val_path:
            return
        
        real_val_dir = Path(self.real_val_path)
        annotation_file = real_val_dir / "just_ball_and_people.json"
        
        if not annotation_file.exists():
            print(f"Warning: Real validation annotation file not found: {annotation_file}")
            return
        
        print(f"\n🧪 Testing on real validation set (generalization)...")
        
        # Load annotations
        with open(annotation_file, 'r') as f:
            coco_data = json.load(f)
        
        images = {img['id']: img for img in coco_data['images']}
        image_annotations = {}
        for ann in coco_data['annotations']:
            img_id = ann['image_id']
            if img_id not in image_annotations:
                image_annotations[img_id] = []
            image_annotations[img_id].append(ann)
        
        # Setup transforms
        val_transforms = get_val_transforms(self.config['augmentation']['val'])
        evaluator = Evaluator({'iou_thresholds': [0.5], 'max_detections': 100})
        threshold = 0.1  # Lower threshold for real validation
        
        all_predictions = []
        all_targets = []
        ball_tp = 0
        ball_fp = 0
        ball_fn = 0
        
        self.model.eval()
        with torch.no_grad():
            for image_id in sorted(images.keys()):
                image_info = images[image_id]
                image_path = real_val_dir / image_info['file_name']
                
                if not image_path.exists():
                    continue
                
                try:
                    pil_image = Image.open(image_path).convert('RGB')
                except Exception as e:
                    continue
                
                dummy_target = {
                    'boxes': torch.zeros((0, 4), dtype=torch.float32),
                    'labels': torch.zeros((0,), dtype=torch.int64),
                    'image_id': torch.tensor([image_id], dtype=torch.int64),
                    'area': torch.zeros((0,), dtype=torch.float32),
                    'iscrowd': torch.zeros((0,), dtype=torch.int64)
                }
                
                image_tensor, _ = val_transforms(pil_image, dummy_target)
                image_tensor = image_tensor.to(self.device)
                outputs = self.model([image_tensor])
                
                output = outputs[0]
                pred_scores = output['scores'].cpu()
                pred_labels = output['labels'].cpu()
                pred_boxes = output['boxes'].cpu()
                
                # Filter by confidence threshold
                mask = pred_scores >= threshold
                pred_scores_filtered = pred_scores[mask]
                pred_labels_filtered = pred_labels[mask]
                pred_boxes_filtered = pred_boxes[mask]
                
                # Convert labels from 1-based to 0-based
                valid_mask = pred_labels_filtered > 0
                pred_labels_filtered = pred_labels_filtered[valid_mask] - 1
                pred_scores_filtered = pred_scores_filtered[valid_mask]
                pred_boxes_filtered = pred_boxes_filtered[valid_mask]
                
                # Get ground truth
                annotations = image_annotations.get(image_id, [])
                gt_boxes = []
                gt_labels = []
                
                for ann in annotations:
                    bbox = ann['bbox']
                    x, y, w, h = bbox
                    x_min, y_min, x_max, y_max = x, y, x + w, y + h
                    cat_id = ann['category_id']
                    gt_boxes.append([x_min, y_min, x_max, y_max])
                    gt_labels.append(cat_id)
                
                if len(pred_boxes_filtered) == 0:
                    pred_boxes_tensor = torch.zeros((0, 4), dtype=torch.float32)
                    pred_scores_tensor = torch.zeros((0,), dtype=torch.float32)
                    pred_labels_tensor = torch.zeros((0,), dtype=torch.int64)
                else:
                    pred_boxes_tensor = pred_boxes_filtered
                    pred_scores_tensor = pred_scores_filtered
                    pred_labels_tensor = pred_labels_filtered
                
                if len(gt_boxes) == 0:
                    gt_boxes_tensor = torch.zeros((0, 4), dtype=torch.float32)
                    gt_labels_tensor = torch.zeros((0,), dtype=torch.int64)
                else:
                    gt_boxes_tensor = torch.tensor(gt_boxes, dtype=torch.float32)
                    gt_labels_tensor = torch.tensor(gt_labels, dtype=torch.int64)
                
                all_predictions.append({
                    'boxes': pred_boxes_tensor,
                    'scores': pred_scores_tensor,
                    'labels': pred_labels_tensor
                })
                
                all_targets.append({
                    'boxes': gt_boxes_tensor,
                    'labels': gt_labels_tensor
                })
                
                # Calculate ball metrics
                pred_boxes_np = pred_boxes_tensor.cpu().numpy()
                pred_labels_np = pred_labels_tensor.cpu().numpy()
                pred_scores_np = pred_scores_tensor.cpu().numpy()
                
                target_boxes_np = gt_boxes_tensor.cpu().numpy()
                target_labels_np = gt_labels_tensor.cpu().numpy()
                
                ball_pred_mask = pred_labels_np == 1
                ball_target_mask = target_labels_np == 1
                
                if np.sum(ball_target_mask) == 0:
                    ball_fp += np.sum(ball_pred_mask)
                    continue
                
                if np.sum(ball_pred_mask) == 0:
                    ball_fn += np.sum(ball_target_mask)
                    continue
                
                ball_pred_boxes = pred_boxes_np[ball_pred_mask]
                ball_target_boxes = target_boxes_np[ball_target_mask]
                ball_pred_scores = pred_scores_np[ball_pred_mask]
                
                ious = evaluator._compute_ious(ball_pred_boxes, ball_target_boxes)
                matched_targets = np.zeros(len(ball_target_boxes), dtype=bool)
                sorted_indices = np.argsort(ball_pred_scores)[::-1]
                
                tp = 0
                for pred_idx in sorted_indices:
                    best_iou = 0.0
                    best_target_idx = -1
                    
                    for target_idx in range(len(ball_target_boxes)):
                        if not matched_targets[target_idx]:
                            iou = ious[pred_idx, target_idx]
                            if iou > best_iou:
                                best_iou = iou
                                best_target_idx = target_idx
                    
                    if best_iou >= 0.5:
                        matched_targets[best_target_idx] = True
                        tp += 1
                    else:
                        ball_fp += 1
                
                ball_tp += tp
                ball_fn += np.sum(~matched_targets)
        
        # Calculate metrics
        metrics = evaluator.evaluate(all_predictions, all_targets)
        
        ball_recall = ball_tp / (ball_tp + ball_fn) if (ball_tp + ball_fn) > 0 else 0.0
        ball_precision = ball_tp / (ball_tp + ball_fp) if (ball_tp + ball_fp) > 0 else 0.0
        
        # Print results
        print(f"  Real Val - Ball: mAP={metrics['ball_map_05']:.4f}, Recall={ball_recall:.4f}, Prec={ball_precision:.4f}")
        print(f"  Real Val - Player: mAP={metrics['player_map_05']:.4f}, Recall={metrics['player_recall_05']:.4f}, Prec={metrics['player_precision_05']:.4f}")
        
        # Log to MLflow
        if self.use_mlflow:
            try:
                mlflow.log_metric('real_val_ball_map_05', float(metrics['ball_map_05']), step=epoch)
                mlflow.log_metric('real_val_ball_recall', float(ball_recall), step=epoch)
                mlflow.log_metric('real_val_ball_precision', float(ball_precision), step=epoch)
                mlflow.log_metric('real_val_player_map_05', float(metrics['player_map_05']), step=epoch)
                mlflow.log_metric('real_val_player_recall_05', float(metrics['player_recall_05']), step=epoch)
                mlflow.log_metric('real_val_player_precision_05', float(metrics['player_precision_05']), step=epoch)
                mlflow.log_metric('real_val_ball_tp', int(ball_tp), step=epoch)
                mlflow.log_metric('real_val_ball_fp', int(ball_fp), step=epoch)
                mlflow.log_metric('real_val_ball_fn', int(ball_fn), step=epoch)
                print(f"  ✅ Logged real validation metrics to MLflow")
            except Exception as e:
                print(f"  Warning: Failed to log real validation metrics to MLflow: {e}")
        
        # Log to TensorBoard
        if self.writer:
            self.writer.add_scalar('RealVal/Ball_mAP', metrics['ball_map_05'], epoch)
            self.writer.add_scalar('RealVal/Ball_Recall', ball_recall, epoch)
            self.writer.add_scalar('RealVal/Ball_Precision', ball_precision, epoch)
            self.writer.add_scalar('RealVal/Player_mAP', metrics['player_map_05'], epoch)
            self.writer.add_scalar('RealVal/Player_Recall', metrics['player_recall_05'], epoch)
            self.writer.add_scalar('RealVal/Player_Precision', metrics['player_precision_05'], epoch)
    
    def _log_goals_to_mlflow(self, eval_metrics: Dict[str, float], epoch: int):
        """
        Log goal tracking metrics to MLflow
        
        Args:
            eval_metrics: Dictionary of evaluation metrics
            epoch: Current epoch number
        """
        if not self.use_mlflow:
            return
        
        # Define goals
        goals = {
            # Player goals
            'goal_player_recall_05': 0.95,
            'goal_player_precision_05': 0.80,
            'goal_player_map_05': 0.85,
            'goal_player_map_75': 0.70,
            
            # Ball goals
            'goal_ball_recall_05': 0.80,
            'goal_ball_precision_05': 0.70,
            'goal_ball_map_05': 0.70,
            'goal_ball_avg_predictions_per_image': 1.0,  # At least 1 prediction per image with balls
        }
        
        # Log goal achievement status (1.0 = achieved, 0.0 = not achieved)
        goal_achievements = {}
        
        # Player goals
        player_recall_05 = eval_metrics.get('player_recall_05', 0.0)
        player_precision_05 = eval_metrics.get('player_precision_05', 0.0)
        player_map_05 = eval_metrics.get('player_map_05', 0.0)
        player_map_75 = eval_metrics.get('player_map_75', 0.0)
        
        goal_achievements['goal_player_recall_05_achieved'] = 1.0 if player_recall_05 >= goals['goal_player_recall_05'] else 0.0
        goal_achievements['goal_player_precision_05_achieved'] = 1.0 if player_precision_05 >= goals['goal_player_precision_05'] else 0.0
        goal_achievements['goal_player_map_05_achieved'] = 1.0 if player_map_05 >= goals['goal_player_map_05'] else 0.0
        goal_achievements['goal_player_map_75_achieved'] = 1.0 if player_map_75 >= goals['goal_player_map_75'] else 0.0
        
        # Ball goals
        ball_recall_05 = eval_metrics.get('ball_recall_05', 0.0)
        ball_precision_05 = eval_metrics.get('ball_precision_05', 0.0)
        ball_map_05 = eval_metrics.get('ball_map_05', 0.0)
        ball_avg_preds = eval_metrics.get('ball_avg_predictions_per_image', 0.0)
        
        goal_achievements['goal_ball_recall_05_achieved'] = 1.0 if ball_recall_05 >= goals['goal_ball_recall_05'] else 0.0
        goal_achievements['goal_ball_precision_05_achieved'] = 1.0 if ball_precision_05 >= goals['goal_ball_precision_05'] else 0.0
        goal_achievements['goal_ball_map_05_achieved'] = 1.0 if ball_map_05 >= goals['goal_ball_map_05'] else 0.0
        goal_achievements['goal_ball_avg_predictions_achieved'] = 1.0 if ball_avg_preds >= goals['goal_ball_avg_predictions_per_image'] else 0.0
        
        # Log goal achievement metrics (with error handling and debugging)
        try:
            for goal_name, achieved in goal_achievements.items():
                if self.mlflow_debugger and self.use_mlflow:
                    context = self.mlflow_debugger.log_operation_start(
                        'log_metric',
                        metric_name=goal_name,
                        value=achieved,
                        step=epoch
                    )
                else:
                    context = None
                start_time = time.time()
                mlflow.log_metric(goal_name, achieved, step=epoch)
                if self.mlflow_debugger and self.use_mlflow:
                    self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
            
            # Log goal vs actual comparison (as percentage of goal)
            goal_progress = {
                'goal_player_recall_05_progress': (player_recall_05 / goals['goal_player_recall_05']) * 100 if goals['goal_player_recall_05'] > 0 else 0.0,
                'goal_player_precision_05_progress': (player_precision_05 / goals['goal_player_precision_05']) * 100 if goals['goal_player_precision_05'] > 0 else 0.0,
                'goal_player_map_05_progress': (player_map_05 / goals['goal_player_map_05']) * 100 if goals['goal_player_map_05'] > 0 else 0.0,
                'goal_player_map_75_progress': (player_map_75 / goals['goal_player_map_75']) * 100 if goals['goal_player_map_75'] > 0 else 0.0,
                'goal_ball_recall_05_progress': (ball_recall_05 / goals['goal_ball_recall_05']) * 100 if goals['goal_ball_recall_05'] > 0 else 0.0,
                'goal_ball_precision_05_progress': (ball_precision_05 / goals['goal_ball_precision_05']) * 100 if goals['goal_ball_precision_05'] > 0 else 0.0,
                'goal_ball_map_05_progress': (ball_map_05 / goals['goal_ball_map_05']) * 100 if goals['goal_ball_map_05'] > 0 else 0.0,
                'goal_ball_avg_predictions_progress': (ball_avg_preds / goals['goal_ball_avg_predictions_per_image']) * 100 if goals['goal_ball_avg_predictions_per_image'] > 0 else 0.0,
            }
            
            for progress_name, progress_value in goal_progress.items():
                if self.mlflow_debugger and self.use_mlflow:
                    context = self.mlflow_debugger.log_operation_start(
                        'log_metric',
                        metric_name=progress_name,
                        value=min(progress_value, 100.0),
                        step=epoch
                    )
                else:
                    context = None
                start_time = time.time()
                mlflow.log_metric(progress_name, min(progress_value, 100.0), step=epoch)  # Cap at 100%
                if self.mlflow_debugger and self.use_mlflow:
                    self.mlflow_debugger.log_operation_success('log_metric', time.time() - start_time, context)
        except Exception as mlflow_error:
            # Log detailed error information
            if self.mlflow_debugger and self.use_mlflow:
                self.mlflow_debugger.log_operation_error(
                    'log_metric',
                    mlflow_error,
                    context={'epoch': epoch, 'operation': 'goal_tracking'},
                    epoch=epoch
                )
            # Don't let MLflow errors stop goal tracking
            if MLFLOW_DEBUG:
                print(f"[MLFLOW DEBUG] Goal tracking MLflow logging failed (non-blocking): {type(mlflow_error).__name__}: {mlflow_error}")
            pass  # Silently ignore goal tracking MLflow errors
    
    def save_checkpoint(self, epoch: int, map_score: float, is_best: bool = False, 
                       is_interrupt: bool = False, lightweight: bool = False):
        """
        Save model checkpoint
        
        Args:
            epoch: Current epoch number
            map_score: Validation mAP score (0.0 if not validated)
            is_best: Whether this is the best model so far
            is_interrupt: Whether this is an interrupt/error save
            lightweight: If True, save only model weights (faster, for frequent saves)
        """
        # Force lightweight for interrupt saves or if config requires it
        use_lightweight_only = self.config['checkpoint'].get('use_lightweight_only', False)
        if is_interrupt or use_lightweight_only:
            lightweight = True
        
        checkpoint_dir = Path(self.config['checkpoint']['save_dir'])
        checkpoint_dir.mkdir(parents=True, exist_ok=True)
        
        if lightweight:
            # Lightweight checkpoint: just model weights and epoch
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'config': self.config
            }
            checkpoint_path = checkpoint_dir / f"checkpoint_epoch_{epoch}_lightweight.pth"
        else:
            # Full checkpoint: includes optimizer, scheduler, metrics
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'scheduler_state_dict': self.scheduler.state_dict(),
                'map': map_score,
                'config': self.config
            }
            checkpoint_path = checkpoint_dir / f"checkpoint_epoch_{epoch}.pth"
        
        # Save checkpoint with error handling for disk quota issues
        try:
            torch.save(checkpoint, checkpoint_path)
        except (OSError, IOError) as e:
            if "Disk quota exceeded" in str(e) or "No space left" in str(e) or "file write failed" in str(e):
                print(f"ERROR: Disk quota exceeded. Failed to save checkpoint: {checkpoint_path}")
                print(f"Attempting to save lightweight checkpoint instead...")
                # Try lightweight version if full checkpoint failed
                if not lightweight:
                    lightweight_checkpoint = {
                        'epoch': epoch,
                        'model_state_dict': self.model.state_dict(),
                        'config': self.config
                    }
                    lightweight_path = checkpoint_dir / f"checkpoint_epoch_{epoch}_lightweight.pth"
                    try:
                        torch.save(lightweight_checkpoint, lightweight_path)
                        print(f"Saved lightweight checkpoint instead: {lightweight_path}")
                        checkpoint_path = lightweight_path
                    except Exception as e2:
                        print(f"ERROR: Failed to save even lightweight checkpoint: {e2}")
                        raise
                else:
                    raise
            else:
                raise
        
        if is_interrupt:
            print(f"Saved interrupt checkpoint: {checkpoint_path}")
        elif lightweight:
            print(f"Saved lightweight checkpoint: {checkpoint_path}")
        else:
            print(f"Saved checkpoint: {checkpoint_path}")
        
        # Save best model (only for full checkpoints with validation, and if enabled)
        if is_best and not lightweight and self.config['checkpoint'].get('save_best', True):
            best_path = checkpoint_dir / "best_model.pth"
            try:
                torch.save(checkpoint, best_path)
                print(f"Saved best model with mAP: {map_score:.4f}")
            except (OSError, IOError) as e:
                if "Disk quota exceeded" in str(e) or "No space left" in str(e) or "file write failed" in str(e):
                    print(f"WARNING: Disk quota exceeded. Skipping best model save.")
                else:
                    raise
        
        # Save latest checkpoint (always overwrite)
        latest_path = checkpoint_dir / "latest_checkpoint.pth"
        try:
            torch.save(checkpoint, latest_path)
        except (OSError, IOError) as e:
            if "Disk quota exceeded" in str(e) or "No space left" in str(e) or "file write failed" in str(e):
                print(f"WARNING: Disk quota exceeded. Skipping latest checkpoint save.")
            else:
                raise
        
        # Log checkpoint to MLflow (with retry logic and error recovery)
        if self.use_mlflow:
            # Circuit breaker: skip if too many failures
            if self.mlflow_failure_count >= self.mlflow_max_failures:
                if epoch % 10 == 0:  # Only warn occasionally
                    print(f"WARNING: MLflow disabled due to {self.mlflow_failure_count} consecutive failures")
                return
            
            max_retries = 2
            retry_count = 0
            logged_successfully = False
            
            while retry_count < max_retries and not logged_successfully:
                model_was_eval = False
                try:
                    if not lightweight:
                        # Pre-operation validation for checkpoint artifact
                        if self.mlflow_debugger and self.use_mlflow:
                            # Check filesystem health
                            fs_health = self.mlflow_debugger.check_filesystem_health(str(checkpoint_path.parent))
                            if fs_health['disk_percent'] > 90:
                                print(f"[MLFLOW DEBUG] WARNING: Disk usage is {fs_health['disk_percent']:.1f}% - may cause issues")
                            
                            # Validate checkpoint file
                            if not os.path.exists(checkpoint_path):
                                raise FileNotFoundError(f"Checkpoint file does not exist: {checkpoint_path}")
                            if not os.access(checkpoint_path, os.R_OK):
                                raise PermissionError(f"Checkpoint file is not readable: {checkpoint_path}")
                            
                            checkpoint_size = os.path.getsize(checkpoint_path)
                            if checkpoint_size == 0:
                                raise ValueError(f"Checkpoint file is empty: {checkpoint_path}")
                            
                            # Log checkpoint artifact
                            context = self.mlflow_debugger.log_operation_start(
                                'log_artifact',
                                file_path=str(checkpoint_path),
                                artifact_path="checkpoints",
                                file_size=checkpoint_size
                            )
                        else:
                            context = None
                        
                        start_time = time.time()
                        mlflow.log_artifact(str(checkpoint_path), artifact_path="checkpoints")
                        
                        if self.mlflow_debugger and self.use_mlflow:
                            self.mlflow_debugger.log_operation_success('log_artifact', time.time() - start_time, context)
                            
                            # Post-operation verification
                            # Check if artifact was written (for file backend, check mlruns directory)
                            if self.mlflow_debugger.tracking_uri.startswith('file:'):
                                artifact_root = self.mlflow_debugger.tracking_uri.replace('file:', '')
                                if self.mlflow_debugger.run_id:
                                    artifact_path = os.path.join(artifact_root, self.mlflow_debugger.run_id, "artifacts", "checkpoints", checkpoint_path.name)
                                    if os.path.exists(artifact_path):
                                        written_size = os.path.getsize(artifact_path)
                                        print(f"[MLFLOW DEBUG] Verified artifact written: {artifact_path} ({self.mlflow_debugger._format_size(written_size)})")
                                    else:
                                        print(f"[MLFLOW DEBUG] WARNING: Artifact verification failed - file not found at {artifact_path}")
                        
                        if is_best and os.path.exists(best_path):
                            # Pre-operation validation for best model artifact
                            if self.mlflow_debugger and self.use_mlflow:
                                best_size = os.path.getsize(best_path)
                                context = self.mlflow_debugger.log_operation_start(
                                    'log_artifact',
                                    file_path=str(best_path),
                                    artifact_path="checkpoints",
                                    file_size=best_size
                                )
                            else:
                                context = None
                            
                            start_time = time.time()
                            mlflow.log_artifact(str(best_path), artifact_path="checkpoints")
                            
                            if self.mlflow_debugger and self.use_mlflow:
                                self.mlflow_debugger.log_operation_success('log_artifact', time.time() - start_time, context)
                    
                    # Save model in MLflow's native format every epoch (skip if model logging disabled)
                    if self.mlflow_model_logging_enabled:
                        # Set model to eval mode for inference
                        self.model.eval()
                        model_was_eval = True
                        model_log_retry = 0
                        model_logged = False
                        
                        while model_log_retry < max_retries and not model_logged:
                            temp_model_path = None
                            temp_dir = None
                            context = None
                            try:
                                # Pre-operation validation
                                if self.mlflow_debugger and self.use_mlflow:
                                    # Check backend health before model logging
                                    backend_health = self.mlflow_debugger.check_mlflow_backend_health()
                                    if not backend_health['backend_accessible']:
                                        raise RuntimeError("MLflow backend not accessible")
                                    if not backend_health['run_active']:
                                        raise RuntimeError("MLflow run is not active")
                                    
                                    # Check filesystem health
                                    fs_health = self.mlflow_debugger.check_filesystem_health()
                                    if fs_health['disk_percent'] > 90:
                                        print(f"[MLFLOW DEBUG] WARNING: Disk usage is {fs_health['disk_percent']:.1f}% - may cause issues")
                                
                                # FIX: Save model to temp file first, then log the file to avoid INTERNAL_ERROR
                                # This prevents issues with model state, device placement, or serialization
                                import tempfile
                                temp_dir = tempfile.mkdtemp(prefix=f"mlflow_model_epoch_{epoch}_")
                                temp_model_path = os.path.join(temp_dir, "model.pth")
                                
                                # Validate temp directory
                                if not os.path.exists(temp_dir) or not os.access(temp_dir, os.W_OK):
                                    raise OSError(f"Temp directory not writable: {temp_dir}")
                                
                                # Save model state dict (lighter and more reliable than full model)
                                # Use state_dict() which doesn't require moving model to CPU
                                if self.mlflow_debugger and self.use_mlflow:
                                    context = self.mlflow_debugger.log_operation_start(
                                        'save_model_temp',
                                        file_path=temp_model_path,
                                        operation="saving model state dict to temp file"
                                    )
                                
                                # Get state dict without moving model (more efficient)
                                model_state_dict = {k: v.cpu() if v.is_cuda else v for k, v in self.model.state_dict().items()}
                                
                                torch.save({
                                    'epoch': epoch,
                                    'model_state_dict': model_state_dict,
                                    'map': map_score,
                                    'is_best': is_best,
                                    'config': self.config
                                }, temp_model_path)
                                
                                # Verify temp file was created
                                if not os.path.exists(temp_model_path):
                                    raise FileNotFoundError(f"Failed to create temp model file: {temp_model_path}")
                                
                                temp_file_size = os.path.getsize(temp_model_path)
                                if temp_file_size == 0:
                                    raise ValueError(f"Temp model file is empty: {temp_model_path}")
                                
                                if self.mlflow_debugger and self.use_mlflow:
                                    self.mlflow_debugger.log_operation_success('save_model_temp', time.time() - context['start_time'], context)
                                    print(f"[MLFLOW DEBUG] Model saved to temp file: {temp_model_path} ({self.mlflow_debugger._format_size(temp_file_size)})")
                                
                                # Use different artifact paths for different epochs (sanitize for MLflow)
                                model_artifact_path = f"models_epoch_{epoch}"
                                
                                # Validate artifact path (no invalid characters)
                                invalid_chars = ['<', '>', ':', '"', '|', '?', '*']
                                if any(char in model_artifact_path for char in invalid_chars):
                                    raise ValueError(f"Invalid artifact path characters: {model_artifact_path}")
                                
                                # Pre-operation validation for artifact logging
                                if self.mlflow_debugger and self.use_mlflow:
                                    fs_health = self.mlflow_debugger.check_filesystem_health(temp_dir)
                                    if fs_health['disk_free'] < temp_file_size * 2:  # Need at least 2x file size free
                                        raise OSError(f"Insufficient disk space: {self.mlflow_debugger._format_size(fs_health['disk_free'])} free, need {self.mlflow_debugger._format_size(temp_file_size * 2)}")
                                    
                                    context = self.mlflow_debugger.log_operation_start(
                                        'log_artifact',
                                        file_path=temp_model_path,
                                        artifact_path=model_artifact_path,
                                        file_size=temp_file_size
                                    )
                                
                                # Log the saved model file instead of the live model object
                                start_time = time.time()
                                mlflow.log_artifact(temp_model_path, artifact_path=model_artifact_path)
                                
                                if self.mlflow_debugger and self.use_mlflow:
                                    self.mlflow_debugger.log_operation_success('log_artifact', time.time() - start_time, context)
                                    
                                    # Post-operation verification
                                    if self.mlflow_debugger.tracking_uri.startswith('file:'):
                                        artifact_root = self.mlflow_debugger.tracking_uri.replace('file:', '')
                                        if self.mlflow_debugger.run_id:
                                            artifact_path = os.path.join(artifact_root, self.mlflow_debugger.run_id, "artifacts", model_artifact_path, "model.pth")
                                            if os.path.exists(artifact_path):
                                                written_size = os.path.getsize(artifact_path)
                                                print(f"[MLFLOW DEBUG] Verified model artifact written: {artifact_path} ({self.mlflow_debugger._format_size(written_size)})")
                                            else:
                                                print(f"[MLFLOW DEBUG] WARNING: Model artifact verification failed - file not found at {artifact_path}")
                                
                                # Also log metadata as a separate artifact
                                metadata_path = os.path.join(temp_dir, "metadata.yaml")
                                import yaml
                                with open(metadata_path, 'w') as f:
                                    yaml.dump({
                                        "epoch": epoch,
                                        "map": map_score,
                                        "is_best": is_best,
                                        "model_config": self.config.get('model', {}),
                                        "training_config": self.config.get('training', {})
                                    }, f)
                                
                                # Log metadata artifact
                                if self.mlflow_debugger and self.use_mlflow:
                                    context = self.mlflow_debugger.log_operation_start(
                                        'log_artifact',
                                        file_path=metadata_path,
                                        artifact_path=model_artifact_path,
                                        file_size=os.path.getsize(metadata_path)
                                    )
                                start_time = time.time()
                                mlflow.log_artifact(metadata_path, artifact_path=model_artifact_path)
                                if self.mlflow_debugger and self.use_mlflow:
                                    self.mlflow_debugger.log_operation_success('log_artifact', time.time() - start_time, context)
                                
                                if is_best:
                                    # Also save best model at standard "model" path for easy access
                                    if self.mlflow_debugger and self.use_mlflow:
                                        context = self.mlflow_debugger.log_operation_start(
                                            'log_artifact',
                                            file_path=temp_model_path,
                                            artifact_path="model",
                                            file_size=temp_file_size
                                        )
                                    start_time = time.time()
                                    mlflow.log_artifact(temp_model_path, artifact_path="model")
                                    if self.mlflow_debugger and self.use_mlflow:
                                        self.mlflow_debugger.log_operation_success('log_artifact', time.time() - start_time, context)
                                    
                                    if self.mlflow_debugger and self.use_mlflow:
                                        context = self.mlflow_debugger.log_operation_start(
                                            'log_artifact',
                                            file_path=metadata_path,
                                            artifact_path="model",
                                            file_size=os.path.getsize(metadata_path)
                                        )
                                    start_time = time.time()
                                    mlflow.log_artifact(metadata_path, artifact_path="model")
                                    if self.mlflow_debugger and self.use_mlflow:
                                        self.mlflow_debugger.log_operation_success('log_artifact', time.time() - start_time, context)
                                    
                                    print(f"Saved model to MLflow - epoch {epoch} (mAP: {map_score:.4f}, BEST)")
                                else:
                                    print(f"Saved model to MLflow - epoch {epoch} (mAP: {map_score:.4f})")
                                
                                model_logged = True
                                
                                # Clean up temp files
                                import shutil
                                try:
                                    if temp_dir and os.path.exists(temp_dir):
                                        shutil.rmtree(temp_dir)
                                except Exception as cleanup_error:
                                    if MLFLOW_DEBUG:
                                        print(f"[MLFLOW DEBUG] Warning: Failed to cleanup temp dir {temp_dir}: {cleanup_error}")
                                    
                            except Exception as model_log_error:
                                model_log_retry += 1
                                
                                # Log detailed error information
                                if self.mlflow_debugger and self.use_mlflow:
                                    self.mlflow_debugger.log_operation_error(
                                        'log_artifact',
                                        model_log_error,
                                        context=context,
                                        retry_count=model_log_retry,
                                        max_retries=max_retries,
                                        epoch=epoch,
                                        file_path=temp_model_path if temp_model_path else None
                                    )
                                
                                error_type = type(model_log_error).__name__
                                error_msg = f"Failed to log model to MLflow (attempt {model_log_retry}/{max_retries}): {error_type}: {model_log_error}"
                                
                                # Clean up temp files on error
                                if temp_dir and os.path.exists(temp_dir):
                                    try:
                                        import shutil
                                        shutil.rmtree(temp_dir)
                                    except Exception as cleanup_error:
                                        if MLFLOW_DEBUG:
                                            print(f"[MLFLOW DEBUG] Warning: Failed to cleanup temp dir on error: {cleanup_error}")
                                
                                # Check if recoverable
                                recoverable = any(err in str(model_log_error) or err in error_type for err in ('RestException', 'ConnectionError', 'Timeout', 'INTERNAL_ERROR'))
                                
                                if model_log_retry < max_retries and recoverable:
                                    if MLFLOW_DEBUG:
                                        print(f"[MLFLOW DEBUG] {error_msg} - Retrying...")
                                    time.sleep(2)  # Longer delay for model logging
                                else:
                                    print(f"Warning: {error_msg}")
                                    # Don't disable model logging permanently - just skip this epoch
                                    # The temp file approach should fix INTERNAL_ERROR
                                    break
                        
                        # Restore training mode
                        if model_was_eval:
                            self.model.train()
                            model_was_eval = False
                    
                    logged_successfully = True
                    self.mlflow_failure_count = 0  # Reset failure count on success
                            
                except Exception as e:
                    retry_count += 1
                    
                    # Log detailed error information
                    if self.mlflow_debugger and self.use_mlflow:
                        self.mlflow_debugger.log_operation_error(
                            'log_checkpoint',
                            e,
                            context={'epoch': epoch, 'lightweight': lightweight},
                            retry_count=retry_count,
                            max_retries=max_retries,
                            checkpoint_path=str(checkpoint_path) if 'checkpoint_path' in locals() else None
                        )
                    
                    error_type = type(e).__name__
                    error_msg = f"Failed to log checkpoint to MLflow (attempt {retry_count}/{max_retries}): {error_type}: {e}"
                    
                    # Restore training mode if needed
                    if model_was_eval:
                        self.model.train()
                        model_was_eval = False
                    
                    # Increment failure count
                    self.mlflow_failure_count += 1
                    
                    # Check if recoverable
                    recoverable = any(err in str(e) or err in error_type for err in ('RestException', 'ConnectionError', 'Timeout', 'INTERNAL_ERROR'))
                    
                    if retry_count < max_retries and recoverable:
                        if MLFLOW_DEBUG:
                            print(f"[MLFLOW DEBUG] {error_msg} - Retrying...")
                        time.sleep(1)
                    else:
                        print(f"Warning: {error_msg} (failure count: {self.mlflow_failure_count}/{self.mlflow_max_failures})")
                        # Disable MLflow if too many failures
                        if self.mlflow_failure_count >= self.mlflow_max_failures:
                            print(f"WARNING: MLflow disabled after {self.mlflow_failure_count} consecutive failures. Training continues without MLflow.")
                            self.use_mlflow = False
                        break
    
    def train(self, start_epoch: int = 0, num_epochs: int = 50):
        """Main training loop"""
        print(f"Starting training from epoch {start_epoch} to {num_epochs}")
        
        try:
            for epoch in range(start_epoch, num_epochs):
                if self.interrupted:
                    print("\nTraining interrupted. Saving checkpoint...")
                    # Save checkpoint without validation
                    self.save_checkpoint(epoch - 1, 0.0, is_best=False, is_interrupt=True)
                    break
                
                print(f"\n{'='*50}")
                print(f"Epoch {epoch + 1}/{num_epochs}")
                print(f"{'='*50}")
                print(f"[VERBOSE] Starting epoch {epoch + 1} at {time.strftime('%Y-%m-%d %H:%M:%S')}", flush=True)
                
                # Train
                print(f"[VERBOSE] About to call train_epoch({epoch})", flush=True)
                train_loss = self.train_epoch(epoch)
                print(f"[VERBOSE] train_epoch({epoch}) completed, loss: {train_loss:.4f}", flush=True)
                print(f"Training Loss: {train_loss:.4f}")
                
                # Save checkpoint every epoch (for safety, even if not validating)
                # This ensures we don't lose progress if training stops early
                save_every_epoch = self.config['checkpoint'].get('save_every_epoch', True)
                if save_every_epoch:
                    # Save lightweight checkpoint every epoch
                    self.save_checkpoint(epoch, 0.0, is_best=False, is_interrupt=False, lightweight=True)
                    
                    # Keep only last N lightweight checkpoints to save space (keep last 20)
                    # This balances frequent saves with disk space
                    keep_last_n = self.config['checkpoint'].get('keep_last_lightweight', 20)
                    if epoch >= keep_last_n:
                        import os
                        checkpoint_dir = Path(self.config['checkpoint']['save_dir'])
                        old_lightweight = checkpoint_dir / f"checkpoint_epoch_{epoch - keep_last_n}_lightweight.pth"
                        if old_lightweight.exists():
                            try:
                                old_lightweight.unlink()
                            except:
                                pass  # Ignore deletion errors
                
                # Validate less frequently for speed (every 10 epochs instead of 5)
                validate_frequency = 10
                if (epoch + 1) % validate_frequency == 0 or epoch == num_epochs - 1:
                    map_score = self.validate(epoch)
                    
                    # Save full checkpoint with validation metrics
                    is_best = map_score > self.best_map
                    if is_best:
                        self.best_map = map_score
                    
                    # Only save full checkpoint if not using lightweight-only mode
                    use_lightweight_only = self.config['checkpoint'].get('use_lightweight_only', False)
                    if not use_lightweight_only and (epoch + 1) % self.config['checkpoint']['save_frequency'] == 0:
                        self.save_checkpoint(epoch, map_score, is_best, is_interrupt=False, lightweight=False)
                
                # Test on real validation set after each epoch (if path provided)
                if self.real_val_path and Path(self.real_val_path).exists():
                    try:
                        self.test_real_validation(epoch)
                    except Exception as e:
                        print(f"Warning: Real validation test failed: {e}")
                        import traceback
                        traceback.print_exc()
                
                # Print adaptive optimization stats at end of epoch
                if self.adaptive_optimizer:
                    stats = self.adaptive_optimizer.get_statistics()
                    print(f"\n📊 Adaptive Optimization Stats:")
                    print(f"   Adjustments made: {stats['adjustment_count']}")
                    print(f"   Current workers: {stats['current_workers']}, prefetch: {stats['current_prefetch']}")
                    print(f"   Avg GPU util: {stats['avg_gpu_utilization']:.1%}, Avg RAM: {stats['avg_ram_usage']:.1%}")
                
                print(f"{'='*50}\n")
        
        except KeyboardInterrupt:
            print("\n\nTraining interrupted by user. Saving checkpoint...")
            self.save_checkpoint(epoch, 0.0, is_best=False, is_interrupt=True)
            raise
        
        except Exception as e:
            print(f"\n\nTraining error: {e}")
            print("Saving checkpoint before exit...")
            self.save_checkpoint(epoch, 0.0, is_best=False, is_interrupt=True)
            raise