"""System resource detection and management for adaptive processing."""

import logging
import os
from typing import Any, Dict, Optional

import psutil
import torch


class SystemResources:
    """Detect and manage system resources for adaptive processing.

    This class provides a unified interface to system resource detection,
    handling CPU, RAM, and GPU capabilities. It calculates appropriate
    thresholds and settings based on the detected hardware configuration.

    It implements extreme memory conservation strategies to prevent OOM
    crashes even on large datasets or limited hardware.
    """

    def __init__(self) -> None:
        """Detect CPU/RAM/GPU resources, derive adaptive thresholds, and log them."""
        # CPU detection
        self.cpu_cores: int = os.cpu_count() or 1
        self.cpu_threads: int = self.cpu_cores

        # Try to get physical cores vs logical cores.
        # psutil.cpu_count(logical=False) may return None on some platforms,
        # hence the `or` fallback; narrow except (was a bare `except:`, which
        # would also swallow KeyboardInterrupt/SystemExit).
        try:
            self.cpu_physical_cores: int = (
                psutil.cpu_count(logical=False) or self.cpu_cores
            )
        except Exception:
            self.cpu_physical_cores = self.cpu_cores

        # RAM detection — take a single snapshot so that total and available
        # are mutually consistent (two separate calls could disagree).
        vm = psutil.virtual_memory()
        self.total_ram_gb: float = vm.total / (1024 ** 3)
        self.available_ram_gb: float = vm.available / (1024 ** 3)

        # GPU detection
        self.has_cuda: bool = torch.cuda.is_available()
        self.cuda_device: Optional[str] = None
        self.cuda_mem_gb: float = 0
        if self.has_cuda:
            try:
                torch.cuda.empty_cache()
                self.cuda_device = torch.cuda.get_device_name(0)
                self.cuda_mem_gb = (
                    torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
                )
            except Exception as e:
                # A device that reports CUDA available but fails property
                # queries is treated as unusable.
                logging.warning(f"Error detecting CUDA properties: {e}")
                self.has_cuda = False

        # Calculate resource-based thresholds
        self._calculate_thresholds()

        # Log detected resources
        self._log_resources()

    def _calculate_thresholds(self) -> None:
        """Calculate adaptive thresholds based on detected system resources.

        Sets RAM warning/critical watermarks, worker/batch sizing, chunk
        limits, and disk-offload policy from ``total_ram_gb`` and the CPU
        core counts. Deliberately very conservative after prior OOM testing.
        """
        # Memory thresholds - scaled to available RAM with extreme caution
        # For all systems, use much more conservative thresholds after OOM testing

        # Calculate absolute available RAM for emergency protection:
        # at least 2GB or 20% of total RAM, whichever is larger.
        self.emergency_reserve_gb: float = max(2.0, self.total_ram_gb * 0.2)

        if self.total_ram_gb < 8:  # Low RAM (<8GB)
            self.ram_usage_warning: float = self.total_ram_gb * 0.45  # 45% of RAM
            self.ram_usage_critical: float = self.total_ram_gb * 0.60  # 60% of RAM
            self.max_files_multiplier: float = 0.03  # Extremely conservative
            self.use_disk_offload: bool = True  # Always use disk offloading
        elif self.total_ram_gb < 16:  # Medium RAM (8-16GB)
            self.ram_usage_warning = self.total_ram_gb * 0.55  # 55% of RAM
            self.ram_usage_critical = self.total_ram_gb * 0.70  # 70% of RAM
            self.max_files_multiplier = 0.05
            self.use_disk_offload = True  # Always use disk offloading
        else:  # High RAM (>16GB)
            self.ram_usage_warning = self.total_ram_gb * 0.60  # 60% of RAM (down from 75%)
            self.ram_usage_critical = self.total_ram_gb * 0.75  # 75% of RAM (down from 90%)
            self.max_files_multiplier = 0.1  # Halved from previous 0.2
            self.use_disk_offload = True  # Use disk offloading even on high-RAM systems

        # Maximum text chunk size in memory (characters)
        # This helps prevent individual large chunks from causing OOM
        self.max_text_chunk_size: int = min(
            10_000_000, int(self.total_ram_gb * 1_000_000)
        )

        # CPU-based settings
        # For worker count, use physical cores (or half of logical cores if
        # physical detection failed), capped at 4.
        self.max_workers: int = max(1, min(self.cpu_physical_cores, 4))

        # Batch size based on available cores
        if self.cpu_cores <= 2:
            self.batch_size: int = 2
        elif self.cpu_cores <= 4:
            self.batch_size = 4
        else:
            self.batch_size = min(5, self.cpu_cores // 2)

        # Training chunk size - how many texts to process in one training iteration
        if self.total_ram_gb < 8:
            self.training_chunk_size: int = 3
        elif self.total_ram_gb < 16:
            self.training_chunk_size = 5
        else:
            self.training_chunk_size = 10

    def _log_resources(self) -> None:
        """Log detected system resources and calculated thresholds."""
        logging.info("===== System Resources =====")
        logging.info(f"CPU: {self.cpu_cores} cores ({self.cpu_physical_cores} physical)")
        logging.info(
            f"RAM: {self.total_ram_gb:.1f} GB total, {self.available_ram_gb:.1f} GB available"
        )

        if self.has_cuda:
            logging.info(f"GPU: {self.cuda_device} with {self.cuda_mem_gb:.1f} GB memory")
        else:
            logging.info("GPU: Not available")

        logging.info("===== Adaptive Settings =====")
        logging.info(f"RAM Warning Threshold: {self.ram_usage_warning:.1f} GB")
        logging.info(f"RAM Critical Threshold: {self.ram_usage_critical:.1f} GB")
        logging.info(f"Max Workers: {self.max_workers}")
        logging.info(f"Batch Size: {self.batch_size}")
        logging.info(f"Training Chunk Size: {self.training_chunk_size}")
        logging.info(f"Max Files Multiplier: {self.max_files_multiplier:.2f}")