# Provenance (Hugging Face upload metadata, kept as a comment so the file parses):
# Johnnyman1100's picture / Upload 38 files / 4265aea verified
"""System resource detection and management for adaptive processing."""
import os
import psutil
import torch
import logging
from typing import Optional, Dict, Any
class SystemResources:
    """Detect and manage system resources for adaptive processing.

    This class provides a unified interface to system resource detection,
    handling CPU, RAM, and GPU capabilities. It calculates appropriate
    thresholds and settings based on the detected hardware configuration.
    It implements extreme memory conservation strategies to prevent OOM crashes
    even on large datasets or limited hardware.
    """

    def __init__(self):
        """Probe CPU, RAM, and GPU, then derive adaptive thresholds and log them."""
        # --- CPU detection ---
        # os.cpu_count() may return None in exotic environments; fall back to 1.
        self.cpu_cores = os.cpu_count() or 1
        self.cpu_threads = self.cpu_cores
        # Physical (non-hyperthreaded) core count; psutil may return None or raise
        # on some platforms, so fall back to the logical count.
        try:
            self.cpu_physical_cores = psutil.cpu_count(logical=False) or self.cpu_cores
        except Exception:  # was a bare except: never swallow KeyboardInterrupt/SystemExit
            self.cpu_physical_cores = self.cpu_cores

        # --- RAM detection ---
        # Single snapshot so total/available are mutually consistent
        # (the original called virtual_memory() twice).
        vm = psutil.virtual_memory()
        self.total_ram_gb = vm.total / (1024 ** 3)
        self.available_ram_gb = vm.available / (1024 ** 3)

        # --- GPU detection ---
        self.has_cuda = torch.cuda.is_available()
        self.cuda_device = None      # device 0 name, when CUDA is usable
        self.cuda_mem_gb = 0         # device 0 total memory in GiB (0 if no CUDA)
        if self.has_cuda:
            try:
                torch.cuda.empty_cache()
                self.cuda_device = torch.cuda.get_device_name(0)
                self.cuda_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
            except Exception as e:
                # Driver/runtime mismatch can make is_available() lie; degrade to CPU-only.
                logging.warning(f"Error detecting CUDA properties: {e}")
                self.has_cuda = False

        # Calculate resource-based thresholds
        self._calculate_thresholds()
        # Log detected resources
        self._log_resources()

    def _calculate_thresholds(self):
        """Calculate adaptive thresholds based on detected system resources.

        Reads ``total_ram_gb``, ``cpu_cores`` and ``cpu_physical_cores`` and sets:
        ``emergency_reserve_gb``, ``ram_usage_warning``, ``ram_usage_critical``,
        ``max_files_multiplier``, ``use_disk_offload``, ``max_text_chunk_size``,
        ``max_workers``, ``batch_size`` and ``training_chunk_size``.
        """
        # Memory thresholds - scaled to available RAM with extreme caution.
        # For all systems, use much more conservative thresholds after OOM testing.
        # Absolute RAM held back for emergency protection: at least 2GB or 20%.
        self.emergency_reserve_gb = max(2.0, self.total_ram_gb * 0.2)

        if self.total_ram_gb < 8:  # Low RAM (<8GB)
            self.ram_usage_warning = self.total_ram_gb * 0.45   # 45% of RAM
            self.ram_usage_critical = self.total_ram_gb * 0.60  # 60% of RAM
            self.max_files_multiplier = 0.03                    # Extremely conservative
            self.use_disk_offload = True                        # Always use disk offloading
        elif self.total_ram_gb < 16:  # Medium RAM (8-16GB)
            self.ram_usage_warning = self.total_ram_gb * 0.55   # 55% of RAM
            self.ram_usage_critical = self.total_ram_gb * 0.70  # 70% of RAM
            self.max_files_multiplier = 0.05
            self.use_disk_offload = True                        # Always use disk offloading
        else:  # High RAM (>16GB)
            self.ram_usage_warning = self.total_ram_gb * 0.60   # 60% of RAM (down from 75%)
            self.ram_usage_critical = self.total_ram_gb * 0.75  # 75% of RAM (down from 90%)
            self.max_files_multiplier = 0.1                     # Halved from previous 0.2
            self.use_disk_offload = True                        # Offload even on high-RAM systems

        # Maximum text chunk size in memory (characters).
        # This helps prevent individual large chunks from causing OOM.
        self.max_text_chunk_size = min(10_000_000, int(self.total_ram_gb * 1_000_000))

        # CPU-based settings.
        # For worker count, use physical cores, capped at 4 workers.
        self.max_workers = max(1, min(self.cpu_physical_cores, 4))

        # Batch size based on available cores.
        if self.cpu_cores <= 2:
            self.batch_size = 2
        elif self.cpu_cores <= 4:
            self.batch_size = 4
        else:
            self.batch_size = min(5, self.cpu_cores // 2)

        # Training chunk size - how many texts to process in one training iteration.
        if self.total_ram_gb < 8:
            self.training_chunk_size = 3
        elif self.total_ram_gb < 16:
            self.training_chunk_size = 5
        else:
            self.training_chunk_size = 10

    def _log_resources(self):
        """Log detected system resources and calculated thresholds."""
        # Lazy %-style args: formatting only happens if the record is emitted.
        logging.info("===== System Resources =====")
        logging.info("CPU: %d cores (%d physical)", self.cpu_cores, self.cpu_physical_cores)
        logging.info("RAM: %.1f GB total, %.1f GB available",
                     self.total_ram_gb, self.available_ram_gb)
        if self.has_cuda:
            logging.info("GPU: %s with %.1f GB memory", self.cuda_device, self.cuda_mem_gb)
        else:
            logging.info("GPU: Not available")
        logging.info("===== Adaptive Settings =====")
        logging.info("RAM Warning Threshold: %.1f GB", self.ram_usage_warning)
        logging.info("RAM Critical Threshold: %.1f GB", self.ram_usage_critical)
        logging.info("Max Workers: %d", self.max_workers)
        logging.info("Batch Size: %d", self.batch_size)
        logging.info("Training Chunk Size: %d", self.training_chunk_size)
        logging.info("Max Files Multiplier: %.2f", self.max_files_multiplier)