# Provenance (Hugging Face upload metadata, kept as a comment so the file parses):
# Johnnyman1100's picture / Upload 38 files / 4265aea verified
"""System resource detection and management for adaptive processing."""
import os
import psutil
import torch
import logging
from typing import Optional, Dict, Any
class SystemResources:
    """Detect and manage system resources for adaptive processing.

    This class provides a unified interface to system resource detection,
    handling CPU, RAM, and GPU capabilities. It calculates appropriate
    thresholds and settings based on the detected hardware configuration.
    It implements extreme memory conservation strategies to prevent OOM crashes
    even on large datasets or limited hardware.
    """

    def __init__(self):
        """Probe CPU, RAM, and GPU, then derive adaptive thresholds and log them."""
        # --- CPU detection ---
        # os.cpu_count() may return None in exotic environments; fall back to 1.
        self.cpu_cores = os.cpu_count() or 1
        self.cpu_threads = self.cpu_cores
        # Physical (non-hyperthreaded) core count; psutil may return None or raise
        # on some platforms, so fall back to the logical count.
        try:
            self.cpu_physical_cores = psutil.cpu_count(logical=False) or self.cpu_cores
        except Exception:  # was a bare except: never swallow KeyboardInterrupt/SystemExit
            self.cpu_physical_cores = self.cpu_cores

        # --- RAM detection ---
        # Single snapshot so total/available are mutually consistent
        # (the original called virtual_memory() twice).
        vm = psutil.virtual_memory()
        self.total_ram_gb = vm.total / (1024 ** 3)
        self.available_ram_gb = vm.available / (1024 ** 3)

        # --- GPU detection ---
        self.has_cuda = torch.cuda.is_available()
        self.cuda_device = None      # device 0 name, when CUDA is usable
        self.cuda_mem_gb = 0         # device 0 total memory in GiB (0 if no CUDA)
        if self.has_cuda:
            try:
                torch.cuda.empty_cache()
                self.cuda_device = torch.cuda.get_device_name(0)
                self.cuda_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
            except Exception as e:
                # Driver/runtime mismatch can make is_available() lie; degrade to CPU-only.
                logging.warning(f"Error detecting CUDA properties: {e}")
                self.has_cuda = False

        # Calculate resource-based thresholds
        self._calculate_thresholds()
        # Log detected resources
        self._log_resources()

    def _calculate_thresholds(self):
        """Calculate adaptive thresholds based on detected system resources.

        Reads ``total_ram_gb``, ``cpu_cores`` and ``cpu_physical_cores`` and sets:
        ``emergency_reserve_gb``, ``ram_usage_warning``, ``ram_usage_critical``,
        ``max_files_multiplier``, ``use_disk_offload``, ``max_text_chunk_size``,
        ``max_workers``, ``batch_size`` and ``training_chunk_size``.
        """
        # Memory thresholds - scaled to available RAM with extreme caution.
        # For all systems, use much more conservative thresholds after OOM testing.
        # Absolute RAM held back for emergency protection: at least 2GB or 20%.
        self.emergency_reserve_gb = max(2.0, self.total_ram_gb * 0.2)

        if self.total_ram_gb < 8:  # Low RAM (<8GB)
            self.ram_usage_warning = self.total_ram_gb * 0.45   # 45% of RAM
            self.ram_usage_critical = self.total_ram_gb * 0.60  # 60% of RAM
            self.max_files_multiplier = 0.03                    # Extremely conservative
            self.use_disk_offload = True                        # Always use disk offloading
        elif self.total_ram_gb < 16:  # Medium RAM (8-16GB)
            self.ram_usage_warning = self.total_ram_gb * 0.55   # 55% of RAM
            self.ram_usage_critical = self.total_ram_gb * 0.70  # 70% of RAM
            self.max_files_multiplier = 0.05
            self.use_disk_offload = True                        # Always use disk offloading
        else:  # High RAM (>16GB)
            self.ram_usage_warning = self.total_ram_gb * 0.60   # 60% of RAM (down from 75%)
            self.ram_usage_critical = self.total_ram_gb * 0.75  # 75% of RAM (down from 90%)
            self.max_files_multiplier = 0.1                     # Halved from previous 0.2
            self.use_disk_offload = True                        # Offload even on high-RAM systems

        # Maximum text chunk size in memory (characters).
        # This helps prevent individual large chunks from causing OOM.
        self.max_text_chunk_size = min(10_000_000, int(self.total_ram_gb * 1_000_000))

        # CPU-based settings.
        # For worker count, use physical cores, capped at 4 workers.
        self.max_workers = max(1, min(self.cpu_physical_cores, 4))

        # Batch size based on available cores.
        if self.cpu_cores <= 2:
            self.batch_size = 2
        elif self.cpu_cores <= 4:
            self.batch_size = 4
        else:
            self.batch_size = min(5, self.cpu_cores // 2)

        # Training chunk size - how many texts to process in one training iteration.
        if self.total_ram_gb < 8:
            self.training_chunk_size = 3
        elif self.total_ram_gb < 16:
            self.training_chunk_size = 5
        else:
            self.training_chunk_size = 10

    def _log_resources(self):
        """Log detected system resources and calculated thresholds."""
        # Lazy %-style args: formatting only happens if the record is emitted.
        logging.info("===== System Resources =====")
        logging.info("CPU: %d cores (%d physical)", self.cpu_cores, self.cpu_physical_cores)
        logging.info("RAM: %.1f GB total, %.1f GB available",
                     self.total_ram_gb, self.available_ram_gb)
        if self.has_cuda:
            logging.info("GPU: %s with %.1f GB memory", self.cuda_device, self.cuda_mem_gb)
        else:
            logging.info("GPU: Not available")
        logging.info("===== Adaptive Settings =====")
        logging.info("RAM Warning Threshold: %.1f GB", self.ram_usage_warning)
        logging.info("RAM Critical Threshold: %.1f GB", self.ram_usage_critical)
        logging.info("Max Workers: %d", self.max_workers)
        logging.info("Batch Size: %d", self.batch_size)
        logging.info("Training Chunk Size: %d", self.training_chunk_size)
        logging.info("Max Files Multiplier: %.2f", self.max_files_multiplier)