import json
import logging
import os
from typing import Any, Dict, Optional, Set, Type

import torch
import torch.nn as nn
import torch.optim as optim

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


|
class SelectiveFineTuningOptimizer:
    """
    A custom optimizer wrapper that selectively fine-tunes a PyTorch model
    based on the condition numbers of its parameter tensors. Parameters with
    lower condition numbers are prioritized for fine-tuning; all remaining
    parameters are frozen.
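
    Example (an illustrative sketch; the model and the Adam hyperparameters
    are placeholders, not prescribed values)::

        optimizer = SelectiveFineTuningOptimizer(
            model,
            base_optimizer_cls=torch.optim.Adam,
            optimizer_args={'lr': 1e-4},
            num_tensors_to_finetune=10,
        )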
    """

    def __init__(self, model: nn.Module, base_optimizer_cls: Type[optim.Optimizer],
                 optimizer_args: Dict[str, Any],
                 condition_file: str = 'condition_numbers.json',
                 num_tensors_to_finetune: int = 100,
                 recompute: bool = False,
                 max_dim_size_to_analyze: Optional[int] = None):
        """
        Initializes the SelectiveFineTuningOptimizer.

        Args:
            model (nn.Module): The PyTorch model to be fine-tuned.
            base_optimizer_cls (Type[optim.Optimizer]): The class of the base optimizer (e.g., torch.optim.Adam).
            optimizer_args (Dict[str, Any]): A dictionary of arguments to pass to the base optimizer constructor.
            condition_file (str): Path to the JSON file for storing/loading condition numbers.
            num_tensors_to_finetune (int): The number of tensors with the lowest condition numbers to fine-tune.
            recompute (bool): If True, recompute condition numbers even if the file exists.
            max_dim_size_to_analyze (Optional[int]): If provided, any parameter tensor with at least one
                dimension larger than this value is skipped during analysis. Useful for ignoring very
                large embedding matrices.

        Raises:
            ValueError: If no parameters end up selected for fine-tuning.
        """
        self.model = model
        self.condition_file = condition_file
        self.num_tensors_to_finetune = num_tensors_to_finetune
        self.recompute = recompute
        self.max_dim_size_to_analyze = max_dim_size_to_analyze

        self.condition_numbers: Dict[str, float] = {}

        if not os.path.exists(condition_file) or recompute:
            self.condition_numbers = self._analyze_model()
            self._save_condition_numbers()
        else:
            self.condition_numbers = self._load_condition_numbers()

        self.trainable_param_names: Set[str] = self._select_trainable_parameters()
        self._unfreeze_selected_parameters()

        # Hand only the selected (unfrozen) parameters to the base optimizer.
        params_to_optimize = [p for n, p in model.named_parameters() if n in self.trainable_param_names]
        if not params_to_optimize:
            # torch.optim optimizers raise on an empty parameter list, so fail with a clear message.
            raise ValueError("No parameters selected for fine-tuning based on the criteria; "
                             "cannot construct an optimizer with no parameters.")
        self.optimizer = base_optimizer_cls(params_to_optimize, **optimizer_args)
        logger.info(f"Optimizer initialized with {len(params_to_optimize)} trainable parameters.")

    def _analyze_model(self) -> Dict[str, float]:
        """
        Analyzes the singular values of model parameters to compute their condition numbers.
        Parameters with fewer than 2 dimensions, or with any dimension larger than
        `max_dim_size_to_analyze`, are ignored.
        SVD is performed on the GPU if the tensor is on CUDA, otherwise on CPU.

        Returns:
            Dict[str, float]: A dictionary mapping parameter names to their condition numbers.
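
        Note: the condition number is the ratio of the largest to the smallest
        singular value. An equivalent standalone computation for a 2-D weight
        tensor `w` (a sketch; `torch.linalg.svdvals` returns the singular
        values in descending order) is::

            s = torch.linalg.svdvals(w)
            cond = (s[0] / s[-1]).item()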
        """
        condition_numbers = {}
        logger.info("Analyzing the model tensors...")

        # Temporarily freeze every parameter during analysis and remember the
        # original flags; they are restored in the `finally` block below.
        initial_requires_grad_state = {}
        for name, param in self.model.named_parameters():
            initial_requires_grad_state[name] = param.requires_grad
            param.requires_grad = False

        analyzed_count = 0
        skipped_ndim_count = 0
        skipped_dim_size_count = 0
        skipped_svd_error_count = 0
        total_params_in_model = 0

        try:
            for name, param in self.model.named_parameters():
                total_params_in_model += 1

                if param.ndim < 2:
                    logger.debug(f"Skipping {name} due to fewer than 2 dimensions (ndim={param.ndim}).")
                    skipped_ndim_count += 1
                    continue

                if self.max_dim_size_to_analyze is not None:
                    if any(dim_size > self.max_dim_size_to_analyze for dim_size in param.shape):
                        logger.debug(f"Skipping {name} due to a dimension larger than {self.max_dim_size_to_analyze} (shape={param.shape}).")
                        skipped_dim_size_count += 1
                        continue

                try:
                    data = param.detach()
                    if data.ndim > 2:
                        # Flatten trailing dimensions (e.g. conv kernels of shape
                        # (out, in, kh, kw) become (out, in*kh*kw)); otherwise the
                        # batched SVD would return one spectrum per 2-D slice and
                        # the indexing below would be wrong.
                        data = data.flatten(1)
                    # torch.linalg.svd runs on whichever device the tensor lives
                    # on: GPU for CUDA tensors, CPU otherwise.
                    _, s, _ = torch.linalg.svd(data, full_matrices=False)

                    # Singular values come back in descending order, so the
                    # condition number is the first divided by the last.
                    cond_number = (s[0] / s[-1]).item() if s[-1] > 0 else float('inf')
                    condition_numbers[name] = cond_number
                    analyzed_count += 1
                    logger.debug(f"Analyzed {name}: condition_number={cond_number:.4f}")
                except torch.linalg.LinAlgError as e:
                    logger.warning(f"Skipping {name} due to SVD Linear Algebra error: {e}")
                    skipped_svd_error_count += 1
                except Exception as e:
                    logger.error(f"Skipping {name} due to unexpected error during SVD: {e}")
                    skipped_svd_error_count += 1
        finally:
            # Restore the original requires_grad flags.
            for name, param in self.model.named_parameters():
                param.requires_grad = initial_requires_grad_state[name]

        logger.info(f"Done analyzing model tensors. Total parameters in model: {total_params_in_model}")
        logger.info(f"Parameters analyzed for condition numbers: {analyzed_count}")
        logger.info(f"Skipped due to ndim < 2: {skipped_ndim_count}")
        logger.info(f"Skipped due to dimension size > {self.max_dim_size_to_analyze}: {skipped_dim_size_count}")
        logger.info(f"Skipped due to SVD errors: {skipped_svd_error_count}")
        return condition_numbers

    def _save_condition_numbers(self):
        """
        Saves the computed condition numbers to a JSON file.
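
        The file maps parameter names to floats, e.g. (names are illustrative)::

            {
              "layer1.weight": 12.3456,
              "layer2.weight": 98.7654
            }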
        """
        try:
            with open(self.condition_file, 'w') as f:
                json.dump(self.condition_numbers, f, indent=2)
            logger.info(f"Condition numbers saved to {self.condition_file}")
        except IOError as e:
            logger.error(f"Failed to save condition numbers to {self.condition_file}: {e}")

|
    def _load_condition_numbers(self) -> Dict[str, float]:
        """
        Loads condition numbers from a JSON file. If the file is corrupted
        or unreadable, the condition numbers are recomputed and re-saved.

        Returns:
            Dict[str, float]: The loaded (or recomputed) condition numbers.
        """
        try:
            with open(self.condition_file, 'r') as f:
                data = json.load(f)
            logger.info(f"Condition numbers loaded from {self.condition_file}")
            return data
        except json.JSONDecodeError as e:
            logger.warning(f"Condition file '{self.condition_file}' is corrupted or invalid. Error: {e}. Recomputing.")
            if os.path.exists(self.condition_file):
                try:
                    os.remove(self.condition_file)
                    logger.info(f"Removed corrupted condition file: {self.condition_file}")
                except OSError as err:
                    logger.error(f"Error removing corrupted file {self.condition_file}: {err}")
        except IOError as e:
            logger.error(f"Failed to load condition numbers from {self.condition_file}: {e}. Recomputing.")
        # Recompute and persist so the next run does not hit the same failure.
        self.condition_numbers = self._analyze_model()
        self._save_condition_numbers()
        return self.condition_numbers

|
    def _select_trainable_parameters(self) -> Set[str]:
        """
        Selects the `num_tensors_to_finetune` parameters with the lowest
        condition numbers (a lower condition number means the tensor is
        better conditioned and is prioritized for fine-tuning).

        Returns:
            Set[str]: A set of names of the parameters chosen for fine-tuning.
        """
        if not self.condition_numbers:
            logger.warning("No condition numbers available to select trainable parameters.")
            return set()

        # Sort ascending by condition number and keep the first N names.
        sorted_params = sorted(self.condition_numbers.items(), key=lambda x: x[1])
        selected = [name for name, _ in sorted_params[:self.num_tensors_to_finetune]]
        logger.info(f"Selected {len(selected)} parameters for fine-tuning out of {len(self.condition_numbers)} analyzed.")
        logger.debug(f"Selected parameters: {selected}")
        return set(selected)

    def _unfreeze_selected_parameters(self):
        """
        Sets `requires_grad=True` for the selected trainable parameters
        and `requires_grad=False` for all other parameters in the model.
        """
        total_params = 0
        frozen_params_count = 0
        unfrozen_params_count = 0

        # Count final states (rather than only flipped flags) so the summary
        # log below is accurate even when a flag was already set correctly.
        for name, param in self.model.named_parameters():
            total_params += 1
            if name in self.trainable_param_names:
                param.requires_grad = True
                unfrozen_params_count += 1
                logger.debug(f"Parameter '{name}' set to requires_grad=True.")
            else:
                param.requires_grad = False
                frozen_params_count += 1
                logger.debug(f"Parameter '{name}' set to requires_grad=False.")

        logger.info(f"Model parameters configured: {unfrozen_params_count} unfrozen, {frozen_params_count} frozen (out of {total_params} total).")

|
    def step(self):
        """
        Performs a single optimization step (parameter update).
        Delegates to the base optimizer's step method.
        """
        self.optimizer.step()

    def zero_grad(self):
        """
        Clears the gradients of all optimized parameters.
        Delegates to the base optimizer's zero_grad method.
        """
        self.optimizer.zero_grad()

    def state_dict(self) -> Dict[str, Any]:
        """
        Returns a serializable dictionary containing the current state of the optimizer.
        Delegates to the base optimizer's state_dict method.
        """
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """
        Loads the optimizer's state from a state_dict.
        Delegates to the base optimizer's load_state_dict method.

        Args:
            state_dict (Dict[str, Any]): A dictionary containing the optimizer's state.
        """
        self.optimizer.load_state_dict(state_dict)
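

# ---------------------------------------------------------------------------
# Minimal usage sketch. Everything below is illustrative: the toy model, the
# synthetic data, and the hyperparameters are assumptions for demonstration,
# not part of the wrapper's API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # A small toy model with several 2-D weight matrices to analyze.
    model = nn.Sequential(
        nn.Linear(32, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 10),
    )

    optimizer = SelectiveFineTuningOptimizer(
        model,
        base_optimizer_cls=optim.Adam,
        optimizer_args={'lr': 1e-4},
        num_tensors_to_finetune=2,   # fine-tune only the 2 best-conditioned tensors
        recompute=True,              # ignore any cached condition_numbers.json
    )

    # One synthetic training step: only the selected parameters are updated.
    criterion = nn.CrossEntropyLoss()
    inputs = torch.randn(8, 32)
    targets = torch.randint(0, 10, (8,))

    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    optimizer.step()
    logger.info(f"Demo step complete, loss={loss.item():.4f}")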