| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
import math
from typing import Dict, List, Tuple, Union

from lightning.pytorch.callbacks.callback import Callback

from nemo.lightning.io.mixin import IOMixin

ScheduleValue = Union[int, List[int], Tuple[int, int]]
|
|
|
|
| def _resolve_attr(root, path: str): |
| """ |
| Traverse dotted attribute path (“encoder.layer1”) from root. |
| """ |
| m = root |
| for part in path.split('.'): |
| m = getattr(m, part) |
| return m |
|
|
|
|
def make_start_end(name: str, spec: Union[int, List[int]]) -> Tuple[int, float]:
    """Translate a freeze *spec* into an inclusive (start, end) step window.

    Mapping:
        spec = -1            -> (0, math.inf)   frozen for the entire run
        spec = N (int > 0)   -> (0, N - 1)      frozen for the first N steps
        spec = [start, end]  -> (start, end)    start clamped to >= 0;
                                                a negative end means math.inf
                                                ("until end of training")

    Args:
        name (str): layer name; used only in the error message.
        spec (Union[int, List[int]]): schedule specification (int or a
            2-element list/tuple).

    Raises:
        ValueError: if spec is neither an int nor a 2-element list/tuple.

    Returns:
        Tuple[int, float]: inclusive (start, end); end may be math.inf.
    """

    if isinstance(spec, int):
        if spec == -1:
            # Sentinel for "frozen for the whole run".
            start, end = 0, math.inf
        else:
            # N means frozen for the first N steps, i.e. steps 0..N-1 inclusive.
            start, end = 0, spec - 1
    elif isinstance(spec, (list, tuple)) and len(spec) == 2:
        start, end = spec
        start = max(start, 0)  # clamp negative starts to step 0
        if end < 0:
            end = math.inf  # negative end -> frozen until training ends
    else:
        raise ValueError(f"Invalid schedule for '{name}': {spec}")
    return start, end
|
|
|
|
class LayerFreezer(Callback, IOMixin):
    """
    Freezes sub-modules of a LightningModule based on the list provided. The list of layers should
    be the full FQN.

    Instantiate
    -----------
    # to keep layers frozen for all training
    callback = LayerFreezer(['layer1', 'layer2',])
    # for some steps
    callback = LayerFreezer({'layer1': 10, 'layer2': (10, 100)})

    trainer = pl.Trainer(callbacks=[callback], ...)
    """

    def __init__(self, schedule: Union[List[str], Dict[str, ScheduleValue]]):
        """
        Args
        ----
        schedule: Union[list, dict]
            - dict
                key = attribute path of sub-module inside LightningModule
                value = one of
                    : -1 -> frozen for entire run
                    : N (int>0) -> frozen for first N steps (0..N-1)
                    : [start, end] -> frozen if start <= step <= end
                      use -1 for "until end of training"
            - list:
                key = attribute path of sub-module inside LightningModule
                value = -1 (hardcoded; use a dict if you want to specify manually).
        """
        super().__init__()
        assert isinstance(schedule, (list, dict)), type(schedule)
        if isinstance(schedule, list):
            # A bare list means "frozen for the entire run" for every entry.
            schedule = {item: -1 for item in schedule}

        # name -> inclusive (start, end) freeze window; end may be math.inf.
        self.schedule: Dict[str, Tuple[int, float]] = {}
        # name -> last state applied to the module (avoids redundant toggles).
        self.frozen_state: Dict[str, bool] = {}

        for name, spec in schedule.items():
            self.schedule[name] = make_start_end(name, spec)

    @staticmethod
    def _resolve_attr(root, path: str):
        """
        Traverse dotted attribute path ("encoder.layer1") from root.
        """
        m = root
        for part in path.split('.'):
            m = getattr(m, part)
        return m

    def _apply_freeze(self, module, freeze: bool):
        """
        Enable/disable gradients + switch (eval/train) mode.

        Args:
            module: sub-module whose parameters are toggled.
            freeze (bool): True to freeze (no grads, eval mode),
                False to unfreeze (grads on, train mode).
        """
        for p in module.parameters():
            p.requires_grad = not freeze
        # Explicit branch: the mode switch is a side effect, not a value.
        if freeze:
            module.eval()
        else:
            module.train()

    def on_train_batch_start(self, trainer, pl_module, *_):
        """
        Freezes/unfreezes scheduled layers according to the current global step.

        Args:
            trainer (Trainer): the trainer
            pl_module (LightningModule): model
        """
        step = trainer.global_step

        for name, (start, end) in self.schedule.items():
            should_be_frozen = start <= step <= end

            # Skip modules already in the desired state (first call always applies).
            if self.frozen_state.get(name, None) == should_be_frozen:
                continue

            submod = self._resolve_attr(pl_module, name)
            self._apply_freeze(submod, should_be_frozen)
            self.frozen_state[name] = should_be_frozen

    def on_train_start(self, trainer, pl_module):
        """
        on_train_start
        In case we resume from checkpoint, re-establish correct state

        Args:
            trainer (Trainer): the trainer
            pl_module (LightningModule): model
        """
        self.on_train_batch_start(trainer, pl_module, None, 0)
|
|