import time
import warnings
from typing import Any, Callable, Dict, List, Optional

import torch
import torch.nn as nn
from captum._utils.models.linear_model.model import LinearModel
from torch.utils.data import DataLoader


def l2_loss(x1, x2, weights=None):
    if weights is None:
        return torch.mean((x1 - x2) ** 2) / 2.0
    else:
        return torch.sum((weights / weights.norm(p=1)) * ((x1 - x2) ** 2)) / 2.0
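
# A quick sanity check of the weighting (hypothetical values, not part of the
# module): with `weights`, each squared error is scaled by its L1-normalized
# weight before the sum, e.g.
#
#     l2_loss(torch.zeros(2), torch.tensor([2.0, 0.0]))
#     # -> mean([4., 0.]) / 2 = 1.0
#     l2_loss(torch.zeros(2), torch.tensor([2.0, 0.0]), torch.tensor([3.0, 1.0]))
#     # -> sum([0.75 * 4., 0.25 * 0.]) / 2 = 1.5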


def sgd_train_linear_model(
    model: LinearModel,
    dataloader: DataLoader,
    construct_kwargs: Dict[str, Any],
    max_epoch: int = 100,
    reduce_lr: bool = True,
    initial_lr: float = 0.01,
    alpha: float = 1.0,
    loss_fn: Callable = l2_loss,
    reg_term: Optional[int] = 1,
    patience: int = 10,
    threshold: float = 1e-4,
    running_loss_window: Optional[int] = None,
    device: Optional[str] = None,
    init_scheme: str = "zeros",
    debug: bool = False,
) -> Dict[str, float]:
| r""" | |
| Trains a linear model with SGD. This will continue to iterate your | |
| dataloader until we converged to a solution or alternatively until we have | |
| exhausted `max_epoch`. | |
| Convergence is defined by the loss not changing by `threshold` amount for | |
| `patience` number of iterations. | |
| Args: | |
| model | |
| The model to train | |
| dataloader | |
| The data to train it with. We will assume the dataloader produces | |
| either pairs or triples of the form (x, y) or (x, y, w). Where x and | |
| y are typical pairs for supervised learning and w is a weight | |
| vector. | |
| We will call `model._construct_model_params` with construct_kwargs | |
| and the input features set to `x.shape[1]` (`x.shape[0]` corresponds | |
| to the batch size). We assume that `len(x.shape) == 2`, i.e. the | |
| tensor is flat. The number of output features will be set to | |
| y.shape[1] or 1 (if `len(y.shape) == 1`); we require `len(y.shape) | |
| <= 2`. | |
| max_epoch | |
| The maximum number of epochs to exhaust | |
| reduce_lr | |
| Whether or not to reduce the learning rate as iterations progress. | |
| Halves the learning rate when the training loss does not move. This | |
| uses torch.optim.lr_scheduler.ReduceLROnPlateau and uses the | |
| parameters `patience` and `threshold` | |
| initial_lr | |
| The initial learning rate to use. | |
| alpha | |
| A constant for the regularization term. | |
| loss_fn | |
| The loss to optimise for. This must accept three parameters: | |
| x1 (predicted), x2 (labels) and a weight vector | |
| reg_term | |
| Regularization is defined by the `reg_term` norm of the weights. | |
| Please use `None` if you do not wish to use regularization. | |
| patience | |
| Defines the number of iterations in a row the loss must remain | |
| within `threshold` in order to be classified as converged. | |
| threshold | |
| Threshold for convergence detection. | |
| running_loss_window | |
| Used to report the training loss once we have finished training and | |
| to determine when we have converged (along with reducing the | |
| learning rate). | |
| The reported training loss will take the last `running_loss_window` | |
| iterations and average them. | |
| If `None` we will approximate this to be the number of examples in | |
| an epoch. | |
| init_scheme | |
| Initialization to use prior to training the linear model. | |
| device | |
| The device to send the model and data to. If None then no `.to` call | |
| will be used. | |
| debug | |
| Whether to print the loss, learning rate per iteration | |
| Returns | |
| This will return the final training loss (averaged with | |
| `running_loss_window`) | |
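
    Example (a minimal sketch; `model` is assumed to be a `LinearModel`
    subclass instance and the tensors are hypothetical)::

        >>> from torch.utils.data import DataLoader, TensorDataset
        >>> xs, ys = torch.randn(100, 5), torch.randn(100, 1)
        >>> loader = DataLoader(TensorDataset(xs, ys), batch_size=10)
        >>> stats = sgd_train_linear_model(model, loader, construct_kwargs={})
        >>> stats["train_loss"]  # averaged over the running loss window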
| """ | |
    loss_window: List[torch.Tensor] = []
    min_avg_loss = None
    convergence_counter = 0
    converged = False

    def get_point(datapoint):
        if len(datapoint) == 2:
            x, y = datapoint
            w = None
        else:
            x, y, w = datapoint

        if device is not None:
            x = x.to(device)
            y = y.to(device)
            if w is not None:
                w = w.to(device)

        return x, y, w

    # get a point and construct the model
    data_iter = iter(dataloader)
    x, y, w = get_point(next(data_iter))

    model._construct_model_params(
        in_features=x.shape[1],
        out_features=y.shape[1] if len(y.shape) == 2 else 1,
        **construct_kwargs,
    )
    model.train()

    assert model.linear is not None

    if init_scheme is not None:
        assert init_scheme in ["xavier", "zeros"]

        with torch.no_grad():
            if init_scheme == "xavier":
                torch.nn.init.xavier_uniform_(model.linear.weight)
            else:
                model.linear.weight.zero_()

            if model.linear.bias is not None:
                model.linear.bias.zero_()

    optim = torch.optim.SGD(model.parameters(), lr=initial_lr)
    if reduce_lr:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optim, factor=0.5, patience=patience, threshold=threshold
        )
    else:
        # keep the name bound so the `if scheduler:` check in the training
        # loop cannot raise a NameError when reduce_lr is False
        scheduler = None

    t1 = time.time()
    epoch = 0
    i = 0
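
    # Iterate the dataloader manually (rather than `for x, y, w in dataloader`)
    # so the first batch, already consumed above to size the model, is still
    # trained on before the next one is fetched.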
    while epoch < max_epoch:
        while True:  # for x, y, w in dataloader
            if running_loss_window is None:
                running_loss_window = x.shape[0] * len(dataloader)

            y = y.view(x.shape[0], -1)
            if w is not None:
                w = w.view(x.shape[0], -1)

            i += 1

            out = model(x)

            loss = loss_fn(y, out, w)
            if reg_term is not None:
                reg = torch.norm(model.linear.weight, p=reg_term)
                loss += reg.sum() * alpha

            if len(loss_window) >= running_loss_window:
                loss_window = loss_window[1:]
            loss_window.append(loss.clone().detach())
            assert len(loss_window) <= running_loss_window
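
            # Convergence bookkeeping: average the sliding window of recent
            # losses and count consecutive iterations that fail to improve on
            # the best average by more than `threshold`.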
            average_loss = torch.mean(torch.stack(loss_window))
            if min_avg_loss is not None:
                # if we haven't improved by at least `threshold`
                if average_loss > min_avg_loss or torch.isclose(
                    min_avg_loss, average_loss, atol=threshold
                ):
                    convergence_counter += 1
                    if convergence_counter >= patience:
                        converged = True
                        break
                else:
                    convergence_counter = 0

            if min_avg_loss is None or min_avg_loss >= average_loss:
                min_avg_loss = average_loss.clone()

            if debug:
                print(
                    f"lr={optim.param_groups[0]['lr']}, Loss={loss},"
                    f" Aloss={average_loss}, min_avg_loss={min_avg_loss}"
                )

            loss.backward()
            optim.step()
            model.zero_grad()
            if scheduler:
                scheduler.step(average_loss)

            temp = next(data_iter, None)
            if temp is None:
                break

            x, y, w = get_point(temp)

        if converged:
            break

        epoch += 1
        data_iter = iter(dataloader)
        x, y, w = get_point(next(data_iter))

    t2 = time.time()
    return {
        "train_time": t2 - t1,
        "train_loss": torch.mean(torch.stack(loss_window)).item(),
        "train_iter": i,
        "train_epoch": epoch,
    }


class NormLayer(nn.Module):
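    # Fixed-statistics normalization: standardizes x with the given mean/std;
    # `eps` guards against zero entries in `std` (the `n` argument is unused
    # here).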
    def __init__(self, mean, std, n=None, eps=1e-8) -> None:
        super().__init__()
        self.mean = mean
        self.std = std
        self.eps = eps

    def forward(self, x):
        return (x - self.mean) / (self.std + self.eps)


def sklearn_train_linear_model(
    model: LinearModel,
    dataloader: DataLoader,
    construct_kwargs: Dict[str, Any],
    sklearn_trainer: str = "linear_model.Lasso",
    norm_input: bool = False,
    **fit_kwargs,
):
| r""" | |
| Alternative method to train with sklearn. This does introduce some slight | |
| overhead as we convert the tensors to numpy and then convert the resulting | |
| trained model to a `LinearModel` object. However, this conversion | |
| should be negligible. | |
| Please note that this assumes: | |
| 0. You have sklearn and numpy installed | |
| 1. The dataset can fit into memory | |
| Args | |
| model | |
| The model to train. | |
| dataloader | |
| The data to use. This will be exhausted and converted to numpy | |
| arrays. Therefore please do not feed an infinite dataloader. | |
| norm_input | |
| Whether or not to normalize the input | |
| sklearn_trainer | |
| The sklearn model to use to train the model. Please refer to | |
| sklearn.linear_model for a list of modules to use. | |
| construct_kwargs | |
| Additional arguments provided to the `sklearn_trainer` constructor | |
| fit_kwargs | |
| Other arguments to send to `sklearn_trainer`'s `.fit` method | |
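
    Example (a minimal sketch; `model` and `loader` are a hypothetical
    `LinearModel` subclass instance and a finite DataLoader)::

        >>> stats = sklearn_train_linear_model(
        ...     model,
        ...     loader,
        ...     construct_kwargs={"alpha": 0.1},
        ...     sklearn_trainer="linear_model.Lasso",
        ... )
        >>> stats["train_time"]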
| """ | |
    from functools import reduce

    try:
        import numpy as np
    except ImportError:
        raise ValueError("numpy is not available. Please install numpy.")

    try:
        import sklearn
        import sklearn.linear_model
        import sklearn.svm
    except ImportError:
        raise ValueError("sklearn is not available. Please install sklearn >= 0.23")

    if not sklearn.__version__ >= "0.23.0":
        warnings.warn(
            "Must have sklearn version 0.23.0 or higher to use "
            "sample_weight in Lasso regression."
        )

    num_batches = 0
    xs, ys, ws = [], [], []
    for data in dataloader:
        if len(data) == 3:
            x, y, w = data
        else:
            assert len(data) == 2
            x, y = data
            w = None

        xs.append(x.cpu().numpy())
        ys.append(y.cpu().numpy())
        if w is not None:
            ws.append(w.cpu().numpy())

        num_batches += 1

    x = np.concatenate(xs, axis=0)
    y = np.concatenate(ys, axis=0)
    if len(ws) > 0:
        w = np.concatenate(ws, axis=0)
    else:
        w = None

    if norm_input:
        mean, std = x.mean(0), x.std(0)
        x -= mean
        x /= std

    t1 = time.time()
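    # Resolve the dotted path relative to the `sklearn` package (e.g.
    # "linear_model.Lasso" -> sklearn.linear_model.Lasso) and construct it.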
    sklearn_model = reduce(
        lambda val, el: getattr(val, el), [sklearn] + sklearn_trainer.split(".")
    )(**construct_kwargs)
    try:
        sklearn_model.fit(x, y, sample_weight=w, **fit_kwargs)
    except TypeError:
        sklearn_model.fit(x, y, **fit_kwargs)
        warnings.warn(
            "Sample weight is not supported for the provided linear model!"
            " Trained model without weighting inputs. For Lasso, please"
            " upgrade sklearn to a version >= 0.23.0."
        )
    t2 = time.time()

    # Convert weights to pytorch
    classes = (
        torch.IntTensor(sklearn_model.classes_)  # type: ignore
        if hasattr(sklearn_model, "classes_")
        else None
    )

    # extract model device
    device = model.device if hasattr(model, "device") else "cpu"

    num_outputs = sklearn_model.coef_.shape[0] if sklearn_model.coef_.ndim > 1 else 1
    weight_values = torch.FloatTensor(sklearn_model.coef_).to(device)  # type: ignore
    bias_values = torch.FloatTensor([sklearn_model.intercept_]).to(  # type: ignore
        device
    )
    model._construct_model_params(
        norm_type=None,
        weight_values=weight_values.view(num_outputs, -1),
        bias_value=bias_values.squeeze().unsqueeze(0),
        classes=classes,
    )

    if norm_input:
        model.norm = NormLayer(mean, std)

    return {"train_time": t2 - t1}