# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
#
from dataclasses import dataclass, field
from typing import List, Tuple

import torch
import torch.nn.functional as F
from fairseq2.logging import get_log_writer
from fairseq2.nn.padding import pad_seqs
from torch import Tensor

from lcm.datasets.batch import EmbeddingsBatch, LCMInput, LCMStyle
from lcm.models.two_tower_diffusion_lcm.builder import TwoTowerDiffusionLCModel
from lcm.train.criterion import CriterionsFactory
from lcm.train.lcm.criterion import (
LCMCriterion,
LCMCriterionConfig,
compute_standard_mse,
)
from lcm.train.metrics import LossTerm, format_as_float, register_metric_formatter
from lcm.train.step_sampler import StepsSampler, StepsSamplerConfig

logger = get_log_writer(__name__)


@dataclass
class TowerDiffusionLCMCriterionConfig(LCMCriterionConfig):
cf_guidance_probability: float = 0.0
"""Probability to use classifier-free guidance by dropping conditioning.
Note that this requires the model to be set with
`trained_with_cf_guidance = True`!
"""
step_sampling: StepsSamplerConfig = field(
default_factory=lambda: StepsSamplerConfig()
)
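    """Config of the sampler that draws diffusion timesteps and
    provides the per-timestep loss scales (see `StepsSampler`)."""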
log_losses_per_timestep_bucket: bool = False
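    """If True, additionally log the reconstruction loss aggregated
    per bucket of diffusion timesteps."""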


@CriterionsFactory.register("two_tower_diffusion_next_sent")
class TwoTowerDiffusionCriterion(LCMCriterion):
"""Computes the LCM training objective for next-sentence prediction with diffusion"""
config: TowerDiffusionLCMCriterionConfig
model: TwoTowerDiffusionLCModel

    def __init__(
self,
config: TowerDiffusionLCMCriterionConfig,
model: TwoTowerDiffusionLCModel,
style: LCMStyle = LCMStyle.UNSUPERVISED,
):
super().__init__(config, model, style)
assert hasattr(self.base_model, "noise_scheduler"), (
"Expecting the diffusion model to have a `noise_scheduler`"
)
self.noise_scheduler = self.base_model.noise_scheduler
self.prediction_type = self.noise_scheduler.prediction_type
self.trained_with_cf_guidance = self.base_model.config.trained_with_cf_guidance
self.cf_guidance_probability = config.cf_guidance_probability
        assert (
            bool(self.cf_guidance_probability > 0) == self.trained_with_cf_guidance
        ), (
            "Expecting the config's cf_guidance_probability to align with the model's `trained_with_cf_guidance`. "
            f"Found cf_guidance_probability={config.cf_guidance_probability} and "
            f"trained_with_cf_guidance={self.trained_with_cf_guidance}"
        )
assert self.normalize_in_criterion, (
"We only support `normalize_in_criterion = True` in the diffusion criterions"
)
self.summands.append("unnormalized_reconstruction_loss")
if self.config.log_losses_per_timestep_bucket:
# customize if needed
self.step_bucketing_boundaries = torch.linspace(
0, self.noise_scheduler.num_diffusion_train_steps, 11
)
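            # 11 boundaries -> 10 equal-width timestep buckets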
self.step_bucketing_labels: List[str] = []
for e in range(len(self.step_bucketing_boundaries) - 1):
bucket_left = self.step_bucketing_boundaries[e]
bucket_right = self.step_bucketing_boundaries[e + 1]
self.step_bucketing_labels.append(
f"reconstruction_loss_t{bucket_left:.0f}-{bucket_right:.0f}"
)
self.summands.extend(self.step_bucketing_labels)
for label in self.step_bucketing_labels:
register_metric_formatter(
label, label, 1000, format_as_float, overwrite=True
)
# Step sampler + loss weighter
self.step_sampler = StepsSampler(
config.step_sampling,
noise_scheduler=self.noise_scheduler,
)

    def prepare_input_and_mask(
self,
batch: LCMInput,
) -> Tuple[EmbeddingsBatch, EmbeddingsBatch, torch.Tensor]:
"""
        Prepare the model inputs and the target mask for a batch.
        This method is typically reused by the `__call__`
        implementations of the subclasses.
        Returns:
            - input_batch: context
            - target_batch: denoiser input
            - target_mask: mask of positions to compute the loss over
"""
# Prepare the input as in MSE LCM: each sequence is (src, tgt)
input_embeddings = batch.prepare_input(style=self.style)
# Normalize the embeddings
if self.normalize_in_criterion:
input_embeddings = input_embeddings.normalize_seqs(self.sonar_normalizer)
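        # Start from an all-True mask: by default, every position contributes to the loss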
target_mask = torch.ones(
size=input_embeddings.seqs.shape[:-1],
dtype=torch.bool,
device=input_embeddings.seqs.device,
)
# Factor in padded positions:
if input_embeddings.padding_mask is not None:
target_mask &= input_embeddings.padding_mask.materialize()
return input_embeddings, input_embeddings.clone(), target_mask

    def sample_noisy_input_and_targets(
        self, input_batch: EmbeddingsBatch, target_mask: Tensor
    ) -> Tuple[EmbeddingsBatch, Tensor, Tensor]:
"""
        (1) Prepare the noised inputs (latents) by sampling diffusion timesteps and
        calling the model's noise_scheduler to add noise accordingly.
        (2) Given the scheduler's prediction type, prepare the target that the model
        will be trained to predict.
        :param input_batch: EmbeddingsBatch of the ground truth embeddings with seqs in (B, T, C)
        :param target_mask: Bool tensor in (B, T) where `True` signals that the
            model will be asked to predict the position
"""
input_seqs, padding_mask = input_batch.seqs, input_batch.padding_mask
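        # Sample one diffusion timestep per position: `input_seqs[..., 0]` has shape (B, T)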
timesteps = self.step_sampler.sample(
size=input_seqs[..., 0].size(), device=input_seqs.device
)
# Sample noise
noise_seqs = torch.randn_like(input_seqs)
# Define target in (B*T, C)
sonar_dim = input_seqs.size(-1)
if self.prediction_type == "sample":
"""Predict the clean ground truth embeddings. Default mode"""
target = input_seqs.view(-1, sonar_dim)
elif self.prediction_type == "epsilon":
"""Predict the added noise"""
target = noise_seqs.view(-1, sonar_dim)
elif self.prediction_type == "v_prediction":
"""Predict an interpolation of the ground truth clean
embeddings and the added noise.
As introduced in https://arxiv.org/pdf/2305.08891
"""
target = self.noise_scheduler.get_velocity(
input_seqs.view(-1, sonar_dim),
noise_seqs.view(-1, sonar_dim),
timesteps.view(-1),
).clone()
else:
raise ValueError(
"Prediction type should be either: sample, epsilon, v_prediction"
)
# Add noise
        # Reshape inputs and noise to (B*T, C) -> add noise -> reshape back to (B, T, C)
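        # Assuming a diffusers-style scheduler, `add_noise` computes
        # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise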
noisy_input_seqs = self.noise_scheduler.add_noise(
input_seqs.view(-1, sonar_dim),
noise_seqs.view(-1, sonar_dim),
timesteps.view(-1),
).view(input_seqs.size())
# Create sequence batch with diffusion timesteps
noisy_input_batch = EmbeddingsBatch(
noisy_input_seqs,
padding_mask,
diffusion_timesteps=timesteps,
)
return noisy_input_batch, target, target_mask

    def compute_loss(
        self, flattened_predictions: Tensor, flattened_target: Tensor
    ) -> Tuple[Tensor, Tensor, Tensor]:
"""
Parameters:
flattened_predictions (Tensor): The predictions in (N, C)
flattened_target (Tensor): The targets in (N, C)
Returns:
            reconstruction_loss (Tensor): the reconstruction loss we want to optimize
                (MSE, SmoothL1, Huber, etc.; the sqrt is taken below when `compute_rmse` is set).
            plain_reconstruction_loss (Tensor): plain MSE loss.
            unnormalized_reconstruction_loss (Tensor): plain MSE loss between unnormalized features.
"""
reconstruction_loss, plain_reconstruction_loss = compute_standard_mse(
flattened_predictions,
flattened_target,
)
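        # With `normalizer` set, `compute_standard_mse` measures the error between
        # denormalized features, i.e. in the raw (unnormalized) SONAR space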
unnormalized_reconstruction_loss, _ = compute_standard_mse(
flattened_predictions,
flattened_target,
normalizer=self.sonar_normalizer,
)
# For backward compatibility with ongoing runs, take the sqrt
if self.config.compute_rmse:
epsilon = 1e-5
reconstruction_loss = torch.sqrt(reconstruction_loss + epsilon)
plain_reconstruction_loss = torch.sqrt(plain_reconstruction_loss + epsilon)
unnormalized_reconstruction_loss = torch.sqrt(
unnormalized_reconstruction_loss + epsilon
)
return (
reconstruction_loss,
plain_reconstruction_loss,
unnormalized_reconstruction_loss,
)

    @torch.no_grad()
def _log_losses_per_step(self, batch_steps, reconstruction_loss):
# Aggregate loss terms based on their bucket of diffusion steps for tracking
summands = {}
if self.config.log_losses_per_timestep_bucket:
            # reconstruction_loss and batch_steps are both flattened to shape (B*T,)
bucket_index = torch.bucketize(
batch_steps, self.step_bucketing_boundaries.to(batch_steps.device)
)
onehot = F.one_hot(
bucket_index,
num_classes=self.step_bucketing_boundaries.numel(),
)
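            # onehot.t() @ loss sums the per-position losses within each bucket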
loss_per_step = torch.matmul(onehot.t().float(), reconstruction_loss)
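            # the small epsilon guards against division by zero for empty buckets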
count_steps = onehot.sum(dim=0) + 1e-6
if self.reduction == "mean":
loss_per_step /= count_steps
for e, label in enumerate(self.step_bucketing_labels):
summands[label] = (
loss_per_step[e].item(),
count_steps[e].long().item(),
)
return summands

    def __call__(self, batch: LCMInput) -> LossTerm:
"""
Input batch is LCMInput with:
source: List[Tensor]
target: Union[None, List[Tensor]]
"""
# Prepare the clean inputs and target mask:
input_batch, target_batch, target_mask = self.prepare_input_and_mask(batch)
noisy_target_batch, target, target_mask = self.sample_noisy_input_and_targets(
target_batch, target_mask
)
# Encode the context and diffuse:
output_batch = self.model(
input_batch,
noisy_target_batch,
cf_guidance_prob=self.cf_guidance_probability,
)
# Shape B, T, C
output_seqs = output_batch.seqs
sonar_dim = output_seqs.size(-1)
# only measure distance over `target_mask = True` positions
target_mask = target_mask.reshape(-1)
        # The target is basically the ground truth sequence before noising
        # (with some modification to adjust for the denoiser's prediction type).
        # Predictions: contextualized latents e_1, e_2, ... flattened to (B*T, C)
        flattened_predictions = output_seqs.view(-1, sonar_dim)[target_mask]
        # Targets x_1, x_2, ..., x_T; `target` is already flattened to (B*T, C)
        flattened_target = target[target_mask]
# Cast features to float32 before computing the loss:
(
reconstruction_loss,
mse_loss,
unnormalized_reconstruction_loss,
) = self.compute_loss(flattened_predictions.float(), flattened_target.float())
num_target_elements = target_mask.sum()
batch_steps = noisy_target_batch.diffusion_timesteps.view(-1)[target_mask]
summands = self._log_losses_per_step(batch_steps, reconstruction_loss)
# Get loss scales per timestep (gamma)
gammas = self.step_sampler.get_loss_scales(batch_steps)
# Weight the loss terms
if gammas is not None:
reconstruction_loss = torch.mul(reconstruction_loss, gammas)
if self.reduction == "sum" or num_target_elements == 0:
reduced_reconstruction_loss = reconstruction_loss.sum()
mse_loss = mse_loss.sum()
unnormalized_reconstruction_loss = unnormalized_reconstruction_loss.sum()
elif self.reduction == "mean":
reduced_reconstruction_loss = reconstruction_loss.mean()
mse_loss = mse_loss.mean()
unnormalized_reconstruction_loss = unnormalized_reconstruction_loss.mean()
final_loss = reduced_reconstruction_loss
# Loss summands for records
summands.update(
{
"mse_loss": (mse_loss.item(), -1),
"reconstruction_loss": (reduced_reconstruction_loss.item(), -1),
"unnormalized_reconstruction_loss": (
unnormalized_reconstruction_loss.item(),
-1,
),
}
)
return LossTerm(
value=final_loss,
batch_size=output_seqs.size(0),
num_target_elements=num_target_elements.item(),
summands=summands,
)


@CriterionsFactory.register("two_tower_diffusion_next_sent_finetuning")
class DiffusionNextSentFinetuningCriterion(TwoTowerDiffusionCriterion):
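    """Supervised variant of `TwoTowerDiffusionCriterion` where the denoiser
    targets are taken from `batch.target` rather than from the input sequence."""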

    def __init__(
self,
config: TowerDiffusionLCMCriterionConfig,
model: TwoTowerDiffusionLCModel,
):
super().__init__(config, model, LCMStyle.SUPERVISED)

    def prepare_input_and_mask(
self,
batch: LCMInput,
) -> Tuple[EmbeddingsBatch, EmbeddingsBatch, torch.Tensor]:
"""
        Prepare the model inputs and the target mask for a batch
        in the supervised (source, target) setting.
        Returns:
            - input_batch: context
            - target_batch: denoiser input
            - target_mask: mask of positions to compute the loss over
"""
# Prepare the input as in MSE LCM
input_embeddings = batch.prepare_input(style=self.style)
assert input_embeddings.source_lengths is not None, (
"Missing source lengths needed for the two-tower supervised fintuning"
)
target_embeddings = EmbeddingsBatch(*pad_seqs(batch.target)) # type: ignore
# Normalize the embeddings
if self.normalize_in_criterion:
input_embeddings = input_embeddings.normalize_seqs(self.sonar_normalizer)
target_embeddings = target_embeddings.normalize_seqs(self.sonar_normalizer)
        target_mask = torch.ones(
            size=target_embeddings.seqs.shape[:-1],
            dtype=torch.bool,
            device=input_embeddings.seqs.device,
        )
# Factor in padded positions:
if target_embeddings.padding_mask is not None:
target_mask &= target_embeddings.padding_mask.materialize()
return input_embeddings, target_embeddings, target_mask