# Copyright 2025 The Scenic Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Loss functions."""

from absl import logging
from flax.training import common_utils
import jax
import jax.numpy as jnp
from scenic.model_lib.base_models import model_utils as base_model_utils


def nll_loss(targets, pred, target_masks=None, label_smoothing=0):
  """Negative log-likelihood loss (perplexity).

  Args:
    targets: Ground-truth labels.
    pred: Predicted logits.
    target_masks: Optional per-position weights; positions with zero weight
      do not count towards the loss.
    label_smoothing: Label-smoothing factor.

  Returns:
    Loss value.
  """
  vocab_size = pred.shape[-1]
  onehot_targets = common_utils.onehot(targets, vocab_size)
  return base_model_utils.weighted_softmax_cross_entropy(
      pred, onehot_targets, target_masks, label_smoothing=label_smoothing)


def contrastive_loss(query_emb: jnp.ndarray,
                     key_emb: jnp.ndarray,
                     temperature: float = 1.0):
  """Contrastive loss with hard negative samples and other in-batch negatives.

  Args:
    query_emb: An array of shape [bsz, n_dim].
    key_emb: An array of shape [bsz, n_knowledge, n_dim]. Only the first
      entry along the n_knowledge axis is the true positive; the others are
      hard negatives.
    temperature: Scalar temperature that the logits are divided by.

  Returns:
    Computed loss value, plus a tuple of (mean accuracy, s0, s1), where s0
    and s1 are two raw scores kept for debugging.
  """
  if query_emb.shape[0] != key_emb.shape[0]:
    raise ValueError('query_emb and key_emb should have the same batch size.')
  if query_emb.shape[-1] != key_emb.shape[-1]:
    raise ValueError(
        'query_emb and key_emb should have the same embedding size.')
  per_device_bsz, k = query_emb.shape[0], key_emb.shape[1]
  # Gather key embeddings from all devices along the 'batch' axis so every
  # query is scored against every key: [bsz * n_device, k, n_dim].
  global_key_emb = jnp.concatenate(jax.lax.all_gather(key_emb, 'batch'), 0)
  # After the reshape below, column j of the score matrix corresponds to
  # (k_idx, key_idx) = divmod(j, bsz * n_device), so the positive (k_idx == 0)
  # for query i on this device sits at column device_index * bsz + i.
  labels = jax.lax.axis_index(
      axis_name='batch') * per_device_bsz + jnp.arange(per_device_bsz)
  # [bsz, d] x [bsz * n_device, k, d] -> [bsz, k * bsz * n_device];
  # positive pairs lie on the (device-shifted) diagonal of the first
  # bsz * n_device columns.
  score_matrix = jnp.reshape(
      jnp.einsum('bd,nkd->bkn', query_emb, global_key_emb),
      [per_device_bsz, -1])
  loss = nll_loss(pred=score_matrix / temperature, targets=labels)
  accs = jnp.equal(jnp.argmax(score_matrix, axis=-1), labels)
  s0, s1 = score_matrix[0][0], score_matrix[0][1]  # Kept for debugging.
  logging.info('backward host_id: %d', jax.process_index())
  logging.info('device axis index: %s', jax.lax.axis_index(axis_name='batch'))
  return loss, (jnp.mean(accs), s0, s1)
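

if __name__ == '__main__':
  # Minimal usage sketch (illustrative assumptions, not part of the module
  # API): contrastive_loss calls jax.lax.all_gather / jax.lax.axis_index with
  # axis name 'batch', so it must run inside a pmapped function that defines
  # that axis. All shapes, the temperature value, and the toy data below are
  # made up for demonstration.
  import functools

  p_contrastive_loss = jax.pmap(
      functools.partial(contrastive_loss, temperature=0.1),
      axis_name='batch')

  n_dev = jax.local_device_count()
  bsz, n_knowledge, n_dim = 4, 3, 8
  q_rng, k_rng = jax.random.split(jax.random.PRNGKey(0))
  # Per-device batches: a leading n_dev axis is mapped over by pmap.
  queries = jax.random.normal(q_rng, (n_dev, bsz, n_dim))
  # keys[:, :, 0] is treated as the positive for the matching query.
  keys = jax.random.normal(k_rng, (n_dev, bsz, n_knowledge, n_dim))
  loss, (acc, s0, s1) = p_contrastive_loss(queries, keys)
  print('per-device loss:', loss, 'retrieval accuracy:', acc)

  # nll_loss can also be used standalone as a token-level cross-entropy
  # over a toy vocabulary of size 11, with a [batch, length] weight mask.
  logits = jax.random.normal(jax.random.PRNGKey(1), (2, 5, 11))
  tokens = jnp.zeros((2, 5), dtype=jnp.int32)
  masks = jnp.ones((2, 5))
  print('nll:', nll_loss(tokens, logits, target_masks=masks))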