cyrilvallez HF Staff commited on
Commit
b9006ff
·
verified ·
1 Parent(s): 30636bc

Update custom_generate/generate.py

Browse files
Files changed (1) hide show
  1. custom_generate/generate.py +11 -55
custom_generate/generate.py CHANGED
@@ -282,11 +282,11 @@ def _contrastive_search(
282
  f"{model.__class__.__name__} does not support caching and therefore **can't** be used "
283
  "for contrastive search."
284
  )
285
- # We now only use Cache classes, but a few models have custom cache class, so we use this check instead of an instance check
286
- elif not hasattr(past_key_values, "update"):
287
  raise ValueError(
288
- f"{model.__class__.__name__} does not have a standard cache format and therefore **can't** be "
289
- "used for contrastive search without further modifications."
290
  )
291
 
292
  # contrastive_search main logic start:
@@ -324,19 +324,7 @@ def _contrastive_search(
324
  del outputs
325
 
326
  if not sequential:
327
- # Replicates the new past_key_values to match the `top_k` candidates
328
- if isinstance(outputs["past_key_values"], DynamicCache) or (
329
- isinstance(outputs["past_key_values"], EncoderDecoderCache)
330
- and isinstance(
331
- outputs["past_key_values"].self_attention_cache, DynamicCache
332
- )
333
- ):
334
- model_kwargs["past_key_values"] = model_kwargs["past_key_values"].batch_repeat_interleave(top_k)
335
- else:
336
- raise ValueError(
337
- f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
338
- "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
339
- )
340
 
341
  if sequential:
342
  all_outputs = []
@@ -352,21 +340,10 @@ def _contrastive_search(
352
  output_hidden_states=True,
353
  output_attentions=output_attentions,
354
  )
355
- if isinstance(outputs["past_key_values"], DynamicCache) or (
356
- isinstance(outputs["past_key_values"], EncoderDecoderCache)
357
- and isinstance(
358
- outputs["past_key_values"].self_attention_cache, DynamicCache
359
- )
360
- ):
361
- # Remove past K-V from output since we don't need to stack later
362
- outputs["past_key_values"] = None
363
- # Remove last token from past K-V since we don't want to append it at this point
364
- model_kwargs["past_key_values"].crop(-1)
365
- else:
366
- raise ValueError(
367
- f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
368
- "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
369
- )
370
 
371
  all_outputs.append(outputs)
372
  outputs = stack_model_outputs(all_outputs, model.config.get_text_config())
@@ -463,17 +440,7 @@ def _contrastive_search(
463
  next_past_key_values = next_past_key_values or getattr(
464
  outputs, possible_cache_name, None
465
  )
466
- # Do it in-place layer per layer to save memory
467
- if isinstance(next_past_key_values, DynamicCache) or (
468
- isinstance(next_past_key_values, EncoderDecoderCache)
469
- and isinstance(next_past_key_values.self_attention_cache, DynamicCache)
470
- ):
471
- next_past_key_values.batch_select_indices(augmented_idx)
472
- else:
473
- raise ValueError(
474
- f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
475
- "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
476
- )
477
 
478
  logit_for_next_step = torch.stack(torch.split(logits, top_k))[
479
  range(batch_size), selected_idx, :
@@ -549,18 +516,7 @@ def _contrastive_search(
549
  # Contrastive search works by forward looking at the next token, so we need to exclude it from
550
  # `past_key_values` to be consistent with the other decoding methods
551
  if model_kwargs.get("past_key_values") is not None:
552
- if isinstance(model_kwargs["past_key_values"], DynamicCache) or (
553
- isinstance(model_kwargs["past_key_values"], EncoderDecoderCache)
554
- and isinstance(
555
- model_kwargs["past_key_values"].self_attention_cache, DynamicCache
556
- )
557
- ):
558
- model_kwargs["past_key_values"].crop(-1)
559
- else:
560
- raise ValueError(
561
- f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
562
- "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
563
- )
564
 
565
  if model.config.is_encoder_decoder:
566
  return GenerateEncoderDecoderOutput(
 
282
  f"{model.__class__.__name__} does not support caching and therefore **can't** be used "
283
  "for contrastive search."
284
  )
285
+ # Only those caches have the necessary methods
286
+ elif not (isinstance(past_key_values, DynamicCache) or (isinstance(past_key_values, EncoderDecoderCache) and isinstance(past_key_values.self_attention_cache, DynamicCache))):
287
  raise ValueError(
288
+ f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
289
+ "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
290
  )
291
 
292
  # contrastive_search main logic start:
 
324
  del outputs
325
 
326
  if not sequential:
327
+ model_kwargs["past_key_values"] = model_kwargs["past_key_values"].batch_repeat_interleave(top_k)
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
  if sequential:
330
  all_outputs = []
 
340
  output_hidden_states=True,
341
  output_attentions=output_attentions,
342
  )
343
+ # Remove past K-V from output since we don't need to stack later
344
+ outputs["past_key_values"] = None
345
+ # Remove last token from past K-V since we don't want to append it at this point
346
+ model_kwargs["past_key_values"].crop(-1)
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  all_outputs.append(outputs)
349
  outputs = stack_model_outputs(all_outputs, model.config.get_text_config())
 
440
  next_past_key_values = next_past_key_values or getattr(
441
  outputs, possible_cache_name, None
442
  )
443
+ next_past_key_values.batch_select_indices(augmented_idx)
 
 
 
 
 
 
 
 
 
 
444
 
445
  logit_for_next_step = torch.stack(torch.split(logits, top_k))[
446
  range(batch_size), selected_idx, :
 
516
  # Contrastive search works by forward looking at the next token, so we need to exclude it from
517
  # `past_key_values` to be consistent with the other decoding methods
518
  if model_kwargs.get("past_key_values") is not None:
519
+ model_kwargs["past_key_values"].crop(-1)
 
 
 
 
 
 
 
 
 
 
 
520
 
521
  if model.config.is_encoder_decoder:
522
  return GenerateEncoderDecoderOutput(