| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """TF general model utils.""" |
|
|
| from __future__ import annotations |
|
|
| import functools |
| import gc |
| import inspect |
| import json |
| import os |
| import pickle |
| import re |
| import warnings |
| from collections.abc import Mapping |
| from pathlib import Path |
| from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union |
|
|
| import h5py |
| import numpy as np |
| import tensorflow as tf |
| from huggingface_hub import Repository, list_repo_files |
| from keras import backend as K |
| from packaging.version import parse |
| from tensorflow.python.util.keras_deps import get_call_context_function |
|
|
| from . import DataCollatorWithPadding, DefaultDataCollator |
| from .activations_tf import get_tf_activation |
| from .configuration_utils import PretrainedConfig |
| from .dynamic_module_utils import custom_object_save |
| from .generation import GenerationConfig, TFGenerationMixin |
| from .tf_utils import ( |
| expand_1d, |
| load_attributes_from_hdf5_group, |
| save_attributes_to_hdf5_group, |
| shape_list, |
| ) |
| from .utils import ( |
| SAFE_WEIGHTS_INDEX_NAME, |
| SAFE_WEIGHTS_NAME, |
| TF2_WEIGHTS_INDEX_NAME, |
| TF2_WEIGHTS_NAME, |
| TF_WEIGHTS_NAME, |
| WEIGHTS_INDEX_NAME, |
| WEIGHTS_NAME, |
| ModelOutput, |
| PushToHubMixin, |
| cached_file, |
| download_url, |
| find_labels, |
| has_file, |
| is_offline_mode, |
| is_remote_url, |
| is_safetensors_available, |
| is_tf_symbolic_tensor, |
| logging, |
| requires_backends, |
| working_or_temp_dir, |
| ) |
| from .utils.hub import convert_file_size_to_int, get_checkpoint_shard_files |
|
|
|
|
| if is_safetensors_available(): |
| from safetensors import safe_open |
| from safetensors.tensorflow import save_file as safe_save_file |
|
|
| if TYPE_CHECKING: |
| from . import PreTrainedTokenizerBase |
|
|
|
|
# Module-level loggers: one obtained from the library's `logging` helper for this module,
# and the logger object returned by TensorFlow itself.
logger = logging.get_logger(__name__)
tf_logger = tf.get_logger()

# Type alias covering the containers / array types listed below: lists or string-keyed dicts of
# `tf.Tensor` / `np.ndarray`, or a bare `tf.Tensor` / `np.ndarray`.
TFModelInputType = Union[
    List[tf.Tensor],
    List[np.ndarray],
    Dict[str, tf.Tensor],
    Dict[str, np.ndarray],
    tf.Tensor,
    np.ndarray,
]
|
|
|
|
def dummy_loss(y_true, y_pred):
    """
    Pass-through loss: `y_true` is ignored and the result is derived from `y_pred` only.

    Args:
        y_true: Unused.
        y_pred: An object exposing `shape.rank` (e.g. a tensor).

    Returns:
        `y_pred` itself when its rank is 0 or 1, otherwise the mean of `y_pred` over every axis except the first.
    """
    rank = y_pred.shape.rank
    if rank > 1:
        # Average over axes 1..rank-1, keeping the leading axis.
        return tf.reduce_mean(y_pred, axis=list(range(1, rank)))
    return y_pred
|
|
|
|
class TFModelUtilsMixin:
    """
    A few utilities for `tf.keras.Model`, to be used as a mixin.
    """

    def num_parameters(self, only_trainable: bool = False) -> int:
        """
        Count the parameters of the model.

        Args:
            only_trainable (`bool`, *optional*, defaults to `False`):
                When `True`, only the variables in `self.trainable_variables` are counted; otherwise the value of
                `self.count_params()` is returned.

        Returns:
            `int`: The number of parameters.
        """
        if not only_trainable:
            return self.count_params()

        # Each variable contributes the product of its dimensions.
        trainable_total = sum(np.prod(variable.shape.as_list()) for variable in self.trainable_variables)
        return int(trainable_total)
|
|
|
|
def keras_serializable(cls):
    """
    Decorate a Keras Layer class to support Keras serialization.

    This is done by:

    1. Adding a `config` dict (the serialized configuration object) to the Keras config dictionary in `get_config`
       (called by Keras at serialization time).
    2. Wrapping `__init__` to accept that `config` dict (passed by Keras at deserialization time) and convert it to a
       config object for the actual layer initializer.
    3. Registering the class as a custom object in Keras (if the TensorFlow version supports this), so that it does
       not need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`.

    Args:
        cls (a `tf.keras.layers.Layer` subclass):
            Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to its
            initializer. Must define a `config_class` attribute.

    Returns:
        The same class object, with modifications for Keras deserialization.

    Raises:
        AttributeError: If `cls` has no (or a `None`) `config_class` attribute.
        TypeError: If `cls` has no `get_config` attribute.
    """
    # Keep a handle on the undecorated initializer; `wrapped_init` delegates to it.
    initializer = cls.__init__

    config_class = getattr(cls, "config_class", None)
    if config_class is None:
        raise AttributeError("Must set `config_class` to use @keras_serializable")

    @functools.wraps(initializer)
    def wrapped_init(self, *args, **kwargs):
        # The config is taken from the first positional argument when that is a `PretrainedConfig`, otherwise it is
        # popped from the keyword arguments.
        config = args[0] if args and isinstance(args[0], PretrainedConfig) else kwargs.pop("config", None)

        if isinstance(config, dict):
            # A plain dict is rebuilt into a config object before calling the real initializer.
            config = config_class.from_dict(config)
            initializer(self, config, *args, **kwargs)
        elif isinstance(config, PretrainedConfig):
            if len(args) > 0:
                # Positional arguments are forwarded untouched.
                # NOTE(review): this presumes the config is `args[0]`; confirm callers never mix a positional
                # non-config argument with a `config=` keyword.
                initializer(self, *args, **kwargs)
            else:
                initializer(self, config, *args, **kwargs)
        else:
            raise ValueError("Must pass either `config` (PretrainedConfig) or `config` (dict)")

        # Remembered so that the `get_config` defined below can serialize them.
        self._config = config
        self._kwargs = kwargs

    cls.__init__ = wrapped_init

    if not hasattr(cls, "get_config"):
        raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses")
    # `get_config` is only replaced when the existing one carries the `_is_default` marker attribute.
    if hasattr(cls.get_config, "_is_default"):

        def get_config(self):
            cfg = super(cls, self).get_config()
            cfg["config"] = self._config.to_dict()
            cfg.update(self._kwargs)
            return cfg

        cls.get_config = get_config

    cls._keras_serializable = True
    # Registration is skipped when `tf.keras.utils` does not provide `register_keras_serializable`.
    if hasattr(tf.keras.utils, "register_keras_serializable"):
        cls = tf.keras.utils.register_keras_serializable()(cls)
    return cls
|
|
|
|
class TFCausalLanguageModelingLoss:
    """
    Loss function suitable for causal language modeling (CLM), that is, the task of guessing the next token.

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.

    </Tip>
    """

    def hf_compute_loss(self, labels, logits):
        """
        Compute the sparse categorical cross-entropy between `labels` and `logits`, skipping labels equal to -100.

        With `self.config.tf_legacy_loss` set, the per-position losses of the non-ignored positions are returned.
        Otherwise the losses are averaged over the non-ignored positions and returned with shape `(1,)`.
        """
        cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )

        if not self.config.tf_legacy_loss:
            # `relu` turns negative labels into 0 so the loss can be evaluated everywhere; those positions are
            # zeroed out by the mask right after.
            per_position_loss = cross_entropy(tf.nn.relu(labels), logits)
            keep_mask = tf.cast(labels != -100, dtype=per_position_loss.dtype)
            mean_loss = tf.reduce_sum(per_position_loss * keep_mask) / tf.reduce_sum(keep_mask)
            return tf.reshape(mean_loss, (1,))

        # Legacy behaviour: flatten, drop the ignored positions, and return the unreduced losses.
        active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100)
        flat_logits = tf.reshape(logits, (-1, shape_list(logits)[2]))
        kept_logits = tf.boolean_mask(flat_logits, active_loss)
        kept_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
        return cross_entropy(kept_labels, kept_logits)
|
|
|
|
class TFQuestionAnsweringLoss:
    """
    Loss function suitable for question answering.
    """

    def hf_compute_loss(self, labels, logits):
        """
        Average the unreduced cross-entropy of the start and end predictions.

        `labels` is indexed with the keys `"start_position"` and `"end_position"`; `logits[0]` and `logits[1]` are
        the matching logits.
        """
        cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        loss_at_start = cross_entropy(labels["start_position"], logits[0])
        loss_at_end = cross_entropy(labels["end_position"], logits[1])
        summed = loss_at_start + loss_at_end
        return summed / 2.0
|
|
|
|
class TFTokenClassificationLoss:
    """
    Loss function suitable for token classification.

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation. In the
    non-legacy code path every negative label is masked out.

    </Tip>
    """

    def hf_compute_loss(self, labels, logits):
        """
        Compute the sparse categorical cross-entropy between `labels` and `logits`, skipping masked labels.

        With `self.config.tf_legacy_loss` set, the unreduced losses of the kept positions are returned; otherwise the
        masked mean is returned with shape `(1,)`.
        """
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        # The deprecation notice for `-1` labels is only evaluated when running eagerly.
        if tf.executing_eagerly():
            if tf.math.reduce_any(labels == -1):
                tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")

        if self.config.tf_legacy_loss:
            # Legacy path: when any label equals -1, -1 is used as the ignore value; otherwise -100 is.
            if tf.math.reduce_any(labels == -1):
                tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
                active_loss = tf.reshape(labels, (-1,)) != -1
            else:
                active_loss = tf.reshape(labels, (-1,)) != -100
            # Flatten logits and labels, keeping only the active positions.
            reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
            labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)

            return loss_fn(labels, reduced_logits)

        # `relu` maps negative labels to 0 so the loss can be evaluated at every position.
        unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
        # Only positions with a non-negative label contribute (this covers both -100 and -1).
        loss_mask = tf.cast(labels >= 0, dtype=unmasked_loss.dtype)
        # Masked mean: sum of the kept losses divided by the number of kept positions.
        # NOTE(review): the denominator is zero when every label is negative.
        masked_loss = unmasked_loss * loss_mask
        reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
        return tf.reshape(reduced_masked_loss, (1,))
|
|
|
|
class TFSequenceClassificationLoss:
    """
    Loss function suitable for sequence classification.
    """

    def hf_compute_loss(self, labels, logits):
        """
        Return the unreduced loss between `labels` and `logits`.

        Mean squared error is used when `logits` is rank 1 or its second dimension is 1; sparse categorical
        cross-entropy (from logits) is used otherwise.
        """
        single_output = logits.shape.rank == 1 or logits.shape[1] == 1
        if not single_output:
            cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction=tf.keras.losses.Reduction.NONE
            )
            return cross_entropy(labels, logits)

        squared_error = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
        if labels.shape.rank == 1:
            # Rank-1 labels get a trailing axis before being handed to the loss.
            labels = tf.expand_dims(labels, axis=-1)
        return squared_error(labels, logits)
|
|
|
|
class TFMultipleChoiceLoss:
    """Loss function suitable for multiple choice tasks."""

    def hf_compute_loss(self, labels, logits):
        """Return the unreduced sparse categorical cross-entropy (from logits) of `logits` against `labels`."""
        cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
            reduction=tf.keras.losses.Reduction.NONE,
        )
        per_example_loss = cross_entropy(labels, logits)
        return per_example_loss
|
|
|
|
class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss):
    """
    Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.

    This class defines nothing of its own: `hf_compute_loss` is inherited unchanged from
    [`TFCausalLanguageModelingLoss`].

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.

    </Tip>
    """
|
|
|
|
class TFNextSentencePredictionLoss:
    """
    Loss function suitable for next sentence prediction (NSP), that is, the task of guessing the next sentence.

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.

    </Tip>
    """

    def hf_compute_loss(self, labels, logits):
        """
        Compute the unreduced sparse categorical cross-entropy between `labels` and `logits`.

        With `self.config.tf_legacy_loss` set, positions labelled -100 are removed before the loss is computed.
        Otherwise the loss is computed everywhere and multiplied by 0 where the label is -100.
        """
        cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )

        if not self.config.tf_legacy_loss:
            # `relu` maps negative labels to 0 so the loss can be evaluated; masked positions are zeroed afterwards.
            raw_loss = cross_entropy(y_true=tf.nn.relu(labels), y_pred=logits)
            keep_mask = tf.cast(labels != -100, dtype=raw_loss.dtype)
            return raw_loss * keep_mask

        # Legacy behaviour: flatten to (N,) labels and (N, 2) logits and keep only the active rows.
        active_rows = tf.not_equal(tf.reshape(labels, (-1,)), -100)
        kept_logits = tf.boolean_mask(tf.reshape(logits, (-1, 2)), active_rows)
        kept_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_rows)
        return cross_entropy(kept_labels, kept_logits)
|
|
|
|
def booleans_processing(config, **kwargs):
    """
    Resolve the boolean call options of a model against its configuration.

    Args:
        config ([`PretrainedConfig`]):
            The config of the running model; supplies the fallback for every option passed as `None`.
        **kwargs:
            The boolean parameters. `output_hidden_states` and `return_dict` are required; `output_attentions` and
            `use_cache` are only processed when present.

    Returns:
        A dictionary with the proper values for each boolean
    """

    def from_call_or_config(option):
        # An explicit (non-`None`) call argument wins; the config attribute is only read as a fallback.
        explicit = kwargs[option]
        if explicit is None:
            return getattr(config, option)
        return explicit

    resolved = {}

    if "output_attentions" in kwargs:
        resolved["output_attentions"] = from_call_or_config("output_attentions")

    for option in ("output_hidden_states", "return_dict"):
        resolved[option] = from_call_or_config(option)

    if "use_cache" in kwargs:
        explicit_cache = kwargs["use_cache"]
        # Unlike the other options, a config without `use_cache` falls back to `None`.
        resolved["use_cache"] = getattr(config, "use_cache", None) if explicit_cache is None else explicit_cache

    return resolved
|
|
|
|
def unpack_inputs(func):
    """
    Decorator that routes every input of a Keras layer call through `input_processing` and then invokes the wrapped
    function with keyword arguments only. This enables downstream use of the inputs by their variable name, even if
    they arrive packed as a dictionary in the first input (common case in Keras).

    Args:
        func (`callable`):
            The callable function of the TensorFlow model.

    Returns:
        A callable that wraps the original `func` with the behavior described above.
    """
    original_signature = inspect.signature(func)

    @functools.wraps(func)
    def run_call_with_unpacked_inputs(self, *args, **kwargs):
        declared_parameters = original_signature.parameters

        # Split the keyword arguments: those named in the signature versus the leftovers, which are forwarded
        # together under the `kwargs_call` key.
        fn_args_and_kwargs = {}
        kwargs_call = {}
        for key, val in kwargs.items():
            if key in declared_parameters:
                fn_args_and_kwargs[key] = val
            else:
                kwargs_call[key] = val
        fn_args_and_kwargs["kwargs_call"] = kwargs_call

        # Positional arguments are matched to the function's variable names, skipping the first one (`self`).
        for positional_name, positional_value in zip(func.__code__.co_varnames[1:], args):
            fn_args_and_kwargs[positional_name] = positional_value

        # Classes whose name contains "EncoderDecoder" are processed without a config.
        config = None if "EncoderDecoder" in self.__class__.__name__ else self.config

        unpacked_inputs = input_processing(func, config, **fn_args_and_kwargs)
        return func(self, **unpacked_inputs)

    # Expose the signature of the wrapped function on the wrapper itself.
    run_call_with_unpacked_inputs.__signature__ = original_signature

    return run_call_with_unpacked_inputs
|
|
|
|
def input_processing(func, config, **kwargs):
    """
    Process the input of each TensorFlow model including the booleans. In case of a list of symbolic inputs, each input
    has to be named accordingly to the parameters name, i.e. `input_ids = tf.keras.Input(shape=(128,), dtype='int32',
    name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training.

    Args:
        func (`callable`):
            The callable function of the TensorFlow model.
        config ([`PretrainedConfig`]):
            The config of the running model. When `None`, the boolean options are left untouched.
        **kwargs:
            The inputs of the model. Must contain a `kwargs_call` entry holding the keyword arguments that are not
            part of the signature of `func`.

    Returns:
        `dict`: A dictionary mapping parameter names to the processed input values, with `int64` tensors/arrays cast
        to `int32` and, when `config` is given, the boolean options resolved through `booleans_processing`.

    Raises:
        ValueError: If an input has a type that is not allowed, or if `kwargs_call` is non-empty while `func` has no
        `kwargs` parameter.
    """
    signature = dict(inspect.signature(func).parameters)
    # `kwargs` and `self` are dropped so that `parameter_names` only lists the actual inputs.
    has_kwargs = bool(signature.pop("kwargs", None))
    signature.pop("self", None)
    parameter_names = list(signature.keys())
    # The first remaining parameter is treated as the main input; it may itself pack several inputs.
    main_input_name = parameter_names[0]
    main_input = kwargs.pop(main_input_name, None)
    output = {}
    allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray)

    # Deprecated keyword `inputs` is mapped to `input_ids`.
    if "inputs" in kwargs["kwargs_call"]:
        warnings.warn(
            "The `inputs` argument is deprecated and will be removed in a future version, use `input_ids` instead.",
            FutureWarning,
        )

        output["input_ids"] = kwargs["kwargs_call"].pop("inputs")

    # Deprecated keyword `decoder_cached_states` is mapped to `past_key_values`.
    if "decoder_cached_states" in kwargs["kwargs_call"]:
        warnings.warn(
            "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use"
            " `past_key_values` instead.",
            FutureWarning,
        )
        output["past_key_values"] = kwargs["kwargs_call"].pop("decoder_cached_states")

    # `past` and `past_key_values` are translated into whichever of the two names `func` declares.
    if "past" in kwargs["kwargs_call"] and "past_key_values" in parameter_names:
        warnings.warn(
            "The `past` argument is deprecated and will be removed in a future version, use `past_key_values`"
            " instead.",
            FutureWarning,
        )
        kwargs["past_key_values"] = kwargs["kwargs_call"].pop("past")
    elif "past_key_values" in kwargs["kwargs_call"] and "past" in parameter_names:
        kwargs["past"] = kwargs["kwargs_call"].pop("past_key_values")

    # Whatever is left in `kwargs_call` is only accepted when `func` declares `kwargs`.
    if has_kwargs:
        output["kwargs"] = kwargs.pop("kwargs_call", {})
    else:
        if len(kwargs["kwargs_call"]) > 0:
            raise ValueError(
                "The following keyword arguments are not supported by this model:"
                f" {list(kwargs['kwargs_call'].keys())}."
            )
        kwargs.pop("kwargs_call")

    # Type-check the remaining named inputs.
    for k, v in kwargs.items():
        if isinstance(v, allowed_types) or tf.is_tensor(v) or v is None:
            output[k] = v
        else:
            raise ValueError(f"Data of type {type(v)} is not allowed only {allowed_types} is accepted for {k}.")

    if isinstance(main_input, (tuple, list)):
        # A sequence is unpacked positionally, except for symbolic tensors whose name matches a parameter.
        for i, input in enumerate(main_input):
            if is_tf_symbolic_tensor(input):
                # Only the part of the tensor name before the first ":" is compared with the parameter names.
                tensor_name = input.name.split(":")[0]

                if tensor_name in parameter_names:
                    output[tensor_name] = input
                else:
                    output[parameter_names[i]] = input
            elif isinstance(input, allowed_types) or input is None:
                output[parameter_names[i]] = input
            else:
                raise ValueError(
                    f"Data of type {type(input)} is not allowed only {allowed_types} is accepted for"
                    f" {parameter_names[i]}."
                )
    elif isinstance(main_input, Mapping):
        # NOTE(review): the two `pop` calls below mutate the mapping received from the caller.
        if "inputs" in main_input:
            warnings.warn(
                "The `inputs` argument is deprecated and will be removed in a future version, use `input_ids`"
                " instead.",
                FutureWarning,
            )

            output["input_ids"] = main_input.pop("inputs")

        if "decoder_cached_states" in main_input:
            warnings.warn(
                "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use"
                " `past_key_values` instead.",
                FutureWarning,
            )
            output["past_key_values"] = main_input.pop("decoder_cached_states")

        for k, v in dict(main_input).items():
            if isinstance(v, allowed_types) or v is None:
                output[k] = v
            elif k not in parameter_names and "args" not in parameter_names:
                # Unknown keys with a disallowed type are skipped with a warning instead of raising.
                logger.warning(
                    f"The parameter {k} does not belongs to the parameter list {parameter_names} and will be ignored."
                )
                continue
            else:
                raise ValueError(f"Data of type {type(v)} is not allowed only {allowed_types} is accepted for {k}.")
    else:
        if tf.is_tensor(main_input) or main_input is None:
            output[main_input_name] = main_input
        else:
            raise ValueError(
                f"Data of type {type(main_input)} is not allowed only {allowed_types} is accepted for"
                f" {main_input_name}."
            )

    # Fill every parameter that has not been set yet with its default value from the signature.
    for name in parameter_names:
        if name not in list(output.keys()) and name != "args":
            output[name] = kwargs.pop(name, signature[name].default)

    # An `args` entry is re-filed under the symbolic tensor's own name, or under `input_ids` otherwise, and then
    # removed.
    if "args" in output:
        if output["args"] is not None and is_tf_symbolic_tensor(output["args"]):
            tensor_name = output["args"].name.split(":")[0]
            output[tensor_name] = output["args"]
        else:
            output["input_ids"] = output["args"]

        del output["args"]

    if "kwargs" in output:
        del output["kwargs"]

    # Cast int64 tensors and arrays down to int32; everything else is passed through unchanged.
    cast_output = {}
    for key, val in output.items():
        if isinstance(val, tf.Tensor) and val.dtype == tf.int64:
            cast_output[key] = tf.cast(val, tf.int32)
        elif isinstance(val, np.ndarray) and val.dtype == np.int64:
            cast_output[key] = val.astype(np.int32)
        else:
            cast_output[key] = val

    output = cast_output
    del cast_output

    # Resolve the boolean options against the config, when one is provided.
    if config is not None:
        boolean_dict = {
            k: v
            for k, v in output.items()
            if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"]
        }

        output.update(
            booleans_processing(
                config=config,
                **boolean_dict,
            )
        )

    return output
|
|
|
|
def dtype_byte_size(dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.

    The size is read from the run of digits that ends `dtype.name`; `tf.bool` is special-cased to `1 / 8`.

    Example:

    ```py
    >>> dtype_byte_size(tf.float32)
    4
    ```

    Raises:
        ValueError: If `dtype.name` does not end with a non-digit character followed by digits.
    """
    if dtype == tf.bool:
        return 1 / 8

    trailing_bits = re.search(r"[^\d](\d+)$", dtype.name)
    if trailing_bits is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")

    return int(trailing_bits.group(1)) // 8
|
|
|
|
def format_weight_name(name, _prefix=None):
    """
    Normalize a weight name.

    The first `/`-separated component is removed unless the name contains `"model."` or has no `/` at all. When
    `_prefix` is given, it is then prepended, followed by `/`.

    Args:
        name (`str`): The weight name to normalize.
        _prefix (`str`, *optional*): Prefix to put in front of the normalized name.

    Returns:
        `str`: The normalized name.
    """
    if "model." not in name:
        _, separator, remainder = name.partition("/")
        if separator:
            # There was at least one "/": keep everything after the first one.
            name = remainder

    if _prefix is None:
        return name
    return _prefix + "/" + name
|
|
|
|
def tf_shard_checkpoint(weights, max_shard_size="10GB"):
    """
    Splits a list of model weights in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
    given size.

    The sub-checkpoints are determined by iterating through `weights` in order, so there is no optimization made to
    make each sub-checkpoint as close as possible to the maximum size passed. For example, if the limit is 10GB and we
    have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], [6+2+2GB] and not
    [6+2+2GB], [6+2GB], [6GB].

    <Tip warning={true}>

    If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
    have a size greater than `max_shard_size`.

    </Tip>

    Args:
        weights (iterable of weight variables):
            The weights of a model to save. Each item must expose `numpy()`, `dtype` and `name`.
        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
            The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
            (like `"5MB"`).

    Returns:
        A tuple `(shards, index)`. `shards` maps a file name to the list of weights stored in it. `index` is `None`
        when everything fits in a single shard; otherwise it is a dict with a `"metadata"` entry (holding
        `"total_size"`) and a `"weight_map"` entry mapping each weight name to its shard file.
    """
    max_shard_size = convert_file_size_to_int(max_shard_size)

    sharded_state_dicts = []
    current_block = []
    current_block_size = 0
    total_size = 0

    for item in weights:
        weight_size = item.numpy().size * dtype_byte_size(item.dtype)

        # Close the current shard when this weight would tip it over the limit -- but only if the shard already
        # holds something. Without that guard, a first weight larger than `max_shard_size` would leave an empty
        # shard (and an empty file) behind.
        if current_block and current_block_size + weight_size > max_shard_size:
            sharded_state_dicts.append(current_block)
            current_block = []
            current_block_size = 0

        current_block.append(item)
        current_block_size += weight_size
        total_size += weight_size

    # Add the last block
    sharded_state_dicts.append(current_block)

    # If we only have one shard, we return it under the regular weights file name and without an index
    if len(sharded_state_dicts) == 1:
        return {TF2_WEIGHTS_NAME: sharded_state_dicts[0]}, None

    # Otherwise, build the shard file names and the index
    shard_count = len(sharded_state_dicts)
    weight_map = {}
    shards = {}
    for idx, shard in enumerate(sharded_state_dicts):
        shard_file = TF2_WEIGHTS_NAME.replace(".h5", f"-{idx+1:05d}-of-{shard_count:05d}.h5")
        shards[shard_file] = shard
        for weight in shard:
            weight_map[weight.name] = shard_file

    metadata = {"total_size": total_size}
    index = {"metadata": metadata, "weight_map": weight_map}
    return shards, index
|
|
|
|
def load_tf_sharded_weights(model, shard_files, ignore_mismatched_sizes=False, strict=False, _prefix=None):
    """
    This is the same as `load_tf_weights` but for a sharded checkpoint. Detect missing and unexpected layers and load
    the TF weights from the shard file accordingly to their names and shapes.

    Each checkpoint shard is loaded one by one through `load_tf_shard`, and a garbage collection is triggered after
    every shard.

    Args:
        model (`tf.keras.models.Model`): The model in which to load the checkpoint.
        shard_files (list of `str` or `os.PathLike`): A list containing the sharded checkpoint names.
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
            Whether or not to ignore the mismatch between the sizes
        strict (`bool`, *optional*, defaults to `False`):
            Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.
        _prefix (`str`, *optional*):
            Prefix stripped from the beginning of the model weight names before they are matched.

    Returns:
        Three sets, one for the missing layers, another one for the unexpected layers, and a last one for the
        mismatched layers.

    Raises:
        RuntimeError: If `strict` is set and there are missing or unexpected keys.
    """
    # Keys collected across all the shards
    unexpected_keys = set()
    saved_keys = set()
    mismatched_keys = set()

    # Map the normalized name of every model weight to its index in `model.weights`, so that each shard can look
    # up the variable matching a saved name.
    model_keys = set()
    model_layer_map = {}
    for i, k in enumerate(model.weights):
        layer_name = k.name
        if _prefix is not None and layer_name.startswith(_prefix):
            layer_name = layer_name[len(_prefix) :]
            layer_name = layer_name.lstrip("/")
        # Drop the first name component, unless the name contains "model." or has a single component.
        if not ("model." in layer_name or len(layer_name.split("/")) == 1):
            layer_name = "/".join(layer_name.split("/")[1:])
        model_keys.add(layer_name)
        model_layer_map[layer_name] = i

    for shard_file in shard_files:
        saved_weight_names_set, unexpected_keys_set, mismatched_keys_set = load_tf_shard(
            model,
            model_layer_map,
            shard_file,
            ignore_mismatched_sizes=ignore_mismatched_sizes,
            _prefix=_prefix,
        )
        saved_keys.update(saved_weight_names_set)
        unexpected_keys.update(unexpected_keys_set)
        mismatched_keys.update(mismatched_keys_set)
        gc.collect()

    missing_keys = model_keys - saved_keys
    if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0):
        error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}"
        if len(missing_keys) > 0:
            # Sorted so that the message is deterministic.
            str_missing_keys = ",".join([f'"{k}"' for k in sorted(missing_keys)])
            error_message += f"\nMissing key(s): {str_missing_keys}."
        if len(unexpected_keys) > 0:
            str_unexpected_keys = ",".join([f'"{k}"' for k in sorted(unexpected_keys)])
            # This line reports the *unexpected* keys; it used to be mislabelled as "Missing key(s)".
            error_message += f"\nUnexpected key(s): {str_unexpected_keys}."
        raise RuntimeError(error_message)

    return missing_keys, unexpected_keys, mismatched_keys
|
|
|
|
def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
    """
    Loads a shard from a sharded checkpoint file. Handles the missing keys and unexpected keys.

    Args:
        model (`tf.keras.models.Model`): Model in which the weights are loaded
        model_layer_map (`Dict`): A dictionary mapping the layer name to the index of the layer in the model.
        resolved_archive_file (`str`): Path to the checkpoint file from which the weights will be loaded
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys
        _prefix (`str`, *optional*): Accepted for signature parity with the other loaders; not used here.

    Returns:
        Three sets, in this order: the names of the weights found in the shard file, the names found in the shard
        file that are not in `model_layer_map` (unexpected), and the `(name, saved_shape, model_shape)` tuples of the
        weights whose shapes could not be reconciled (mismatched).

    Raises:
        OSError: If the shard could not be read or loaded. Any error raised while loading (including a shape
            mismatch when `ignore_mismatched_sizes` is `False`) is reported this way.
    """
    saved_weight_names_set = set()
    mismatched_keys = set()
    unexpected_keys = set()

    try:
        with h5py.File(resolved_archive_file, "r") as sharded_checkpoint_file:
            # Retrieve the name of each weight stored in the H5 file
            saved_h5_model_layers_name = set(load_attributes_from_hdf5_group(sharded_checkpoint_file, "layer_names"))
            # Filled with (weight_object, value_of_weight) pairs and applied in one batch below
            weight_value_tuples = []

            for layer_name in saved_h5_model_layers_name:
                saved_weight_names_set.add(layer_name)

                if layer_name not in model_layer_map:
                    unexpected_keys.add(layer_name)
                    continue

                symbolic_weight = model.weights[model_layer_map[layer_name]]
                # Only the arrays that are actually going to be assigned are materialized, instead of keeping every
                # array of the shard alive in a side dictionary.
                saved_weight_value = np.asarray(sharded_checkpoint_file[layer_name])

                if K.int_shape(symbolic_weight) != saved_weight_value.shape:
                    # Shapes differ: try to reshape the saved value to the shape of the model weight.
                    try:
                        array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight))
                    except ValueError:
                        if ignore_mismatched_sizes:
                            mismatched_keys.add((layer_name, saved_weight_value.shape, K.int_shape(symbolic_weight)))
                            continue
                        raise
                else:
                    array = saved_weight_value

                weight_value_tuples.append((symbolic_weight, array))

        K.batch_set_value(weight_value_tuples)

        return saved_weight_names_set, unexpected_keys, mismatched_keys

    except Exception as e:
        try:
            with open(resolved_archive_file) as f:
                # A text file starting with "version" is taken to be a git-lfs pointer. Only the first few
                # characters are needed for that check, so the whole shard is not read into memory.
                if f.read(len("version")).startswith("version"):
                    raise OSError(
                        "You seem to have cloned a repository without having git-lfs installed. Please install "
                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
                        "you cloned."
                    ) from e
                else:
                    raise ValueError(
                        f"Unable to locate the file {resolved_archive_file} which is necessary to load this pretrained"
                        " model. Make sure you have saved the model properly."
                    ) from e
        except (UnicodeDecodeError, ValueError):
            # Chain to the original failure so that its traceback is not lost.
            raise OSError(
                f"Unable to load weights from TF checkpoint file for '{resolved_archive_file}' "
                f"at '{resolved_archive_file}'. "
                "If you tried to load a TF model from a sharded checkpoint, you should try converting the model "
                "by loading it in pytorch and saving it localy. A convertion script should be realeased soon."
            ) from e
|
|
|
|
def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
    """
    Load TF weights from a checkpoint file into `model`, dispatching on the file extension: files ending in
    `.safetensors` go through `load_tf_weights_from_safetensors`, everything else through `load_tf_weights_from_h5`.

    Args:
        model (`tf.keras.models.Model`):
            The model to load the weights into.
        resolved_archive_file (`str`):
            The location of the checkpoint file.
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
            Whether or not to ignore weights with shapes that don't match between the checkpoint of the model.
        _prefix (`str`, *optional*):
            Forwarded unchanged to the selected loader.

    Returns:
        Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the
        mismatched layers.
    """
    is_safetensors_file = resolved_archive_file.endswith(".safetensors")
    loader = load_tf_weights_from_safetensors if is_safetensors_file else load_tf_weights_from_h5
    return loader(
        model,
        resolved_archive_file,
        ignore_mismatched_sizes=ignore_mismatched_sizes,
        _prefix=_prefix,
    )
|
|
|
|
def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
    """
    Load the weights stored in an H5 file into `model`, matching them by layer name and then by weight name.

    Args:
        model (`tf.keras.models.Model`): The model to load the weights into.
        resolved_archive_file (`str`): The location of the H5 file.
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
            When set, weights whose saved value cannot be reshaped to the shape of the model weight are skipped and
            reported instead of raising.
        _prefix (`str`, *optional*): Prefix taken into account when comparing saved and model weight names.

    Returns:
        Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the
        mismatched layers (as `(name, saved_shape, model_shape)` tuples).
    """
    mismatched_layers = []

    # Read the H5 file
    with h5py.File(resolved_archive_file, "r") as sharded_checkpoint_file:
        # Names of the layers stored in the H5 file
        saved_h5_model_layers_name = set(load_attributes_from_hdf5_group(sharded_checkpoint_file, "layer_names"))

        # Layers of the model that are absent from the file
        missing_layers = list({layer.name for layer in model.layers} - saved_h5_model_layers_name)

        # Layers of the file that are absent from the model
        unexpected_layers = list(saved_h5_model_layers_name - {layer.name for layer in model.layers})
        saved_weight_names_set = set()
        symbolic_weights_names = set()
        # Filled with (weight_object, value_of_weight) pairs and applied in one batch after the file is closed
        weight_value_tuples = []

        for layer in model.layers:
            # Only layers present in both the model and the file are processed
            if layer.name in saved_h5_model_layers_name:
                h5_layer_object = sharded_checkpoint_file[layer.name]
                symbolic_weights = layer.trainable_weights + layer.non_trainable_weights
                # {normalized weight name: value} for the weights saved under this layer
                saved_weights = {}

                for weight_name in load_attributes_from_hdf5_group(h5_layer_object, "weight_names"):
                    # The first name component is dropped
                    name = "/".join(weight_name.split("/")[1:])

                    if _prefix is not None:
                        name = _prefix + "/" + name

                    saved_weights[name] = np.asarray(h5_layer_object[weight_name])

                    # Recorded for the missing/unexpected computation at the end
                    saved_weight_names_set.add(name)

                # Compare every weight of the model layer with the weights read from the file
                for symbolic_weight in symbolic_weights:
                    if _prefix is not None:
                        # With a prefix, the component located right after the prefix components is removed
                        delimeter = len(_prefix.split("/"))
                        symbolic_weight_name = "/".join(
                            symbolic_weight.name.split("/")[:delimeter]
                            + symbolic_weight.name.split("/")[delimeter + 1 :]
                        )
                    else:
                        # Without a prefix, the first component is removed
                        symbolic_weight_name = "/".join(symbolic_weight.name.split("/")[1:])

                    # `None` when the file holds no weight under that name
                    saved_weight_value = saved_weights.get(symbolic_weight_name, None)

                    # Fallback lookup: a name ending in "embeddings:0" is retried with that suffix replaced by
                    # "weight:0"
                    if saved_weight_value is None and symbolic_weight_name.endswith("embeddings:0"):
                        symbolic_weight_name = symbolic_weight_name[:-12] + "weight:0"
                        saved_weight_value = saved_weights.get(symbolic_weight_name, None)

                    # Recorded for the missing/unexpected computation at the end
                    symbolic_weights_names.add(symbolic_weight_name)

                    if saved_weight_value is not None:
                        if K.int_shape(symbolic_weight) != saved_weight_value.shape:
                            # Shapes differ: try to reshape the saved value to the shape of the model weight
                            try:
                                array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight))
                            except ValueError as e:
                                if ignore_mismatched_sizes:
                                    mismatched_layers.append(
                                        (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight))
                                    )
                                    continue
                                else:
                                    raise e
                        else:
                            array = saved_weight_value

                        weight_value_tuples.append((symbolic_weight, array))

    # Load all the weights
    K.batch_set_value(weight_value_tuples)

    # Add the weight-level differences to the layer-level ones
    missing_layers.extend(list(symbolic_weights_names - saved_weight_names_set))
    unexpected_layers.extend(list(saved_weight_names_set - symbolic_weights_names))

    return missing_layers, unexpected_layers, mismatched_layers
|
|
|
|
def load_tf_weights_from_safetensors(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
    """
    Load the weights stored in a safetensors file into `model`, matching them by (normalized) weight name.

    Args:
        model (`tf.keras.models.Model`): The model to load the weights into.
        resolved_archive_file (`str`): The location of the safetensors file.
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
            When set, weights whose saved value cannot be reshaped to the shape of the model weight are skipped and
            reported instead of raising.
        _prefix (`str`, *optional*): Prefix passed to `format_weight_name` when normalizing the model weight names.

    Returns:
        Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the
        mismatched layers (as `(name, saved_shape, model_shape)` tuples).
    """
    # Read the safetensors file
    with safe_open(resolved_archive_file, framework="tf") as safetensors_archive:
        mismatched_layers = []
        # Sets give constant-time membership tests in the loop below (a list made the loop quadratic).
        weight_names = {format_weight_name(w.name, _prefix=_prefix) for w in model.weights}
        loaded_weight_names = set(safetensors_archive.keys())
        # Weights of the model that are absent from the file
        missing_layers = list(weight_names - loaded_weight_names)
        # Weights of the file that are absent from the model
        unexpected_layers = list(loaded_weight_names - weight_names)

        for weight in model.weights:
            weight_name = format_weight_name(weight.name, _prefix=_prefix)
            if weight_name not in loaded_weight_names:
                continue

            weight_value = safetensors_archive.get_tensor(weight_name)
            if K.int_shape(weight) != weight_value.shape:
                # Shapes differ: try to reshape the saved value to the shape of the model weight.
                try:
                    weight_value = tf.reshape(weight_value, K.int_shape(weight))
                except (ValueError, tf.errors.InvalidArgumentError):
                    # In eager mode an incompatible `tf.reshape` raises `InvalidArgumentError` rather than
                    # `ValueError`; both must be handled for `ignore_mismatched_sizes` to take effect.
                    if ignore_mismatched_sizes:
                        mismatched_layers.append((weight_name, weight_value.shape, K.int_shape(weight)))
                        continue
                    raise

            K.set_value(weight, weight_value)
    return missing_layers, unexpected_layers, mismatched_layers
|
|
|
|
def init_copy_embeddings(old_embeddings, new_num_tokens):
    r"""
    Resize an embedding matrix along its first axis and report which rows hold copied values.

    When `new_num_tokens` is larger than the current number of rows, the matrix is padded at the end with rows of
    -1; otherwise it is truncated to its first `new_num_tokens` rows. The returned mask has one row per resulting
    embedding row and is `True` where the row was taken from `old_embeddings`. Example:

        - if new_num_tokens=5 and old_num_tokens=4 and old_embeddings=[w1,w2,w3,w4]

            - mask=[True,True,True,True,False] and current_weights=[w1,w2,w3,w4,-1]
        - if new_num_tokens=4 and old_num_tokens=5 and old_embeddings=[w1,w2,w3,w4,w5]

            - mask=[True,True,True,True] and current_weights=[w1,w2,w3,w4]

    Returns:
        `(mask, current_weights)`
    """
    old_num_tokens, old_embedding_dim = shape_list(old_embeddings)
    size_diff = new_num_tokens - old_num_tokens

    if tf.math.greater(size_diff, 0):
        # Growing: append `size_diff` rows, leave the embedding dimension untouched.
        row_padding = tf.convert_to_tensor([[0, size_diff], [0, 0]])
        current_weights = tf.pad(old_embeddings.value(), row_padding, constant_values=-1)
        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
        copied_rows = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True)
        mask = tf.pad(copied_rows, row_padding, constant_values=False)
    else:
        # Shrinking (or same size): keep only the leading `new_num_tokens` rows.
        slice_begin = tf.convert_to_tensor([0, 0])
        slice_size = tf.convert_to_tensor([new_num_tokens, old_embedding_dim])
        current_weights = tf.slice(old_embeddings.value(), slice_begin, slice_size)
        mask = tf.fill(tf.convert_to_tensor([new_num_tokens, 1]), True)

    return mask, current_weights
|
|
|
|
| class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin): |
| r""" |
| Base class for all TF models. |
| |
| [`TFPreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading, |
| downloading and saving models as well as a few methods common to all models to: |
| |
| - resize the input embeddings, |
| - prune heads in the self-attention heads. |
| |
| Class attributes (overridden by derived classes): |
| |
| - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class |
| for this model architecture. |
| - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived |
| classes of the same architecture adding modules on top of the base model. |
| - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP |
| models, `pixel_values` for vision models and `input_values` for speech models). |
| """ |
# Class-level defaults; per the class docstring, `config_class`, `base_model_prefix` and `main_input_name`
# are meant to be overridden by derived classes.
config_class = None
base_model_prefix = ""
main_input_name = "input_ids"
_auto_class = None
# Set to True/False by `compile()` depending on whether the model's internal loss is used.
_using_dummy_loss = None
# Optional explicit label-name -> output-name mapping, consulted first by `get_label_to_output_name_mapping()`.
_label_to_output_map = None

# NOTE(review): presumably patterns of weight names whose absence from a checkpoint should not be
# reported when loading - confirm against the loading code, which is outside this section.
_keys_to_ignore_on_load_missing = None
# NOTE(review): presumably the counterpart for checkpoint entries the model does not use - confirm
# against the loading code.
_keys_to_ignore_on_load_unexpected = None
_requires_load_weight_prefix = False
|
|
@property
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
    """
    Dummy inputs to build the network.

    One tensor is created for every entry of `self.input_signature`, with unknown dimensions replaced by small
    placeholder sizes.

    Returns:
        `Dict[str, tf.Tensor]`: The dummy inputs.

    Raises:
        NotImplementedError: If the config enables cross-attention, `call` accepts `encoder_hidden_states`, and the
            main input is not `input_ids` (the encoder state shape cannot be inferred in that case).
    """
    dummies = {}
    for key, spec in self.input_signature.items():
        # Unknown dimensions become 2, except an unknown leading dimension, which becomes 1.
        dummy_shape = [2 if dim is None else dim for dim in spec.shape]
        if spec.shape[0] is None:
            dummy_shape[0] = 1
        filler = tf.ones(shape=dummy_shape, dtype=spec.dtype)
        # `token_type_ids` is the one input that is zero-filled rather than one-filled.
        dummies[key] = tf.zeros_like(filler) if key == "token_type_ids" else filler

    needs_encoder_states = (
        self.config.add_cross_attention
        and "encoder_hidden_states" in inspect.signature(self.call).parameters
        and "encoder_hidden_states" not in dummies
    )
    if needs_encoder_states:
        if self.main_input_name != "input_ids":
            raise NotImplementedError(
                "Model has cross-attention but we couldn't infer the shape for the encoder hidden states. Please manually override dummy_inputs!"
            )
        dummies["encoder_hidden_states"] = tf.ones(
            shape=(1, 2, self.config.hidden_size), dtype=tf.float32, name="encoder_hidden_states"
        )
    return dummies
|
|
@property
def framework(self) -> str:
    """
    :str: Framework identifier of this model; always the string `"tf"` (TensorFlow).
    """
    framework_name = "tf"
    return framework_name
|
|
def build(self, input_shape=None):
    """
    Build the model by running it once on `self.dummy_inputs`.

    Nothing beyond setting `self.built` happens when the model is already built or when this is invoked from
    inside a Keras call context. `input_shape` is accepted for signature compatibility and is not used.
    """
    call_context = get_call_context_function()
    skip_forward_pass = self.built or call_context().in_call
    # `built` is flagged before the forward pass below, in both cases.
    self.built = True
    if skip_forward_pass:
        return
    # Register the save spec from the input signature, then run a forward pass on the dummy inputs.
    self._set_save_spec(self.input_signature)
    self(self.dummy_inputs, training=False)
|
|
def __init__(self, config, *inputs, **kwargs):
    """
    Store the configuration on the model.

    Args:
        config ([`PretrainedConfig`]): Model configuration; any other type is rejected.
        *inputs, **kwargs: Forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If `config` is not a `PretrainedConfig` instance.
    """
    super().__init__(*inputs, **kwargs)
    if not isinstance(config, PretrainedConfig):
        class_name = self.__class__.__name__
        raise ValueError(
            f"Parameter config in `{class_name}(config)` should be an instance of class "
            "`PretrainedConfig`. To create a model from a pretrained model use "
            f"`model = {class_name}.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )
    # Keep the configuration and the identifier it carries
    self.config = config
    self.name_or_path = config.name_or_path
    # Only models that can generate get a generation config derived from the model config
    if self.can_generate():
        self.generation_config = GenerationConfig.from_model_config(config)
    else:
        self.generation_config = None
|
|
def get_config(self):
    """Return the model configuration as a dictionary, as produced by `self.config.to_dict()`."""
    config_dict = self.config.to_dict()
    return config_dict
|
|
@classmethod
def from_config(cls, config, **kwargs):
    """
    Instantiate the model from a configuration.

    Args:
        config: Either a `PretrainedConfig` instance or a dictionary accepted by `cls.config_class.from_dict`.
        **kwargs: Passed to `_from_config` when `config` is a `PretrainedConfig`, otherwise to
            `config_class.from_dict`.
    """
    if not isinstance(config, PretrainedConfig):
        # Dictionary input: the kwargs go to the config constructor, not to the model.
        return cls._from_config(cls.config_class.from_dict(config, **kwargs))
    return cls._from_config(config, **kwargs)
|
|
@classmethod
def _from_config(cls, config, **kwargs):
    """
    Construct the model as `cls(config, **kwargs)`.

    All context managers that the model should be initialized under go here.
    """
    model = cls(config, **kwargs)
    return model
|
|
def get_head_mask(self, head_mask: tf.Tensor | None, num_hidden_layers: int) -> tf.Tensor:
    """
    Prepare the head mask if needed.

    Args:
        head_mask (`tf.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
            The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
        num_hidden_layers (`int`):
            The number of hidden layers in the model.

    Returns:
        `tf.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with
        `[None]` for each layer.
    """
    # No mask supplied: one `None` placeholder per layer.
    if head_mask is None:
        return [None] * num_hidden_layers
    return self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
|
|
| def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): |
| """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" |
| if head_mask.shape.rank == 1: |
| head_mask = head_mask[None, None, :, None, None] |
| head_mask = tf.repeat(head_mask, repeats=num_hidden_layers, axis=0) |
| elif head_mask.shape.rank == 2: |
| head_mask = head_mask[:, None, :, None, None] |
| assert head_mask.shape.rank == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" |
| head_mask = tf.cast(head_mask, tf.float32) |
| return head_mask |
|
|
@tf.function
def serving(self, inputs):
    """
    Method used for serving the model. Does not have a specific signature, but will be specialized as concrete
    functions when saving with `save_pretrained`.

    Args:
        inputs (`Dict[str, tf.Tensor]`):
            The input of the saved model as a dictionary of tensors.
    """
    # Run the model, then post-process its output for serving.
    return self.serving_output(self.call(inputs))
|
|
def eager_serving(self, inputs):
    """
    Method used for serving the model. This method is deprecated, and will be removed.

    Emits a `FutureWarning`, then returns `self.serving_output(self.call(inputs))`.

    Args:
        inputs (`Dict[str, tf.Tensor]`):
            The input of the saved model as a dictionary of tensors.
    """
    deprecation_message = (
        "The function `eager_serving` is deprecated and will be removed in version 4.32.0 of Transformers"
    )
    warnings.warn(deprecation_message, FutureWarning)
    return self.serving_output(self.call(inputs))
|
|
@property
def input_signature(self) -> Dict[str, tf.TensorSpec]:
    """
    This property should return a dict mapping input names to tf.TensorSpec objects, representing the expected
    shape and dtype for model inputs. It is used for both serving and for generating the dummy inputs used to build
    the model.

    The default implementation inspects the parameters of `self.call`: text inputs get fully dynamic `tf.int32`
    specs, `pixel_values` gets a `tf.float32` spec whose channel and spatial sizes are read from the (vision)
    config.

    Raises:
        NotImplementedError: If the channel count or image size cannot be read from the config, or if the model
            takes `input_features` (such models must override this property).
    """
    model_inputs = list(inspect.signature(self.call).parameters)
    sig = {}

    if "input_ids" in model_inputs:
        # Classes named `...ForMultipleChoice` get one extra axis on their text inputs.
        text_dims = 3 if self.__class__.__name__.endswith("ForMultipleChoice") else 2
        text_input_names = (
            "input_ids",
            "attention_mask",
            "token_type_ids",
            "decoder_input_ids",
            "decoder_attention_mask",
        )
        for input_name in text_input_names:
            if input_name in model_inputs:
                sig[input_name] = tf.TensorSpec([None] * text_dims, tf.int32, name=input_name)

    if "pixel_values" in model_inputs:
        pixel_values_shape = [None, None, None, None]
        # Prefer a nested `vision_config` when the config has one.
        vision_config = getattr(self.config, "vision_config", self.config)
        if not hasattr(vision_config, "num_channels"):
            raise NotImplementedError(
                "Could not infer number of channels from config, please override input_signature to specify input shapes."
            )
        pixel_values_shape[1] = vision_config.num_channels
        if hasattr(vision_config, "image_size"):
            pixel_values_shape[2] = pixel_values_shape[3] = vision_config.image_size
        elif hasattr(vision_config, "input_size"):
            pixel_values_shape[2] = pixel_values_shape[3] = vision_config.input_size
        else:
            raise NotImplementedError(
                "Could not infer input image shape from config, please override input_signature to specify input shapes."
            )
        sig["pixel_values"] = tf.TensorSpec(pixel_values_shape, tf.float32, name="pixel_values")

    if "input_features" in model_inputs:
        raise NotImplementedError("Audio models need a manually defined input_signature")
    return sig
|
|
def serving_output(self, output):
    """
    Prepare the output of the saved model. Can be overridden if specific serving modifications are required.

    Anything that is not a `ModelOutput` is returned untouched. Otherwise, entries whose matching config flag
    (`output_hidden_states`, `output_attentions`, `use_cache`, `add_cross_attention`) is off are set to `None`, and
    remaining tuple/list entries are converted to a single tensor when that conversion succeeds.
    """
    if not isinstance(output, ModelOutput):
        return output

    config = self.config
    for key in output:
        # Same checks, in the same order, as an if/elif chain: the first matching rule decides.
        discard = (
            (key.endswith("hidden_states") and not getattr(config, "output_hidden_states", False))
            or (key.endswith("attentions") and not getattr(config, "output_attentions", False))
            or (key == "past_key_values" and not getattr(config, "use_cache", False))
            or (
                key == "cross_attentions"
                and not (getattr(config, "output_attentions", False) and getattr(config, "add_cross_attention", False))
            )
        )
        if discard:
            output[key] = None
        if isinstance(output[key], (tuple, list)):
            try:
                output[key] = tf.convert_to_tensor(output[key])
            except (ValueError, tf.errors.InvalidArgumentError):
                # Best effort: entries that cannot be packed into one tensor are left as they are.
                pass
    return output
|
|
@classmethod
def can_generate(cls) -> bool:
    """
    Returns whether this model can generate sequences with `.generate()`.

    The check is textual: the model is considered unable to generate when the string representations of both
    `cls.prepare_inputs_for_generation` and `cls.generate` still mention `GenerationMixin`.

    Returns:
        `bool`: Whether this model can generate sequences with `.generate()`.
    """
    marker = "GenerationMixin"
    uses_mixin_defaults = marker in str(cls.prepare_inputs_for_generation) and marker in str(cls.generate)
    return not uses_mixin_defaults
|
|
def get_input_embeddings(self) -> tf.keras.layers.Layer:
    """
    Returns the model's input embeddings layer.

    The call is delegated to the attribute named by `base_model_prefix`.

    Returns:
        Whatever the base model's `get_input_embeddings()` returns.

    Raises:
        NotImplementedError: If the model has no attribute named `base_model_prefix`.
    """
    main_layer = getattr(self, self.base_model_prefix, self)

    # `self` is the sentinel for "no separate base model to delegate to".
    if main_layer is self:
        raise NotImplementedError
    return main_layer.get_input_embeddings()
|
|
| def _save_checkpoint(self, checkpoint_dir, epoch): |
| if not os.path.isdir(checkpoint_dir): |
| os.mkdir(checkpoint_dir) |
| |
| |
| |
| weights_path = os.path.join(checkpoint_dir, "weights.h5") |
| self.save_weights(weights_path) |
| extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()} |
| extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle") |
| with open(extra_data_path, "wb") as f: |
| pickle.dump(extra_data, f) |
|
|
def load_repo_checkpoint(self, repo_path_or_name):
    """
    Loads a saved checkpoint (model weights and optimizer state) from a repo. Returns the current epoch count when
    the checkpoint was made.

    Args:
        repo_path_or_name (`str`):
            Either a path to a local folder, or the name of a repository on the Hub. A repository is cloned into
            a local folder named after the last `/`-separated component of the name.

    Returns:
        `dict`: A dictionary holding the `"epoch"` value stored in the checkpoint.

    Raises:
        RuntimeError: If the model has no optimizer attached.
        FileNotFoundError: If a checkpoint file is missing from the repo or from the local folder.
    """
    if getattr(self, "optimizer", None) is None:
        raise RuntimeError(
            "Checkpoint loading failed as no optimizer is attached to the model. "
            "This is most likely caused by the model not being compiled."
        )

    if not os.path.isdir(repo_path_or_name):
        # Not a local folder: treat it as a Hub repo and verify its file listing before cloning.
        repo_files = list_repo_files(repo_path_or_name)
        for file in ("checkpoint/weights.h5", "checkpoint/extra_data.pickle"):
            if file not in repo_files:
                raise FileNotFoundError(f"Repo {repo_path_or_name} does not contain checkpoint file {file}!")
        repo = Repository(repo_path_or_name.split("/")[-1], clone_from=repo_path_or_name)
        local_dir = repo.local_dir
    else:
        local_dir = repo_path_or_name

    # Both files must exist locally before anything is loaded.
    checkpoint_dir = os.path.join(local_dir, "checkpoint")
    weights_file = os.path.join(checkpoint_dir, "weights.h5")
    extra_data_file = os.path.join(checkpoint_dir, "extra_data.pickle")
    if not os.path.isfile(weights_file):
        raise FileNotFoundError(f"Could not find checkpoint file weights.h5 in repo {repo_path_or_name}!")
    if not os.path.isfile(extra_data_file):
        raise FileNotFoundError(f"Could not find checkpoint file extra_data.pickle in repo {repo_path_or_name}!")

    # Restore the weights first, then the optimizer state.
    self.load_weights(weights_file)
    # NOTE(review): unpickling can execute arbitrary code; only load checkpoints from repos you trust.
    with open(extra_data_file, "rb") as f:
        extra_data = pickle.load(f)
    self.optimizer.set_weights(extra_data["optimizer_state"])

    # Only the epoch is handed back to the caller.
    return {"epoch": extra_data["epoch"]}
|
|
def prepare_tf_dataset(
    self,
    dataset: "datasets.Dataset",
    batch_size: int = 8,
    shuffle: bool = True,
    tokenizer: Optional["PreTrainedTokenizerBase"] = None,
    collate_fn: Optional[Callable] = None,
    collate_fn_args: Optional[Dict[str, Any]] = None,
    drop_remainder: Optional[bool] = None,
    prefetch: bool = True,
):
    """
    Wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset` with collation and batching. This method is
    designed to create a "ready-to-use" dataset that can be passed directly to Keras methods like `fit()` without
    further modification. The method will drop columns from the dataset if they don't match input names for the
    model. If you want to specify the column names to return rather than using the names that match this model, we
    recommend using `Dataset.to_tf_dataset()` instead.

    Args:
        dataset ([`~datasets.Dataset`]):
            The dataset to be wrapped as a `tf.data.Dataset`.
        batch_size (`int`, defaults to 8):
            The size of batches to return.
        shuffle (`bool`, defaults to `True`):
            Whether to return samples from the dataset in random order. Usually `True` for training datasets and
            `False` for validation/test datasets.
        tokenizer ([`PreTrainedTokenizerBase`], *optional*):
            A `PreTrainedTokenizer` that will be used to pad samples to create batches. Has no effect if a specific
            `collate_fn` is passed instead.
        collate_fn (`Callable`, *optional*):
            A function that collates samples from the dataset into a single batch. Defaults to
            `DefaultDataCollator` if no `tokenizer` is supplied or `DataCollatorWithPadding` if a `tokenizer` is
            passed.
        collate_fn_args (`Dict[str, Any]`, *optional*):
            A dict of arguments to pass to the `collate_fn` alongside the list of samples.
        drop_remainder (`bool`, *optional*):
            Whether to drop the final batch, if the batch_size does not evenly divide the dataset length. Defaults
            to the same setting as `shuffle`.
        prefetch (`bool`, defaults to `True`):
            Whether to add prefetching to the end of the `tf.data` pipeline. This is almost always beneficial for
            performance, but can be disabled in edge cases.

    Returns:
        `Dataset`: A `tf.data.Dataset` which is ready to pass to the Keras API.

    Raises:
        TypeError: If `dataset` is not a `datasets.Dataset`.
    """
    requires_backends(self, ["datasets"])
    import datasets

    if collate_fn is None:
        # Pad with the tokenizer when one is given, otherwise use the plain default collator.
        collate_fn = (
            DefaultDataCollator(return_tensors="np")
            if tokenizer is None
            else DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="np")
        )
    collate_fn_args = {} if collate_fn_args is None else collate_fn_args

    if not isinstance(dataset, datasets.Dataset):
        raise TypeError("Dataset argument should be a datasets.Dataset!")
    model_inputs = list(inspect.signature(self.call).parameters)
    model_labels = find_labels(self.__class__)

    signature_kwargs = {"batch_size": None, "collate_fn": collate_fn, "collate_fn_args": collate_fn_args}
    if "cols_to_retain" in inspect.signature(dataset._get_output_signature).parameters:
        # The dataset can filter columns itself.
        signature_kwargs["cols_to_retain"] = model_inputs
    else:
        # No `cols_to_retain` support: drop every column that is neither a model input nor one of the two
        # label column names kept here.
        kept_label_names = ("label_ids", "label")
        unwanted_columns = [
            feature
            for feature in dataset.features
            if feature not in model_inputs and feature not in kept_label_names
        ]
        dataset = dataset.remove_columns(unwanted_columns)
    output_signature, _ = dataset._get_output_signature(dataset, **signature_kwargs)

    output_columns = list(output_signature.keys())
    feature_cols = [col for col in output_columns if col in model_inputs and col not in model_labels]
    label_cols = [col for col in output_columns if col in model_labels]

    # A single column is passed by name rather than as a one-element list.
    if len(feature_cols) == 1:
        feature_cols = feature_cols[0]
    if len(label_cols) == 1:
        label_cols = label_cols[0]

    if drop_remainder is None:
        drop_remainder = shuffle
    return dataset.to_tf_dataset(
        columns=feature_cols,
        label_cols=label_cols,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_remainder=drop_remainder,
        collate_fn=collate_fn,
        collate_fn_args=collate_fn_args,
        prefetch=prefetch,
    )
|
|
def compile(
    self,
    optimizer="rmsprop",
    loss="auto_with_warning",
    metrics=None,
    loss_weights=None,
    weighted_metrics=None,
    run_eagerly=None,
    steps_per_execution=None,
    **kwargs,
):
    """
    This is a thin wrapper that sets the model's loss output head as the loss if the user does not specify a loss
    function themselves.

    `loss="auto"` selects the internal loss silently; the default `"auto_with_warning"` (and `"passthrough"`)
    does the same after logging an informational message. Any other value is forwarded as the loss. All other
    arguments are passed through to `tf.keras.Model.compile`.
    """
    if loss in ("auto_with_warning", "passthrough"):
        logger.info(
            "No loss specified in compile() - the model's internal loss computation will be used as the "
            "loss. Don't panic - this is a common way to train TensorFlow models in Transformers! "
            "To disable this behaviour please pass a loss argument, or explicitly pass "
            "`loss=None` if you do not want your model to compute a loss. You can also specify `loss='auto'` to "
            "get the internal loss without printing this info string."
        )
        loss = "auto"
    if loss == "auto":
        loss, self._using_dummy_loss = dummy_loss, True
    else:
        self._using_dummy_loss = False

    # The parent `compile` is inspected for the name under which it accepts the steps-per-execution setting.
    parent_args = list(inspect.signature(tf.keras.Model.compile).parameters.keys())
    if "steps_per_execution" in parent_args:
        steps_argument_name = "steps_per_execution"
    else:
        steps_argument_name = "experimental_steps_per_execution"
    super().compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
        loss_weights=loss_weights,
        weighted_metrics=weighted_metrics,
        run_eagerly=run_eagerly,
        **{steps_argument_name: steps_per_execution},
        **kwargs,
    )
|
|
def compute_loss(self, *args, **kwargs):
    """
    Dispatch to the Keras `compute_loss` when `tf.keras.Model` defines one; otherwise emit a `FutureWarning` and
    fall back to `hf_compute_loss`. Arguments are forwarded unchanged in both cases.
    """
    if not hasattr(tf.keras.Model, "compute_loss"):
        warnings.warn(
            "The old compute_loss method is deprecated as it conflicts with the Keras compute_loss "
            "method added in TF 2.8. If you want the original HF compute_loss, please call "
            "hf_compute_loss() instead. From TF versions >= 2.8, or Transformers versions >= 5, "
            "calling compute_loss() will get the Keras method instead.",
            FutureWarning,
        )
        return self.hf_compute_loss(*args, **kwargs)
    # The parent class provides its own compute_loss: use it.
    return super().compute_loss(*args, **kwargs)
|
|
def get_label_to_output_name_mapping(self):
    """
    Return a dict mapping label argument names to the names of the model outputs they are compared against.

    An explicit `self._label_to_output_map` wins. Otherwise the mapping is chosen from a fixed table, keyed on
    which label argument appears in the signature of `self.call`; an empty dict is returned when none matches.
    """
    arg_names = list(inspect.signature(self.call).parameters)
    if self._label_to_output_map is not None:
        return self._label_to_output_map

    # Checked in order; the first marker argument found in `call` decides.
    known_mappings = (
        ("start_positions", {"start_positions": "start_logits", "end_positions": "end_logits"}),
        ("sentence_order_label", {"labels": "prediction_logits", "sentence_order_label": "sop_logits"}),
        ("next_sentence_label", {"labels": "prediction_logits", "next_sentence_label": "seq_relationship_logits"}),
        ("mc_labels", {"labels": "logits", "mc_labels": "mc_logits"}),
    )
    for marker_argument, mapping in known_mappings:
        if marker_argument in arg_names:
            return mapping
    return {}
|
|
def train_step(self, data):
    """
    A modification of Keras's default `train_step` that correctly handles matching outputs to labels for our models
    and supports directly training on the loss output head. In addition, it ensures input keys are copied to the
    labels where appropriate. It will also copy label keys into the input dict when using the dummy loss, to ensure
    that they are available to the model during the forward pass.

    Args:
        data: A batch, unpacked into `(x, y, sample_weight)` with `tf.keras.utils.unpack_x_y_sample_weight`.

    Returns:
        `dict`: Metric results keyed by metric name, collected from `self.metrics`.

    Raises:
        ValueError: If no labels were passed, none can be found in the inputs, and the dummy loss is not in use.
    """

    # Label/output renamings come from `get_label_to_output_name_mapping()`; the inverse map is used when
    # labels are keyed by output name.
    arg_names = list(inspect.signature(self.call).parameters)
    label_kwargs = find_labels(self.__class__)
    label_to_output = self.get_label_to_output_name_mapping()
    output_to_label = {val: key for key, val in label_to_output.items()}
    if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"):
        # Only applied for TF versions below 2.11.0, and only when not using the dummy loss
        data = expand_1d(data)
    x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
    # Work on shallow copies of dict inputs: they are modified below, and the caller's dicts
    # should stay untouched.
    if isinstance(x, dict):
        x = x.copy()
    if isinstance(y, dict):
        y = y.copy()

    # With the dummy loss the model computes the loss itself, so separately supplied labels are copied
    # into the model inputs.
    if self._using_dummy_loss and y is not None:
        # A single label tensor and a single label argument: pass y under that argument name
        if len(label_kwargs) == 1 and isinstance(y, tf.Tensor):
            if isinstance(x, tf.Tensor):
                x = {arg_names[0]: x}
            label_kwarg = next(iter(label_kwargs))
            if label_kwarg not in x:
                x[label_kwarg] = y
        # A dict of labels: copy each entry under its argument name
        elif isinstance(y, dict):
            if isinstance(x, tf.Tensor):
                x = {arg_names[0]: x}
            for key, val in y.items():
                if key in arg_names and key not in x:
                    x[key] = val
                # NOTE(review): this guard tests `key`, not the renamed label `output_to_label[key]`,
                # so an entry already present under the label name is overwritten - confirm intent.
                elif output_to_label.get(key, None) in arg_names and key not in x:
                    x[output_to_label[key]] = val
    if y is None:
        # No separate labels: pull the label arguments out of the inputs
        y = {key: val for key, val in x.items() if key in label_kwargs}
        if not y and not self._using_dummy_loss:
            raise ValueError("Could not find label column(s) in input dict and no separate labels were provided!")

    if isinstance(y, dict):
        # Rename labels to the names of the outputs they are matched against
        y = {label_to_output.get(key, key): val for key, val in y.items()}

    # Forward pass, recorded on the tape.
    with tf.GradientTape() as tape:
        if self._using_dummy_loss and "return_loss" in arg_names:
            y_pred = self(x, training=True, return_loss=True)
        else:
            y_pred = self(x, training=True)
        if self._using_dummy_loss:
            # The model's own loss is passed as both "target" and "prediction"
            loss = self.compiled_loss(y_pred.loss, y_pred.loss, sample_weight, regularization_losses=self.losses)
        else:
            loss = None

        # Match model outputs to labels. Four label layouts are handled: a one-entry dict, a larger
        # dict, a tuple/list, and anything else (treated as a single tensor).
        if isinstance(y, dict) and len(y) == 1:
            if list(y.keys())[0] in y_pred.keys():
                y_pred = y_pred[list(y.keys())[0]]
            elif list(y_pred.keys())[0] == "loss":
                y_pred = y_pred[1]
            else:
                y_pred = y_pred[0]
            _, y = y.popitem()
        elif isinstance(y, dict):
            # Several labels in a dict: keep the outputs whose names appear in y
            y_pred = {key: val for key, val in y_pred.items() if key in y}
        elif isinstance(y, tuple) or isinstance(y, list):
            # Labels as a sequence: match outputs by position, skipping a leading loss entry
            if list(y_pred.keys())[0] == "loss":
                y_pred = y_pred.to_tuple()[1:]
            else:
                y_pred = y_pred.to_tuple()
            y_pred = y_pred[: len(y)]
        else:
            # Single label: use the first output that is not the loss
            if list(y_pred.keys())[0] == "loss":
                y_pred = y_pred[1]
            else:
                y_pred = y_pred[0]

        if loss is None:
            loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)

    # Backward pass and parameter update.
    self.optimizer.minimize(loss, self.trainable_variables, tape=tape)

    self.compiled_metrics.update_state(y, y_pred, sample_weight)
    # Collect the metric results; metrics returning a dict are merged in
    return_metrics = {}
    for metric in self.metrics:
        result = metric.result()
        if isinstance(result, dict):
            return_metrics.update(result)
        else:
            return_metrics[metric.name] = result
    return return_metrics
|
|
def test_step(self, data):
    """
    A modification of Keras's default `test_step` that correctly handles matching outputs to labels for our models
    and supports directly evaluating on the loss output head. In addition, it ensures input keys are copied to the
    labels where appropriate. It will also copy label keys into the input dict when using the dummy loss, to ensure
    that they are available to the model during the forward pass.

    Unlike `train_step`, the forward pass runs with `training=False` and no optimizer update is performed.

    Args:
        data: A batch, unpacked into `(x, y, sample_weight)` with `tf.keras.utils.unpack_x_y_sample_weight`.

    Returns:
        `dict`: Metric results keyed by metric name, collected from `self.metrics`.

    Raises:
        ValueError: If no labels were passed, none can be found in the inputs, and the dummy loss is not in use.
    """
    # Label/output renamings come from `get_label_to_output_name_mapping()`; the inverse map is used when
    # labels are keyed by output name. `arg_names` is computed once and reused throughout.
    arg_names = list(inspect.signature(self.call).parameters)
    label_kwargs = find_labels(self.__class__)
    label_to_output = self.get_label_to_output_name_mapping()
    output_to_label = {val: key for key, val in label_to_output.items()}
    if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"):
        # Only applied for TF versions below 2.11.0, and only when not using the dummy loss
        data = expand_1d(data)
    x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
    # Work on shallow copies of dict inputs: they are modified below, and the caller's dicts
    # should stay untouched.
    if isinstance(x, dict):
        x = x.copy()
    if isinstance(y, dict):
        y = y.copy()

    # With the dummy loss the model computes the loss itself, so separately supplied labels are copied
    # into the model inputs.
    if self._using_dummy_loss and y is not None:
        # A single label tensor and a single label argument: pass y under that argument name
        if len(label_kwargs) == 1 and isinstance(y, tf.Tensor):
            if isinstance(x, tf.Tensor):
                x = {arg_names[0]: x}
            label_kwarg = next(iter(label_kwargs))
            if label_kwarg not in x:
                x[label_kwarg] = y
        # A dict of labels: copy each entry under its argument name
        elif isinstance(y, dict):
            if isinstance(x, tf.Tensor):
                x = {arg_names[0]: x}
            for key, val in y.items():
                if key in arg_names and key not in x:
                    x[key] = val
                elif output_to_label.get(key, None) in arg_names and key not in x:
                    x[output_to_label[key]] = val
    if y is None:
        # No separate labels: pull the label arguments out of the inputs
        y = {key: val for key, val in x.items() if key in label_kwargs}
        if not y and not self._using_dummy_loss:
            raise ValueError("Could not find label column(s) in input dict and no separate labels were provided!")

    if isinstance(y, dict):
        # Rename labels to the names of the outputs they are matched against
        y = {label_to_output.get(key, key): val for key, val in y.items()}

    # Forward pass in inference mode.
    if self._using_dummy_loss and "return_loss" in arg_names:
        y_pred = self(x, return_loss=True, training=False)
    else:
        y_pred = self(x, training=False)
    if self._using_dummy_loss:
        # The model's own loss is passed as both "target" and "prediction"
        loss = self.compiled_loss(y_pred.loss, y_pred.loss, sample_weight, regularization_losses=self.losses)
    else:
        loss = None

    # Match model outputs to labels. Four label layouts are handled: a one-entry dict, a larger dict,
    # a tuple/list, and anything else (treated as a single tensor).
    if isinstance(y, dict) and len(y) == 1:
        if list(y.keys())[0] in y_pred.keys():
            y_pred = y_pred[list(y.keys())[0]]
        elif list(y_pred.keys())[0] == "loss":
            y_pred = y_pred[1]
        else:
            y_pred = y_pred[0]
        _, y = y.popitem()
    elif isinstance(y, dict):
        # Several labels in a dict: keep the outputs whose names appear in y
        y_pred = {key: val for key, val in y_pred.items() if key in y}
    elif isinstance(y, tuple) or isinstance(y, list):
        # Labels as a sequence: match outputs by position, skipping a leading loss entry
        if list(y_pred.keys())[0] == "loss":
            y_pred = y_pred.to_tuple()[1:]
        else:
            y_pred = y_pred.to_tuple()
        y_pred = y_pred[: len(y)]
    else:
        # Single label: use the first output that is not the loss
        if list(y_pred.keys())[0] == "loss":
            y_pred = y_pred[1]
        else:
            y_pred = y_pred[0]

    if loss is None:
        # The returned value is not used afterwards; the call is kept as in the training step.
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)

    self.compiled_metrics.update_state(y, y_pred, sample_weight)
    # Collect the metric results; metrics returning a dict are merged in
    return_metrics = {}
    for metric in self.metrics:
        result = metric.result()
        if isinstance(result, dict):
            return_metrics.update(result)
        else:
            return_metrics[metric.name] = result
    return return_metrics
|
|
def create_model_card(
    self,
    output_dir,
    model_name: str,
    language: Optional[str] = None,
    license: Optional[str] = None,
    tags: Optional[Union[str, List[str]]] = None,
    finetuned_from: Optional[str] = None,
    tasks: Optional[Union[str, List[str]]] = None,
    dataset_tags: Optional[Union[str, List[str]]] = None,
    dataset: Optional[Union[str, List[str]]] = None,
    dataset_args: Optional[Union[str, List[str]]] = None,
):
    """
    Creates a draft of a model card from the Keras training history stored on the model (`self.history`) and
    writes it to `README.md` inside `output_dir`.

    Args:
        output_dir (`str` or `os.PathLike`):
            The folder in which to create the model card.
        model_name (`str`):
            The name of the model.
        language (`str`, *optional*):
            The language of the model (if applicable)
        license (`str`, *optional*):
            The license of the model.
        tags (`str` or `List[str]`, *optional*):
            Some tags to be included in the metadata of the model card.
        finetuned_from (`str`, *optional*):
            The name of the model used to fine-tune this one (if applicable).
        tasks (`str` or `List[str]`, *optional*):
            One or several task identifiers, to be included in the metadata of the model card.
        dataset_tags (`str` or `List[str]`, *optional*):
            One or several dataset tags, to be included in the metadata of the model card.
        dataset (`str` or `List[str]`, *optional*):
            One or several dataset identifiers, to be included in the metadata of the model card.
        dataset_args (`str` or `List[str]`, *optional*):
            One or several dataset arguments, to be included in the metadata of the model card.
    """
    # Imported inside the method rather than at module level.
    from .modelcard import TrainingSummary

    training_summary = TrainingSummary.from_keras(
        self,
        keras_history=self.history,
        language=language,
        license=license,
        tags=tags,
        model_name=model_name,
        finetuned_from=finetuned_from,
        tasks=tasks,
        dataset_tags=dataset_tags,
        dataset=dataset,
        dataset_args=dataset_args,
    )
    model_card = training_summary.to_model_card()
    # Write as UTF-8 explicitly: the platform default encoding may be unable to represent non-ASCII
    # characters in the card text.
    with open(os.path.join(output_dir, "README.md"), "w", encoding="utf-8") as f:
        f.write(model_card)
|
|
def set_input_embeddings(self, value):
    """
    Set model's input embeddings

    The call is delegated to the attribute named by `base_model_prefix`. If that fails with an
    `AttributeError`, the model is built and the call is retried once.

    Args:
        value (`tf.Variable`):
            The new input embedding weights, passed to the base model's `set_input_embeddings`.

    Raises:
        NotImplementedError: If the model has no attribute named `base_model_prefix` (or it is `None`).
    """
    # A default is required here: without it a missing attribute raises AttributeError and the
    # `None` check below can never trigger.
    main_layer = getattr(self, self.base_model_prefix, None)

    if main_layer is None:
        raise NotImplementedError("The model does not implements the base_model_prefix attribute.")

    try:
        main_layer.set_input_embeddings(value)
    except AttributeError:
        logger.info("Building the model")
        self.build()
        main_layer.set_input_embeddings(value)
|
def get_output_embeddings(self) -> Union[None, tf.keras.layers.Layer]:
    """
    Returns the model's output embeddings

    The call is delegated to the LM head returned by `get_lm_head()`. If the head raises an `AttributeError`,
    the model is built and the call is retried once.

    Returns:
        Whatever the LM head's `get_output_embeddings()` returns, or `None` if the model has no LM head.
    """
    lm_head = self.get_lm_head()
    if lm_head is None:
        return None

    try:
        return lm_head.get_output_embeddings()
    except AttributeError:
        logger.info("Building the model")
        self.build()

        # Retry on the head object itself; calling the head (`lm_head()`) would run the layer with no inputs.
        return lm_head.get_output_embeddings()
|
|
def set_output_embeddings(self, value):
    """
    Replace the output embeddings held by the model's LM head.

    Nothing happens when `get_lm_head()` returns `None`.

    Args:
        value (`tf.Variable`):
            The new weights mapping hidden states to vocabulary.
    """
    if self.get_lm_head() is None:
        return

    head = self.get_lm_head()
    try:
        head.set_output_embeddings(value)
    except AttributeError:
        # First attempt failed on an attribute lookup: build the model, then try exactly once more.
        logger.info("Building the model")
        self.build()
        head.set_output_embeddings(value)
|
|
def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
    """
    Deprecated accessor for the layer that handles a bias attribute in case the model has an LM head with weights
    tied to the embeddings. Emits a `FutureWarning` and forwards to `get_lm_head`.

    Return:
        `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
    """
    deprecation_notice = "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead."
    warnings.warn(deprecation_notice, FutureWarning)

    lm_head = self.get_lm_head()
    return lm_head
|
|
def get_prefix_bias_name(self) -> Union[None, str]:
    """
    Deprecated: formerly the concatenated _prefix name of the bias from the model name to the parent layer.

    The method now only emits a `FutureWarning` pointing callers at `get_bias`.

    Return:
        `None`, unconditionally.
    """
    warnings.warn(
        message="The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.",
        category=FutureWarning,
    )
    return None
|
|
def get_bias(self) -> Union[None, Dict[str, tf.Variable]]:
    """
    Fetch the biases attached to the LM head, keyed by the name of the bias attribute.

    Return:
        `Dict[str, tf.Variable]`: Whatever the LM head's `get_bias()` returns, None if not an LM model.
    """
    if self.get_lm_head() is None:
        return None

    head = self.get_lm_head()
    try:
        bias = head.get_bias()
    except AttributeError:
        # First attempt failed on an attribute lookup: build the model and ask again.
        self.build()
        bias = head.get_bias()
    return bias
|
|
def set_bias(self, value):
    """
    Hand a new set of biases to the LM head.

    Nothing happens when `get_lm_head()` returns `None`.

    Args:
        value (`Dict[tf.Variable]`):
            All the new bias attached to an LM head.
    """
    if self.get_lm_head() is None:
        return

    head = self.get_lm_head()
    try:
        head.set_bias(value)
    except AttributeError:
        # First attempt failed on an attribute lookup: build the model, then try exactly once more.
        self.build()
        head.set_bias(value)
|
|
def get_lm_head(self) -> Union[None, tf.keras.layers.Layer]:
    """
    The LM Head layer. This method must be overwritten by all the models that have a lm head.

    This base implementation always returns `None`, which the other LM-head helpers in this class treat as
    "the model has no LM head".

    Return:
        `tf.keras.layers.Layer`: The LM head layer if the model has one, None if not.
    """
    return None
|
|
def resize_token_embeddings(
    self, new_num_tokens: Optional[int] = None
) -> Union[tf.keras.layers.Embedding, tf.Variable]:
    """
    Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

    Models whose input embeddings are a `tf.keras.layers.Embedding` are delegated to
    `_v2_resized_token_embeddings`; every other model goes through `_resize_token_embeddings`.

    Arguments:
        new_num_tokens (`int`, *optional*):
            The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
            vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
            returns a pointer to the input tokens without doing anything.

    Return:
        `tf.Variable` or `tf.keras.layers.Embedding`: Pointer to the input tokens of the model.
    """
    # Keras embedding layers are handled entirely by the v2 code path.
    uses_keras_embedding = isinstance(self.get_input_embeddings(), tf.keras.layers.Embedding)
    if uses_keras_embedding:
        return self._v2_resized_token_embeddings(new_num_tokens)

    # Nothing to resize: hand back the current embedding weights untouched.
    size_is_unchanged = new_num_tokens is None or new_num_tokens == self.config.vocab_size
    if size_is_unchanged:
        return self._get_word_embedding_weight(self.get_input_embeddings())

    resized_embeddings = self._resize_token_embeddings(new_num_tokens)

    # Keep the config in sync with the new vocabulary size.
    self.config.vocab_size = new_num_tokens

    return resized_embeddings
|
|
def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> tf.keras.layers.Embedding:
    """
    Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

    Arguments:
        new_num_tokens (`int`, *optional*):
            The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
            vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
            returns a pointer to the input tokens without doing anything.

    Return:
        `tf.keras.layers.Embedding`: Pointer to the input tokens of the model.
    """
    if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
        # Nothing to resize: return the current input embeddings as they are.
        embedding_layer = self.get_input_embeddings()
    else:
        embedding_layer = self._v2_resize_token_embeddings(new_num_tokens)
        # Keep the config in sync with the new vocabulary size.
        self.config.vocab_size = new_num_tokens

    return embedding_layer
|
|
def _get_word_embedding_weight(model, embedding_layer):
    """
    Resolve the weights behind `embedding_layer`.

    A `tf.Tensor` is returned as is. Otherwise the `weight` attribute and then the `decoder` attribute of
    `embedding_layer` are tried; if both are missing or `None`, `model.build()` is called and the same two
    attributes are tried a second time.

    Return:
        The first non-`None` value found, or `None` if nothing is found even after building.
    """
    # The argument already holds the weights themselves.
    if isinstance(embedding_layer, tf.Tensor):
        return embedding_layer

    def _first_available():
        # Lookup order matters: `weight` takes precedence over `decoder`.
        for attr_name in ("weight", "decoder"):
            found = getattr(embedding_layer, attr_name, None)
            if found is not None:
                return found
        return None

    embeds = _first_available()
    if embeds is not None:
        return embeds

    # Neither attribute was available: build the model, then look one more time.
    model.build()

    return _first_available()
|
|
def _resize_token_embeddings(self, new_num_tokens):
    """
    Resize the input embedding weights and, when present, the LM head bias and output embeddings.

    The resized input embeddings are computed first but installed last, after the bias and the output
    embeddings have been updated.

    Return:
        The value of `self.get_input_embeddings()` after the resize.
    """
    current_weight = self._get_word_embedding_weight(self.get_input_embeddings())
    resized_weight = self._get_resized_embeddings(current_weight, new_num_tokens)

    # Resize the LM head bias, if the model has one.
    if self.get_bias() is not None:
        self.set_bias(self._get_resized_lm_head_bias(self.get_bias(), new_num_tokens))

    # Resize the output embeddings (decoder), if the model has them.
    if self.get_output_embeddings() is not None:
        decoder_weight = self._get_word_embedding_weight(self.get_output_embeddings())
        self.set_output_embeddings(self._get_resized_lm_head_decoder(decoder_weight, new_num_tokens))

    self.set_input_embeddings(resized_weight)

    return self.get_input_embeddings()
|
|
def _v2_resize_token_embeddings(self, new_num_tokens):
    """
    Resize the input embedding layer and, when present, the LM head bias and untied output embeddings.

    Unlike `_resize_token_embeddings`, the resized input embeddings are installed first.

    Return:
        The value of `self.get_input_embeddings()` after the resize.
    """
    resized_layer = self._v2_get_resized_embeddings(self.get_input_embeddings(), new_num_tokens)
    self.set_input_embeddings(resized_layer)

    # Resize the LM head bias, if the model has one.
    if self.get_bias() is not None:
        self.set_bias(self._v2_get_resized_lm_head_bias(self.get_bias(), new_num_tokens))

    # Output embeddings are resized only when they exist and do not compare equal to the (already resized)
    # input embeddings.
    embeddings_are_tied = self.get_input_embeddings() == self.get_output_embeddings()
    if self.get_output_embeddings() is not None and not embeddings_are_tied:
        decoder_weight = self._get_word_embedding_weight(self.get_output_embeddings())
        # NOTE(review): this reuses the non-v2 decoder resize helper — confirm that is intended.
        self.set_output_embeddings(self._get_resized_lm_head_decoder(decoder_weight, new_num_tokens))

    return self.get_input_embeddings()
|
|
def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens):
    """
    Build resized bias variables from the old ones, keeping the old values for the tokens that remain.

    Growing the token axis leaves the added entries at the zero initialization of the new variable; shrinking
    drops entries from the end.

    Args:
        old_lm_head_bias (`Dict[str, tf.Variable]`):
            Old lm head biases to be resized, keyed by attribute name.
        new_num_tokens (`int`):
            New number of tokens along the last axis of each bias.

    Return:
        `Dict[str, tf.Variable]`: The newly created bias variables, under the same keys as the input.
    """
    resized_biases = {}

    for bias_name, old_bias in old_lm_head_bias.items():
        # Rank-1 biases only carry a token axis; otherwise the shape is unpacked as (first_dim, num_tokens).
        if tf.rank(old_bias) == 1:
            leading_dim, old_num_tokens = None, shape_list(old_bias)[0]
        else:
            leading_dim, old_num_tokens = shape_list(old_bias)
        is_vector = leading_dim is None

        token_delta = new_num_tokens - old_num_tokens
        target_shape = [new_num_tokens] if is_vector else [leading_dim, new_num_tokens]

        if tf.math.greater(token_delta, 0):
            # Growing: pad the old values on the token axis (the -1 filler is masked out below) and build a
            # mask that is True only over the positions holding old values.
            paddings = [[0, token_delta]] if is_vector else [[0, 0], [0, token_delta]]
            kept_values = tf.pad(old_bias.value(), tf.convert_to_tensor(paddings), constant_values=-1)
            copied_tokens = min(old_num_tokens, new_num_tokens)
            keep_shape = [copied_tokens] if is_vector else [1, copied_tokens]
            keep_mask = tf.pad(
                tf.fill(tf.convert_to_tensor(keep_shape), True),
                tf.convert_to_tensor(paddings),
                constant_values=False,
            )
        else:
            # Same size or shrinking: take the leading slice of the old values and keep all of it.
            origin = [0] if is_vector else [0, 0]
            kept_values = tf.slice(
                old_bias.value(), tf.convert_to_tensor(origin), tf.convert_to_tensor(target_shape)
            )
            keep_mask = tf.fill(tf.convert_to_tensor(target_shape), True)

        # The new variable reuses the old variable's name up to the first ":".
        resized = self.add_weight(
            shape=target_shape,
            initializer="zeros",
            trainable=True,
            name=old_bias.name.split(":")[0],
        )
        # Old values where the mask is True, the fresh (zero) initialization everywhere else.
        resized.assign(tf.where(keep_mask, kept_values, resized.value()))
        resized_biases[bias_name] = resized

    return resized_biases
|
|
def _v2_get_resized_lm_head_bias(
    self, old_lm_head_bias: Dict[str, tf.Variable], new_num_tokens: int
) -> Dict[str, tf.Tensor]:
    """
    Compute resized bias values from the old ones. Growing pads the token axis at the end with `tf.pad`'s
    default fill; shrinking truncates the token axis from the end.

    Args:
        old_lm_head_bias (`Dict[str, tf.Variable]`):
            Old lm head bias to be resized.
        new_num_tokens (`int`):
            New number of tokens in the linear matrix. Increasing the size will add newly initialized vectors at
            the end. Reducing the size will remove vectors from the end.

    Return:
        `Dict[str, tf.Tensor]`: Values for the resized biases, under the same keys as the input.
    """
    resized_values = {}

    for bias_name, old_bias in old_lm_head_bias.items():
        # Rank-1 biases only carry a token axis; otherwise the shape is unpacked as (first_dim, num_tokens).
        if tf.rank(old_bias) == 1:
            leading_dim, old_num_tokens = None, shape_list(old_bias)[0]
        else:
            leading_dim, old_num_tokens = shape_list(old_bias)
        token_delta = new_num_tokens - old_num_tokens

        if old_num_tokens > new_num_tokens:
            # Shrinking: keep only the first `new_num_tokens` entries of the last axis.
            resized = old_bias.value()[..., :new_num_tokens]
        else:
            # Same size or growing: pad the token axis at the end by the (possibly zero) difference.
            if leading_dim is None:
                paddings = [[0, token_delta]]
            else:
                paddings = [[0, 0], [0, token_delta]]
            resized = tf.pad(old_bias.value(), tf.convert_to_tensor(paddings))

        resized_values[bias_name] = resized

    return resized_values
|
|
def _get_resized_lm_head_decoder(self, old_lm_head_decoder, new_num_tokens):
    """
    Build a resized decoder variable from the old one, unless there is no decoder or it is considered tied to
    the input embeddings.

    Args:
        old_lm_head_decoder (`tf.Variable`):
            Old lm head decoder to be resized.
        new_num_tokens (`int`):
            New number of tokens in the linear matrix.

    Return:
        `tf.Variable`: A newly created variable of shape `(new_num_tokens, embedding_dim)`, or
        `old_lm_head_decoder` itself when it is `None` or considered tied to the input embeddings.
    """
    input_weight = self._get_word_embedding_weight(self.get_input_embeddings())
    # NOTE(review): `reduce_any` flags the weights as tied as soon as one element compares equal — confirm
    # that an "any" rather than an "all" comparison is intended.
    is_input_output_equals = tf.reduce_any(input_weight == old_lm_head_decoder)

    if old_lm_head_decoder is None or is_input_output_equals:
        return old_lm_head_decoder

    embedding_dim = shape_list(old_lm_head_decoder)[1]
    copy_mask, copied_values = init_copy_embeddings(old_lm_head_decoder, new_num_tokens)

    # The new variable reuses the old variable's name up to the first ":".
    resized_decoder = self.add_weight(
        shape=(new_num_tokens, embedding_dim),
        initializer="zeros",
        trainable=True,
        name=old_lm_head_decoder.name.split(":")[0],
    )
    # Copied values where the mask is True, the fresh (zero) initialization everywhere else.
    resized_decoder.assign(tf.where(copy_mask, copied_values, resized_decoder.value()))

    return resized_decoder
|
|
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None) -> tf.Variable:
    """
    Create a resized embedding variable seeded from `old_embeddings`.

    The new variable has shape `[new_num_tokens, embedding_dim]` and is initialized with
    `get_initializer(config.initializer_range)` (0.02 when the config has no such attribute); positions
    selected by the mask returned from `init_copy_embeddings` are then overwritten with the copied values.

    Args:
        old_embeddings (`tf.Variable`):
            Old embeddings to be resized.
        new_num_tokens (`int`, *optional*):
            New number of tokens in the embedding matrix.

    Return:
        `tf.Variable`: The newly created, resized embedding variable.
    """
    embedding_dim = shape_list(old_embeddings)[1]
    initializer_range = getattr(self.config, "initializer_range", 0.02)
    copy_mask, copied_values = init_copy_embeddings(old_embeddings, new_num_tokens)

    # The new variable reuses the old variable's name up to the first ":".
    variable_name = old_embeddings.name.split(":")[0]
    resized = self.add_weight(
        name=variable_name,
        shape=[new_num_tokens, embedding_dim],
        initializer=get_initializer(initializer_range),
        dtype=tf.float32,
    )

    # Copied values where the mask is True, the fresh initialization everywhere else.
    resized.assign(tf.where(copy_mask, copied_values, resized.value()))

    return resized
|
|
def _v2_get_resized_embeddings(
    self, old_embeddings: tf.keras.layers.Embedding, new_num_tokens: int
) -> tf.keras.layers.Embedding:
    """
    Create a resized Embedding layer seeded from `old_embeddings`. Rows that exist in both layers are copied
    from the old one; additional rows keep the new layer's truncated-normal initialization; surplus old rows are
    dropped from the end.

    Args:
        old_embeddings (`tf.keras.layers.Embedding`):
            Old embeddings to be resized.
        new_num_tokens (`int`, *optional*):
            New number of tokens in the embedding matrix.

    Return:
        `tf.keras.layers.Embedding`: Resized Embedding layer.
    """
    # Standard deviation for the new rows: 0.02 unless the config defines one of the attributes below.
    # Every attribute is checked, so when several are present the last one in this tuple wins.
    init_range = 0.02
    for config_attr in ("initializer_range", "initializer_factor", "init_std"):
        if hasattr(self.config, config_attr):
            init_range = getattr(self.config, config_attr)

    # The slice drops the last 13 characters of the old variable name
    # (presumably the "/embeddings:0" suffix — TODO confirm).
    resized_layer = tf.keras.layers.Embedding(
        input_dim=new_num_tokens,
        output_dim=old_embeddings.output_dim,
        embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=init_range),
        name=old_embeddings.embeddings.name[:-13],
    )
    # Call the layer once on a dummy index so that `resized_layer.embeddings` exists before it is read below.
    resized_layer(tf.constant([[0]]))

    old_num_tokens = old_embeddings.input_dim
    if old_num_tokens >= new_num_tokens:
        # Same size or shrinking: keep the leading rows of the old matrix.
        seeded_rows = old_embeddings.embeddings[:new_num_tokens]
    else:
        # Growing: all old rows, followed by the freshly initialized tail of the new matrix.
        seeded_rows = tf.concat(
            [old_embeddings.embeddings, resized_layer.embeddings[old_num_tokens:]], axis=0
        )
    resized_layer.embeddings.assign(seeded_rows)

    return resized_layer
|
|
def prune_heads(self, heads_to_prune):
    """
    Prunes heads of the base model. This implementation always raises.

    Arguments:
        heads_to_prune (`Dict[int, List[int]]`):
            Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
            to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
            layer 1 and heads 2 and 3 on layer 2.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
|
|
def save_pretrained(
    self,
    save_directory,
    saved_model=False,
    version=1,
    push_to_hub=False,
    signatures=None,
    max_shard_size: Union[int, str] = "10GB",
    create_pr: bool = False,
    safe_serialization: bool = False,
    token: Optional[Union[str, bool]] = None,
    **kwargs,
):
    """
    Save a model and its configuration file to a directory, so that it can be re-loaded using the
    [`~TFPreTrainedModel.from_pretrained`] class method.

    Arguments:
        save_directory (`str`):
            Directory to which to save. Will be created if it doesn't exist.
        saved_model (`bool`, *optional*, defaults to `False`):
            If the model has to be saved in saved model format as well or not.
        version (`int`, *optional*, defaults to 1):
            The version of the saved model. A saved model needs to be versioned in order to be properly loaded by
            TensorFlow Serving as detailed in the official documentation
            https://www.tensorflow.org/tfx/serving/serving_basic
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
            repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
            namespace).
        signatures (`dict` or `tf.function`, *optional*):
            Model's signature used for serving. This will be passed to the `signatures` argument of model.save().
        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
            The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
            lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).

            <Tip warning={true}>

            If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
            which will be bigger than `max_shard_size`.

            </Tip>

        create_pr (`bool`, *optional*, defaults to `False`):
            Whether or not to create a PR with the uploaded files or directly commit.
        safe_serialization (`bool`, *optional*, defaults to `False`):
            Whether to save the model using `safetensors` or the traditional TensorFlow way (that uses `h5`).
        token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
            the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
        kwargs (`Dict[str, Any]`, *optional*):
            Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

    Raises:
        ValueError: If both `token` and the deprecated `use_auth_token` keyword are provided.
    """
    use_auth_token = kwargs.pop("use_auth_token", None)

    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = use_auth_token

    # The token also travels inside `kwargs`, which are forwarded to `_create_repo` below.
    if token is not None:
        kwargs["token"] = token

    if os.path.isfile(save_directory):
        # An existing file at the target path is logged and the save is abandoned without raising.
        logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
        return

    os.makedirs(save_directory, exist_ok=True)

    if push_to_hub:
        commit_message = kwargs.pop("commit_message", None)
        # `str(...)` keeps the default repo id working when `save_directory` is a path object rather than a str.
        repo_id = kwargs.pop("repo_id", str(save_directory).split(os.path.sep)[-1])
        repo_id = self._create_repo(repo_id, **kwargs)
        # Captured before anything is written so that the upload step can be given the pre-save state.
        files_timestamps = self._get_files_timestamps(save_directory)

    if saved_model:
        # A non-string `torch_dtype` in the config is replaced by the part after the first "." of its string form.
        if getattr(self.config, "torch_dtype", None) is not None and not isinstance(self.config.torch_dtype, str):
            self.config.torch_dtype = str(self.config.torch_dtype).split(".")[1]
        if signatures is None:
            serving_default = self.serving.get_concrete_function(self.input_signature)
            if any(spec.dtype == tf.int32 for spec in self.input_signature.values()):
                # Additionally export a signature in which every int32 input is declared as int64.
                int64_spec = {
                    key: tf.TensorSpec(
                        shape=spec.shape, dtype=tf.int64 if spec.dtype == tf.int32 else spec.dtype, name=spec.name
                    )
                    for key, spec in self.input_signature.items()
                }
                int64_serving = self.serving.get_concrete_function(int64_spec)
                signatures = {"serving_default": serving_default, "int64_serving": int64_serving}
            else:
                signatures = serving_default
        saved_model_dir = os.path.join(save_directory, "saved_model", str(version))
        self.save(saved_model_dir, include_optimizer=False, signatures=signatures)
        logger.info(f"Saved model created in {saved_model_dir}")

    # Record the architecture in the config: the class name minus its first two characters
    # (presumably the "TF" prefix — TODO confirm).
    self.config.architectures = [self.__class__.__name__[2:]]

    # Models registered with an auto class also get their defining code saved next to the weights.
    if self._auto_class is not None:
        custom_object_save(self, save_directory, config=self.config)

    self.config.save_pretrained(save_directory)
    if self.can_generate():
        self.generation_config.save_pretrained(save_directory)

    # Saving under the predefined names is what allows reloading from this directory.
    weights_name = SAFE_WEIGHTS_NAME if safe_serialization else TF2_WEIGHTS_NAME
    output_model_file = os.path.join(save_directory, weights_name)

    shards, index = tf_shard_checkpoint(self.weights, max_shard_size)

    # Clean the folder from a previous save: remove regular files that start with the weights file stem and
    # are not among the shards about to be written.
    # NOTE(review): only ".bin" and ".safetensors" are stripped from the stem — confirm this prefix also
    # matches stale shard files of the TF2 weights format.
    weights_no_suffix = weights_name.replace(".bin", "").replace(".safetensors", "")
    for filename in os.listdir(save_directory):
        full_filename = os.path.join(save_directory, filename)
        if (
            filename.startswith(weights_no_suffix)
            and os.path.isfile(full_filename)
            and filename not in shards
        ):
            os.remove(full_filename)

    if index is None:
        # No index: everything is written to a single weights file.
        if safe_serialization:
            state_dict = {format_weight_name(w.name): w.value() for w in self.weights}
            safe_save_file(state_dict, output_model_file, metadata={"format": "tf"})
        else:
            self.save_weights(output_model_file)
        logger.info(f"Model weights saved in {output_model_file}")
    else:
        save_index_file = os.path.join(save_directory, TF2_WEIGHTS_INDEX_NAME)
        # Save the index as well
        with open(save_index_file, "w", encoding="utf-8") as index_file:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            index_file.write(content)
        logger.info(
            f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
            f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the "
            f"index located at {save_index_file}."
        )
        for shard_file, shard in shards.items():
            # The handle gets its own name so that it does not shadow the `shard_file` file name.
            with h5py.File(os.path.join(save_directory, shard_file), mode="w") as shard_h5:
                layers = []
                for layer in sorted(shard, key=lambda x: x.name):
                    # Names containing "model." or without any "/" are kept whole; every other name loses its
                    # first "/"-separated component.
                    if "model." in layer.name or len(layer.name.split("/")) == 1:
                        layer_name = layer.name
                    else:
                        layer_name = "/".join(layer.name.split("/")[1:])
                    # Convert to numpy once and reuse the array for shape, dtype and data.
                    layer_values = layer.numpy()
                    param_dset = shard_h5.create_dataset(
                        layer_name, layer_values.shape, dtype=layer_values.dtype
                    )
                    param_dset[:] = layer_values
                    layers.append(layer_name.encode("utf8"))
                save_attributes_to_hdf5_group(shard_h5, "layer_names", layers)

    if push_to_hub:
        # NOTE(review): `create_pr` is accepted by this method but is not forwarded here — confirm whether
        # `_upload_modified_files` should receive it.
        self._upload_modified_files(
            save_directory,
            repo_id,
            files_timestamps,
            commit_message=commit_message,
            token=token,
        )
|
|
| @classmethod |
| def from_pretrained( |
| cls, |
| pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], |
| *model_args, |
| config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None, |
| cache_dir: Optional[Union[str, os.PathLike]] = None, |
| ignore_mismatched_sizes: bool = False, |
| force_download: bool = False, |
| local_files_only: bool = False, |
| token: Optional[Union[str, bool]] = None, |
| revision: str = "main", |
| **kwargs, |
| ): |
| r""" |
| Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. |
| |
| The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come |
| pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning |
| task. |
| |
| The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those |
| weights are discarded. |
| |
| Parameters: |
| pretrained_model_name_or_path (`str`, *optional*): |
| Can be either: |
| |
| - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. |
| Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a |
| user or organization name, like `dbmdz/bert-base-german-cased`. |
| - A path to a *directory* containing model weights saved using |
| [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. |
| - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this |
| case, `from_pt` should be set to `True` and a configuration object should be provided as `config` |
| argument. This loading path is slower than converting the PyTorch model in a TensorFlow model |
| using the provided conversion scripts and loading the TensorFlow model afterwards. |
| - `None` if you are both providing the configuration and state dictionary (resp. with keyword |
| arguments `config` and `state_dict`). |
| model_args (sequence of positional arguments, *optional*): |
| All remaining positional arguments will be passed to the underlying model's `__init__` method. |
| config (`Union[PretrainedConfig, str]`, *optional*): |
| Can be either: |
| |
| - an instance of a class derived from [`PretrainedConfig`], |
| - a string valid as input to [`~PretrainedConfig.from_pretrained`]. |
| |
| Configuration for the model to use instead of an automatically loaded configuration. Configuration can |
| be automatically loaded when: |
| |
| - The model is a model provided by the library (loaded with the *model id* string of a pretrained |
| model). |
| - The model was saved using [`~TFPreTrainedModel.save_pretrained`] and is reloaded by supplying the |
| save directory. |
| - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a |
| configuration JSON file named *config.json* is found in the directory. |
| from_pt (`bool`, *optional*, defaults to `False`): |
| Load the model weights from a PyTorch state_dict save file (see docstring of |
| `pretrained_model_name_or_path` argument). |
| ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): |
| Whether or not to raise an error if some of the weights from the checkpoint do not have the same size |
| as the weights of the model (if for instance, you are instantiating a model with 10 labels from a |
| checkpoint with 3 labels). |
| cache_dir (`str`, *optional*): |
| Path to a directory in which a downloaded pretrained model configuration should be cached if the |
| standard cache should not be used. |
| force_download (`bool`, *optional*, defaults to `False`): |
| Whether or not to force the (re-)download of the model weights and configuration files, overriding the |
| cached versions if they exist. |
| resume_download (`bool`, *optional*, defaults to `False`): |
| Whether or not to delete incompletely received files. Will attempt to resume the download if such a |
| file exists. |
| proxies: |
| (`Dict[str, str], `optional`): A dictionary of proxy servers to use by protocol or endpoint, e.g., |
| `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. |
| output_loading_info(`bool`, *optional*, defaults to `False`): Whether ot not to also return a |
| dictionary containing missing keys, unexpected keys and error messages. |
| local_files_only(`bool`, *optional*, defaults to `False`): |
| Whether or not to only look at local files (e.g., not try downloading the model). |
| token (`str` or `bool`, *optional*): |
| The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use |
| the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). |
| revision (`str`, *optional*, defaults to `"main"`): |
| The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a |
| git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any |
| identifier allowed by git. |
| |
| |
| <Tip> |
| |
| To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>". |
| |
| </Tip> |
| |
| mirror (`str`, *optional*): |
| Mirror source to accelerate downloads in China. If you are from China and have an accessibility |
| problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. |
| Please refer to the mirror site for more information. |
| subfolder (`str`, *optional*, defaults to `""`): |
| In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can |
| specify the folder name here. |
| tf_to_pt_weight_rename (`Callable`, *optional*): |
| A function that is called to transform the names of weights during the PyTorch to TensorFlow |
| crossloading process. This is not necessary for most models, but is useful to allow composite models to |
| be crossloaded correctly. |
| kwargs (remaining dictionary of keyword arguments, *optional*): |
| Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., |
| `output_attentions=True`). Behaves differently depending on whether a `config` is provided or |
| automatically loaded: |
| |
| - If a configuration is provided with `config`, `**kwargs` will be directly passed to the |
| underlying model's `__init__` method (we assume all relevant updates to the configuration have |
| already been done) |
| - If a configuration is not provided, `kwargs` will be first passed to the configuration class |
| initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that |
| corresponds to a configuration attribute will be used to override said attribute with the |
| supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute |
| will be passed to the underlying model's `__init__` function. |
| |
| Examples: |
| |
| ```python |
| >>> from transformers import BertConfig, TFBertModel |
| |
| >>> # Download model and configuration from huggingface.co and cache. |
| >>> model = TFBertModel.from_pretrained("bert-base-uncased") |
| >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable). |
| >>> model = TFBertModel.from_pretrained("./test/saved_model/") |
| >>> # Update configuration during loading. |
| >>> model = TFBertModel.from_pretrained("bert-base-uncased", output_attentions=True) |
| >>> assert model.config.output_attentions == True |
| >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable). |
| >>> config = BertConfig.from_json_file("./pt_model/my_pt_model_config.json") |
| >>> model = TFBertModel.from_pretrained("./pt_model/my_pytorch_model.bin", from_pt=True, config=config) |
| ```""" |
| from_pt = kwargs.pop("from_pt", False) |
| resume_download = kwargs.pop("resume_download", False) |
| proxies = kwargs.pop("proxies", None) |
| output_loading_info = kwargs.pop("output_loading_info", False) |
| use_auth_token = kwargs.pop("use_auth_token", None) |
| trust_remote_code = kwargs.pop("trust_remote_code", None) |
| _ = kwargs.pop("mirror", None) |
| load_weight_prefix = kwargs.pop("load_weight_prefix", None) |
| from_pipeline = kwargs.pop("_from_pipeline", None) |
| from_auto_class = kwargs.pop("_from_auto", False) |
| subfolder = kwargs.pop("subfolder", "") |
| commit_hash = kwargs.pop("_commit_hash", None) |
| tf_to_pt_weight_rename = kwargs.pop("tf_to_pt_weight_rename", None) |
|
|
| |
| _ = kwargs.pop("adapter_kwargs", None) |
|
|
| if use_auth_token is not None: |
| warnings.warn( |
| "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning |
| ) |
| if token is not None: |
| raise ValueError( |
| "`token` and `use_auth_token` are both specified. Please set only the argument `token`." |
| ) |
| token = use_auth_token |
|
|
| if trust_remote_code is True: |
| logger.warning( |
| "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is" |
| " ignored." |
| ) |
|
|
| user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class} |
| if from_pipeline is not None: |
| user_agent["using_pipeline"] = from_pipeline |
|
|
| if is_offline_mode() and not local_files_only: |
| logger.info("Offline mode: forcing local_files_only=True") |
| local_files_only = True |
|
|
| |
| if not isinstance(config, PretrainedConfig): |
| config_path = config if config is not None else pretrained_model_name_or_path |
| config, model_kwargs = cls.config_class.from_pretrained( |
| config_path, |
| cache_dir=cache_dir, |
| return_unused_kwargs=True, |
| force_download=force_download, |
| resume_download=resume_download, |
| proxies=proxies, |
| local_files_only=local_files_only, |
| token=token, |
| revision=revision, |
| _from_auto=from_auto_class, |
| _from_pipeline=from_pipeline, |
| _commit_hash=commit_hash, |
| **kwargs, |
| ) |
| else: |
| model_kwargs = kwargs |
|
|
| if commit_hash is None: |
| commit_hash = getattr(config, "_commit_hash", None) |
|
|
| |
| |
| is_sharded = False |
| |
| if pretrained_model_name_or_path is not None: |
| pretrained_model_name_or_path = str(pretrained_model_name_or_path) |
| is_local = os.path.isdir(pretrained_model_name_or_path) |
| if is_local: |
| if from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): |
| |
| archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) |
| elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME)): |
| |
| archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME) |
| is_sharded = True |
| elif is_safetensors_available() and os.path.isfile( |
| os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_NAME) |
| ): |
| |
| archive_file = os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_NAME) |
| elif is_safetensors_available() and os.path.isfile( |
| os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME) |
| ): |
| |
| archive_file = os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME) |
| is_sharded = True |
| raise NotImplementedError("Support for sharded checkpoints using safetensors is coming soon!") |
| elif os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): |
| |
| archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) |
| elif os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_INDEX_NAME)): |
| |
| archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_INDEX_NAME) |
| is_sharded = True |
| |
| elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)) or os.path.isfile( |
| os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME) |
| ): |
| raise EnvironmentError( |
| f"Error no file named {TF2_WEIGHTS_NAME} found in directory {pretrained_model_name_or_path} " |
| "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from those " |
| "weights." |
| ) |
| else: |
| raise EnvironmentError( |
| f"Error no file named {TF2_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory " |
| f"{pretrained_model_name_or_path}." |
| ) |
| elif os.path.isfile(pretrained_model_name_or_path): |
| archive_file = pretrained_model_name_or_path |
| is_local = True |
| elif os.path.isfile(pretrained_model_name_or_path + ".index"): |
| archive_file = pretrained_model_name_or_path + ".index" |
| is_local = True |
| elif is_remote_url(pretrained_model_name_or_path): |
| filename = pretrained_model_name_or_path |
| resolved_archive_file = download_url(pretrained_model_name_or_path) |
| else: |
| |
| if from_pt: |
| filename = WEIGHTS_NAME |
| elif is_safetensors_available(): |
| filename = SAFE_WEIGHTS_NAME |
| else: |
| filename = TF2_WEIGHTS_NAME |
|
|
| try: |
| |
| cached_file_kwargs = { |
| "cache_dir": cache_dir, |
| "force_download": force_download, |
| "proxies": proxies, |
| "resume_download": resume_download, |
| "local_files_only": local_files_only, |
| "token": token, |
| "user_agent": user_agent, |
| "revision": revision, |
| "subfolder": subfolder, |
| "_raise_exceptions_for_missing_entries": False, |
| "_commit_hash": commit_hash, |
| } |
| resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) |
|
|
| |
| |
| if resolved_archive_file is None and filename == SAFE_WEIGHTS_NAME: |
| |
| resolved_archive_file = cached_file( |
| pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME, **cached_file_kwargs |
| ) |
| if resolved_archive_file is not None: |
| is_sharded = True |
| raise NotImplementedError( |
| "Support for sharded checkpoints using safetensors is coming soon!" |
| ) |
| else: |
| |
| filename = TF2_WEIGHTS_NAME |
| resolved_archive_file = cached_file( |
| pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **cached_file_kwargs |
| ) |
| if resolved_archive_file is None and filename == TF2_WEIGHTS_NAME: |
| |
| resolved_archive_file = cached_file( |
| pretrained_model_name_or_path, TF2_WEIGHTS_INDEX_NAME, **cached_file_kwargs |
| ) |
| if resolved_archive_file is not None: |
| is_sharded = True |
| if resolved_archive_file is None and filename == WEIGHTS_NAME: |
| |
| resolved_archive_file = cached_file( |
| pretrained_model_name_or_path, WEIGHTS_INDEX_NAME, **cached_file_kwargs |
| ) |
| if resolved_archive_file is not None: |
| is_sharded = True |
| if resolved_archive_file is None: |
| |
| |
| has_file_kwargs = { |
| "revision": revision, |
| "proxies": proxies, |
| "token": token, |
| } |
| if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): |
| raise EnvironmentError( |
| f"{pretrained_model_name_or_path} does not appear to have a file named" |
| f" {TF2_WEIGHTS_NAME} but there is a file for PyTorch weights. Use `from_pt=True` to" |
| " load this model from those weights." |
| ) |
| else: |
| raise EnvironmentError( |
| f"{pretrained_model_name_or_path} does not appear to have a file named {WEIGHTS_NAME}," |
| f" {TF2_WEIGHTS_NAME} or {TF_WEIGHTS_NAME}" |
| ) |
|
|
| except EnvironmentError: |
| |
| |
| raise |
| except Exception: |
| |
|
|
| raise EnvironmentError( |
| f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" |
| " from 'https://huggingface.co/models', make sure you don't have a local directory with the" |
| f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" |
| f" directory containing a file named {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME} or {TF_WEIGHTS_NAME}" |
| ) |
| if is_local: |
| logger.info(f"loading weights file {archive_file}") |
| resolved_archive_file = archive_file |
| filename = resolved_archive_file.split(os.path.sep)[-1] |
| else: |
| logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}") |
| else: |
| resolved_archive_file = None |
|
|
| |
| if is_sharded: |
| |
| resolved_archive_file, _ = get_checkpoint_shard_files( |
| pretrained_model_name_or_path, |
| resolved_archive_file, |
| cache_dir=cache_dir, |
| force_download=force_download, |
| proxies=proxies, |
| resume_download=resume_download, |
| local_files_only=local_files_only, |
| token=token, |
| user_agent=user_agent, |
| revision=revision, |
| _commit_hash=commit_hash, |
| ) |
|
|
| safetensors_from_pt = False |
| if filename == SAFE_WEIGHTS_NAME: |
| with safe_open(resolved_archive_file, framework="tf") as f: |
| safetensors_metadata = f.metadata() |
| if safetensors_metadata is None or safetensors_metadata.get("format") not in ["pt", "tf", "flax"]: |
| raise OSError( |
| f"The safetensors archive passed at {resolved_archive_file} does not contain the valid metadata." |
| " Make sure you save your model with the `save_pretrained` method." |
| ) |
| safetensors_from_pt = safetensors_metadata.get("format") == "pt" |
|
|
| config.name_or_path = pretrained_model_name_or_path |
|
|
| |
| |
| if cls._requires_load_weight_prefix and model_kwargs.get("name") is not None: |
| model_kwargs["load_weight_prefix"] = load_weight_prefix + "/" + model_kwargs.get("name") |
|
|
| |
| model = cls(config, *model_args, **model_kwargs) |
|
|
| if from_pt: |
| from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model |
|
|
| |
| return load_pytorch_checkpoint_in_tf2_model( |
| model, |
| resolved_archive_file, |
| allow_missing_keys=True, |
| output_loading_info=output_loading_info, |
| _prefix=load_weight_prefix, |
| tf_to_pt_weight_rename=tf_to_pt_weight_rename, |
| ) |
|
|
| |
| if load_weight_prefix is not None: |
| with tf.compat.v1.variable_scope(load_weight_prefix): |
| model.build() |
| else: |
| model.build() |
|
|
| if safetensors_from_pt: |
| from .modeling_tf_pytorch_utils import load_pytorch_state_dict_in_tf2_model |
|
|
| with safe_open(resolved_archive_file, framework="tf") as safetensors_archive: |
| |
| |
| |
| return load_pytorch_state_dict_in_tf2_model( |
| model, |
| safetensors_archive, |
| tf_inputs=False, |
| allow_missing_keys=True, |
| output_loading_info=output_loading_info, |
| _prefix=load_weight_prefix, |
| ignore_mismatched_sizes=ignore_mismatched_sizes, |
| ) |
|
|
| |
| |
| try: |
| if is_sharded: |
| for file in resolved_archive_file: |
| os.path.isfile(file), f"Error retrieving files {file}" |
|
|
| missing_keys, unexpected_keys, mismatched_keys = load_tf_sharded_weights( |
| model, |
| resolved_archive_file, |
| ignore_mismatched_sizes=ignore_mismatched_sizes, |
| _prefix=load_weight_prefix, |
| ) |
| else: |
| missing_keys, unexpected_keys, mismatched_keys = load_tf_weights( |
| model, |
| resolved_archive_file, |
| ignore_mismatched_sizes=ignore_mismatched_sizes, |
| _prefix=load_weight_prefix, |
| ) |
| except OSError as e: |
| try: |
| with open(resolved_archive_file) as f: |
| if f.read().startswith("version"): |
| raise OSError( |
| "You seem to have cloned a repository without having git-lfs installed. Please install " |
| "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " |
| "you cloned." |
| ) |
| else: |
| raise ValueError from e |
| except (UnicodeDecodeError, ValueError): |
| raise OSError( |
| "Unable to load weights from h5 file. " |
| "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " |
| ) |
|
|
| if cls._keys_to_ignore_on_load_missing is not None: |
| for pat in cls._keys_to_ignore_on_load_missing: |
| missing_keys = [k for k in missing_keys if re.search(pat, k) is None] |
|
|
| if cls._keys_to_ignore_on_load_unexpected is not None: |
| for pat in cls._keys_to_ignore_on_load_unexpected: |
| unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] |
|
|
| if len(unexpected_keys) > 0: |
| logger.warning( |
| f"Some layers from the model checkpoint at {pretrained_model_name_or_path} were not used when" |
| f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" |
| f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" |
| " with another architecture (e.g. initializing a BertForSequenceClassification model from a" |
| " BertForPreTraining model).\n- This IS NOT expected if you are initializing" |
| f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" |
| " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." |
| ) |
| else: |
| logger.warning(f"All model checkpoint layers were used when initializing {model.__class__.__name__}.\n") |
|
|
| if len(missing_keys) > 0: |
| logger.warning( |
| f"Some layers of {model.__class__.__name__} were not initialized from the model checkpoint at" |
| f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" |
| " TRAIN this model on a down-stream task to be able to use it for predictions and inference." |
| ) |
| elif len(mismatched_keys) == 0: |
| logger.warning( |
| f"All the layers of {model.__class__.__name__} were initialized from the model checkpoint at" |
| f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" |
| f" was trained on, you can already use {model.__class__.__name__} for predictions without further" |
| " training." |
| ) |
| if len(mismatched_keys) > 0: |
| mismatched_warning = "\n".join( |
| [ |
| f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" |
| for key, shape1, shape2 in mismatched_keys |
| ] |
| ) |
| logger.warning( |
| f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" |
| f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" |
| f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" |
| " to use it for predictions and inference." |
| ) |
|
|
| |
| if model.can_generate(): |
| try: |
| model.generation_config = GenerationConfig.from_pretrained( |
| pretrained_model_name_or_path, |
| cache_dir=cache_dir, |
| force_download=force_download, |
| resume_download=resume_download, |
| proxies=proxies, |
| local_files_only=local_files_only, |
| token=token, |
| revision=revision, |
| subfolder=subfolder, |
| _from_auto=from_auto_class, |
| _from_pipeline=from_pipeline, |
| **kwargs, |
| ) |
| except OSError: |
| logger.info( |
| "Generation config file not found, using a generation config created from the model config." |
| ) |
| pass |
|
|
| if output_loading_info: |
| loading_info = { |
| "missing_keys": missing_keys, |
| "unexpected_keys": unexpected_keys, |
| "mismatched_keys": mismatched_keys, |
| } |
|
|
| return model, loading_info |
|
|
| return model |
|
|
def push_to_hub(
    self,
    repo_id: str,
    use_temp_dir: Optional[bool] = None,
    commit_message: Optional[str] = None,
    private: Optional[bool] = None,
    max_shard_size: Optional[Union[int, str]] = "10GB",
    token: Optional[Union[bool, str]] = None,
    # Deprecated alias of `token`, kept as an explicit parameter for backward compatibility.
    use_auth_token: Optional[Union[bool, str]] = None,
    create_pr: bool = False,
    **base_model_card_args,
) -> str:
    """
    Save the model into a working directory and upload the files that changed to the 🤗 Model Hub.

    Parameters:
        repo_id (`str`):
            The name of the repository you want to push your model to. It should contain your organization name
            when pushing to a given organization.
        use_temp_dir (`bool`, *optional*):
            Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.
            Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.
        commit_message (`str`, *optional*):
            Message to commit while pushing. Will default to `"Upload model"`.
        private (`bool`, *optional*):
            Whether or not the repository created should be private.
        token (`bool` or `str`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`
            is not specified.
        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
            Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard
            will then be each of size lower than this size. If expressed as a string, needs to be digits followed
            by a unit (like `"5MB"`).
        use_auth_token (`bool` or `str`, *optional*):
            Deprecated, use `token` instead. Passing both `token` and `use_auth_token` raises a `ValueError`.
        create_pr (`bool`, *optional*, defaults to `False`):
            Whether or not to create a PR with the uploaded files or directly commit.
        base_model_card_args:
            Extra keyword arguments. The deprecated keys `repo_path_or_name`, `repo_url` and `organization` are
            consumed here; whatever remains is forwarded to `create_model_card` (when the model exposes both
            `history` and `create_model_card`) and overrides the default `output_dir` / `model_name`.

    Returns:
        The value returned by `self._upload_modified_files` (that helper is defined outside this method; the
        declared return type is `str` -- TODO confirm what it actually returns).

    Raises:
        ValueError: If both `token` and `use_auth_token` are specified.

    Examples:

    ```python
    from transformers import TFAutoModel

    model = TFAutoModel.from_pretrained("bert-base-cased")

    # Push the model to your namespace with the name "my-finetuned-bert".
    model.push_to_hub("my-finetuned-bert")

    # Push the model to an organization with the name "my-finetuned-bert".
    model.push_to_hub("huggingface/my-finetuned-bert")
    ```
    """
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = use_auth_token

    if "repo_path_or_name" in base_model_card_args:
        warnings.warn(
            "The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use "
            "`repo_id` instead."
        )
        repo_id = base_model_card_args.pop("repo_path_or_name")
    # Legacy keys are removed from the kwargs so they are never forwarded to `create_model_card`.
    repo_url = base_model_card_args.pop("repo_url", None)
    organization = base_model_card_args.pop("organization", None)

    if os.path.isdir(repo_id):
        # `repo_id` is a local folder: work in it and use its last path component as the repo name.
        working_dir = repo_id
        repo_id = repo_id.split(os.path.sep)[-1]
    else:
        working_dir = repo_id.split("/")[-1]

    repo_id = self._create_repo(
        repo_id, private=private, token=token, repo_url=repo_url, organization=organization
    )

    if use_temp_dir is None:
        use_temp_dir = not os.path.isdir(working_dir)

    with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:
        # Snapshot taken before saving; it is handed to `_upload_modified_files` together with the directory.
        files_timestamps = self._get_files_timestamps(work_dir)

        self.save_pretrained(work_dir, max_shard_size=max_shard_size)
        if hasattr(self, "history") and hasattr(self, "create_model_card"):
            # Start from the defaults and let the caller's keyword arguments override them. The defaults live
            # in their own dict: rebinding `base_model_card_args` here would throw the caller's values away.
            model_card_args = {
                "output_dir": work_dir,
                "model_name": Path(repo_id).name,
            }
            model_card_args.update(base_model_card_args)
            self.create_model_card(**model_card_args)

        return self._upload_modified_files(
            work_dir,
            repo_id,
            files_timestamps,
            commit_message=commit_message,
            token=token,
            create_pr=create_pr,
        )
|
|
@classmethod
def register_for_auto_class(cls, auto_class="TFAutoModel"):
    """
    Attach this class to one of the library's auto classes. Only custom models need this; the models shipped
    with the library are already mapped to their auto class.

    <Tip warning={true}>

    This API is experimental and may have some slight breaking changes in the next releases.

    </Tip>

    Args:
        auto_class (`str` or `type`, *optional*, defaults to `"TFAutoModel"`):
            The auto class to register this new model with. A class is reduced to its `__name__`.

    Raises:
        ValueError: If `transformers.models.auto` has no attribute with the resolved name.
    """
    # Normalise to the class name so the lookup and the stored value are always strings.
    auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

    import transformers.models.auto as auto_module

    if hasattr(auto_module, auto_class_name):
        cls._auto_class = auto_class_name
        return

    raise ValueError(f"{auto_class_name} is not a valid auto class.")
|
|
|
|
class TFConv1D(tf.keras.layers.Layer):
    """
    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).

    It behaves like a dense projection whose kernel is stored as `[nx, nf]` (input features first).

    Args:
        nf (`int`):
            The number of output features.
        nx (`int`):
            The number of input features.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation to use to initialize the weights.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
    """

    def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
        super().__init__(**kwargs)
        self.nf, self.nx = nf, nx
        self.initializer_range = initializer_range

    def build(self, input_shape):
        """Create the `[nx, nf]` kernel and the `[1, nf]` zero-initialized bias."""
        kernel_initializer = get_initializer(self.initializer_range)
        self.weight = self.add_weight("weight", shape=[self.nx, self.nf], initializer=kernel_initializer)
        self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())

    def call(self, x):
        """Project the last axis of `x` from `nx` to `nf` features, keeping the first two dimensions."""
        # Only the first two dimensions are restored afterwards (presumably batch and sequence length).
        batch_size, seq_len = shape_list(x)[:2]

        # Collapse the leading dimensions so the projection is a single 2D matmul.
        flattened = tf.reshape(x, [-1, self.nx])
        projected = tf.matmul(flattened, self.weight) + self.bias

        return tf.reshape(projected, [batch_size, seq_len, self.nf])
|
|
|
|
class TFSharedEmbeddings(tf.keras.layers.Layer):
    r"""
    Token embedding layer whose single weight matrix can also be used as an output projection.

    The weights of the embedding layer is usually shared with the weights of the linear decoder when doing language
    modeling.

    Args:
        vocab_size (`int`):
            The size of the vocabulary, e.g., the number of unique tokens.
        hidden_size (`int`):
            The size of the embedding vectors.
        initializer_range (`float`, *optional*):
            The standard deviation to use when initializing the weights. If no value is provided, it will default to
            \\(1/\sqrt{hidden\_size}\\).
        kwargs (`Dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
    """

    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        if initializer_range is None:
            # Default standard deviation: 1 / sqrt(hidden_size).
            self.initializer_range = hidden_size**-0.5
        else:
            self.initializer_range = initializer_range
        warnings.warn(
            "`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `tf.keras.layers.Embedding` instead.",
            DeprecationWarning,
        )

    def build(self, input_shape):
        """
        Create the `[vocab_size, hidden_size]` weight matrix. Shared weights logic adapted from
        https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        weight_initializer = get_initializer(self.initializer_range)
        self.weight = self.add_weight(
            "weight", shape=[self.vocab_size, self.hidden_size], initializer=weight_initializer
        )
        super().build(input_shape)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        own_config = {
            "vocab_size": self.vocab_size,
            "hidden_size": self.hidden_size,
            "initializer_range": self.initializer_range,
        }
        # Base entries come first; this layer's own entries win on key collisions.
        return {**super().get_config(), **own_config}

    def call(self, inputs: tf.Tensor, mode: str = "embedding") -> tf.Tensor:
        """
        Get token embeddings of inputs or decode final hidden state.

        Args:
            inputs (`tf.Tensor`):
                In embedding mode, should be an int64 tensor with shape `[batch_size, length]`.

                In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`.
            mode (`str`, defaults to `"embedding"`):
                A valid value is either `"embedding"` or `"linear"`, the first one indicates that the layer should be
                used as an embedding layer, the second one that the layer should be used as a linear decoder.

        Returns:
            `tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape `[batch_size, length,
            embedding_size]`.

            In linear mode, the output is a float32 with shape `[batch_size, length, vocab_size]`.

        Raises:
            ValueError: if `mode` is not valid.

        Shared weights logic is adapted from
        [here](https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24).
        """
        if mode == "embedding":
            return self._embedding(inputs)
        if mode == "linear":
            return self._linear(inputs)
        raise ValueError(f"mode {mode} is not valid.")

    def _embedding(self, input_ids):
        """Look up the rows of the weight matrix selected by `input_ids`."""
        return tf.gather(self.weight, input_ids)

    def _linear(self, inputs):
        """
        Computes logits by multiplying the inputs with the transposed embedding matrix.

        Args:
            inputs: A float32 tensor with shape [..., hidden_size]

        Returns:
            float32 tensor with shape [..., vocab_size].
        """
        leading_dims = shape_list(inputs)[:-1]
        flat_inputs = tf.reshape(inputs, [-1, self.hidden_size])
        flat_logits = tf.matmul(flat_inputs, self.weight, transpose_b=True)

        # Restore the original leading dimensions around the new vocab axis.
        return tf.reshape(flat_logits, leading_dims + [self.vocab_size])
|
|
|
|
class TFSequenceSummary(tf.keras.layers.Layer):
    """
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Defaults to `"last"` when the
              config does not define it. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Name of the activation applied after the projection; any
              non-`None` value is resolved through `get_tf_activation`. `None` adds no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.

        initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
    """

    def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs):
        super().__init__(**kwargs)

        # Read `summary_type` from the config when it is defined, otherwise fall back to "last". The guard has to
        # test `summary_type` itself: keying it on `summary_use_proj` ignored a configured `summary_type` whenever
        # the projection flag was absent, and raised `AttributeError` in the opposite situation.
        self.summary_type = getattr(config, "summary_type", "last")
        if self.summary_type == "attn":
            # Attention-based summaries are not implemented for this layer.
            raise NotImplementedError

        self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj
        if self.has_summary:
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = tf.keras.layers.Dense(
                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
            )

        self.has_activation = False
        activation_string = getattr(config, "summary_activation", None)
        if activation_string is not None:
            self.has_activation = True
            self.activation = get_tf_activation(activation_string)

        self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0
        if self.has_first_dropout:
            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)

        self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
        if self.has_last_dropout:
            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)

    def call(self, inputs, cls_index=None, training=False):
        """
        Summarize the hidden states, then apply the configured dropout, projection and activation.

        Args:
            inputs: Either the hidden states themselves, a tuple/list `(hidden_states[, cls_index])`, or a dict
                with a `"hidden_states"` entry and an optional `"cls_index"` entry.
            cls_index: Positions to gather when `summary_type == "cls_index"`. When `None`, the last position of
                the second-to-last axis is used. Overridden by the value carried in `inputs` when `inputs` is a
                tuple, list or dict.
            training: Forwarded to the dropout layers.

        Returns:
            The summary tensor.

        Raises:
            NotImplementedError: If `summary_type` is `"attn"`.
            ValueError: If `summary_type` is not one of the supported values.
        """
        if not isinstance(inputs, (dict, tuple, list)):
            hidden_states = inputs
        elif isinstance(inputs, (tuple, list)):
            hidden_states = inputs[0]
            cls_index = inputs[1] if len(inputs) > 1 else None
            assert len(inputs) <= 2, "Too many inputs."
        else:
            hidden_states = inputs.get("hidden_states")
            cls_index = inputs.get("cls_index", None)

        if self.summary_type == "last":
            output = hidden_states[:, -1]
        elif self.summary_type == "first":
            output = hidden_states[:, 0]
        elif self.summary_type == "mean":
            output = tf.reduce_mean(hidden_states, axis=1)
        elif self.summary_type == "cls_index":
            hidden_shape = shape_list(hidden_states)
            if cls_index is None:
                # Default to the last position along the second-to-last axis for every leading index.
                cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1)
            cls_shape = shape_list(cls_index)
            if len(cls_shape) <= len(hidden_shape) - 2:
                # Give the indices a trailing axis so the batched gather below keeps one entry per leading index.
                cls_index = tf.expand_dims(cls_index, axis=-1)
            output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2)
            # Drop the gathered axis again.
            output = tf.squeeze(output, axis=len(hidden_shape) - 2)
        elif self.summary_type == "attn":
            raise NotImplementedError
        else:
            # Without this branch an unknown type surfaced as an `UnboundLocalError` on `output`.
            raise ValueError(
                f"Unsupported summary_type {self.summary_type!r}; expected one of 'last', 'first', 'mean',"
                " 'cls_index' or 'attn'."
            )

        if self.has_first_dropout:
            output = self.first_dropout(output, training=training)

        if self.has_summary:
            output = self.summary(output)

        if self.has_activation:
            output = self.activation(output)

        if self.has_last_dropout:
            output = self.last_dropout(output, training=training)

        return output
|
|
|
|
def get_initializer(initializer_range: float = 0.02) -> tf.keras.initializers.TruncatedNormal:
    """
    Build a truncated-normal Keras initializer whose standard deviation is `initializer_range`.

    Args:
        initializer_range (*float*, defaults to 0.02): Standard deviation of the initializer range.

    Returns:
        `tf.keras.initializers.TruncatedNormal`: The truncated normal initializer.
    """
    initializer_cls = tf.keras.initializers.TruncatedNormal
    return initializer_cls(stddev=initializer_range)
|
|