# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """Library of components of export_tfhub.py. See docstring there for more.""" | |
| import contextlib | |
| import hashlib | |
| import os | |
| import tempfile | |
| from typing import Optional, Text, Tuple | |
| # Import libraries | |
| from absl import logging | |
| import tensorflow as tf, tf_keras | |
| # pylint: disable=g-direct-tensorflow-import TODO(b/175369555): Remove these. | |
| from tensorflow.core.protobuf import saved_model_pb2 | |
| from tensorflow.python.ops import control_flow_assert | |
| # pylint: enable=g-direct-tensorflow-import | |
| from official.legacy.bert import configs | |
| from official.modeling import tf_utils | |
| from official.nlp.configs import encoders | |
| from official.nlp.modeling import layers | |
| from official.nlp.modeling import models | |
| from official.nlp.modeling import networks | |


def get_bert_encoder(bert_config):
  """Returns a BertEncoder with dict outputs."""
  bert_encoder = networks.BertEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf_keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      embedding_width=bert_config.embedding_size,
      dict_outputs=True)
  return bert_encoder
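
# Example usage (an illustrative sketch; the config file path below is
# hypothetical):
#
#   bert_config = configs.BertConfig.from_json_file("/path/to/bert_config.json")
#   encoder = get_bert_encoder(bert_config)
#   # `encoder` maps the usual BERT inputs to a dict with "sequence_output",
#   # "pooled_output", and "encoder_outputs" keys.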


def get_do_lower_case(do_lower_case, vocab_file=None, sp_model_file=None):
  """Returns do_lower_case, replacing None by a guess from vocab file name."""
  if do_lower_case is not None:
    return do_lower_case
  elif vocab_file:
    do_lower_case = "uncased" in vocab_file
    logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
                 do_lower_case, vocab_file)
    return do_lower_case
  elif sp_model_file:
    do_lower_case = True  # All public ALBERTs (as of Oct 2020) do it.
    logging.info("Defaulting to do_lower_case=%s for Sentencepiece tokenizer",
                 do_lower_case)
    return do_lower_case
  else:
    raise ValueError("Must set vocab_file or sp_model_file.")
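
# Illustration of the guessing behavior (file names are hypothetical):
#
#   get_do_lower_case(None, vocab_file="uncased_L-12/vocab.txt")  # -> True
#   get_do_lower_case(None, vocab_file="cased_L-12/vocab.txt")    # -> False
#   get_do_lower_case(False, sp_model_file="albert.model")        # -> False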


def _create_model(
    *,
    bert_config: Optional[configs.BertConfig] = None,
    encoder_config: Optional[encoders.EncoderConfig] = None,
    with_mlm: bool,
) -> Tuple[tf_keras.Model, tf_keras.Model]:
  """Creates the model to export and the model to restore the checkpoint.

  Args:
    bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
      Exactly one of encoder_config and bert_config must be set.
    encoder_config: An `EncoderConfig` to create an encoder of the configured
      type (`BertEncoder` or other).
    with_mlm: A bool to control the second component of the result. If True,
      will create a `BertPretrainerV2` object; otherwise, will create a
      `BertEncoder` object.

  Returns:
    A Tuple of (1) a Keras model that will be exported, (2) a
    `BertPretrainerV2` object or `BertEncoder` object depending on the value
    of the `with_mlm` argument, which contains the first model and will be
    used for restoring weights from the checkpoint.
  """
  if (bert_config is not None) == (encoder_config is not None):
    raise ValueError("Exactly one of `bert_config` and `encoder_config` "
                     "can be specified, but got %s and %s" %
                     (bert_config, encoder_config))

  if bert_config is not None:
    encoder = get_bert_encoder(bert_config)
  else:
    encoder = encoders.build_encoder(encoder_config)

  # Convert from list of named inputs to dict of inputs keyed by name.
  # Only the latter accepts a dict of inputs after restoring from SavedModel.
  if isinstance(encoder.inputs, (list, tuple)):
    encoder_inputs_dict = {x.name: x for x in encoder.inputs}
  else:
    # encoder.inputs by default is dict for BertEncoderV2.
    encoder_inputs_dict = encoder.inputs
  encoder_output_dict = encoder(encoder_inputs_dict)

  # For interchangeability with other text representations,
| # add "default" as an alias for BERT's whole-input reptesentations. | |
| encoder_output_dict["default"] = encoder_output_dict["pooled_output"] | |
| core_model = tf_keras.Model( | |
| inputs=encoder_inputs_dict, outputs=encoder_output_dict) | |
| if with_mlm: | |
| if bert_config is not None: | |
| hidden_act = bert_config.hidden_act | |
| else: | |
| assert encoder_config is not None | |
| hidden_act = encoder_config.get().hidden_activation | |
| pretrainer = models.BertPretrainerV2( | |
| encoder_network=encoder, | |
| mlm_activation=tf_utils.get_activation(hidden_act)) | |
| if isinstance(pretrainer.inputs, dict): | |
| pretrainer_inputs_dict = pretrainer.inputs | |
| else: | |
| pretrainer_inputs_dict = {x.name: x for x in pretrainer.inputs} | |
| pretrainer_output_dict = pretrainer(pretrainer_inputs_dict) | |
| mlm_model = tf_keras.Model( | |
| inputs=pretrainer_inputs_dict, outputs=pretrainer_output_dict) | |
| # Set `_auto_track_sub_layers` to False, so that the additional weights | |
| # from `mlm` sub-object will not be included in the core model. | |
| # TODO(b/169210253): Use a public API when available. | |
| core_model._auto_track_sub_layers = False # pylint: disable=protected-access | |
| core_model.mlm = mlm_model | |
| return core_model, pretrainer | |
| else: | |
| return core_model, encoder | |
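
# Illustrative sketch of what this private helper produces (shown for
# exposition only; the config is hypothetical):
#
#   core_model, pretrainer = _create_model(
#       encoder_config=encoders.EncoderConfig(type="bert"), with_mlm=True)
#   # `core_model` is the exportable Keras model, with `core_model.mlm` as the
#   # masked-language-model head; `pretrainer` is kept only for restoring
#   # checkpoint weights.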


def export_model(export_path: Text,
                 *,
                 bert_config: Optional[configs.BertConfig] = None,
                 encoder_config: Optional[encoders.EncoderConfig] = None,
                 model_checkpoint_path: Text,
                 with_mlm: bool,
                 copy_pooler_dense_to_encoder: bool = False,
                 vocab_file: Optional[Text] = None,
                 sp_model_file: Optional[Text] = None,
                 do_lower_case: Optional[bool] = None) -> None:
  """Exports an Encoder as SavedModel after restoring pre-trained weights.

  The exported SavedModel implements a superset of the Encoder API for
  Text embeddings with Transformer Encoders described at
  https://www.tensorflow.org/hub/common_saved_model_apis/text.

  In particular, the exported SavedModel can be used in the following way:

  ```
  # Calls default interface (encoder only).
  encoder = hub.load(...)
  encoder_inputs = dict(
      input_word_ids=...,  # Shape [batch, seq_length], dtype=int32
      input_mask=...,      # Shape [batch, seq_length], dtype=int32
      input_type_ids=...,  # Shape [batch, seq_length], dtype=int32
  )
  encoder_outputs = encoder(encoder_inputs)
  assert encoder_outputs.keys() == {
      "pooled_output",    # Shape [batch_size, width], dtype=float32
      "default",          # Alias for "pooled_output" (aligns with other models).
      "sequence_output",  # Shape [batch_size, seq_length, width], dtype=float32
      "encoder_outputs",  # List of Tensors with outputs of all transformer layers.
  }
  ```

  If `with_mlm` is True, the exported SavedModel can also be called in the
  following way:

  ```
  # Calls expanded interface that includes logits of the Masked Language Model.
  mlm_inputs = dict(
      input_word_ids=...,       # Shape [batch, seq_length], dtype=int32
      input_mask=...,           # Shape [batch, seq_length], dtype=int32
      input_type_ids=...,       # Shape [batch, seq_length], dtype=int32
      masked_lm_positions=...,  # Shape [batch, num_predictions], dtype=int32
  )
  mlm_outputs = encoder.mlm(mlm_inputs)
  assert mlm_outputs.keys() == {
      "pooled_output",    # Shape [batch, width], dtype=float32
      "sequence_output",  # Shape [batch, seq_length, width], dtype=float32
      "encoder_outputs",  # List of Tensors with outputs of all transformer layers.
      "mlm_logits",       # Shape [batch, num_predictions, vocab_size], dtype=float32
  }
  ```

  Args:
    export_path: The SavedModel output directory.
    bert_config: An optional `configs.BertConfig` object. Note: exactly one of
      `bert_config` and the following `encoder_config` must be specified.
    encoder_config: An optional `encoders.EncoderConfig` object.
    model_checkpoint_path: The path to the checkpoint.
    with_mlm: Whether to export the additional mlm sub-object.
    copy_pooler_dense_to_encoder: Whether to copy the pooler's dense layer used
      in the next sentence prediction task to the encoder.
    vocab_file: The path to the wordpiece vocab file, or None.
    sp_model_file: The path to the sentencepiece model file, or None. Exactly
      one of vocab_file and sp_model_file must be set.
    do_lower_case: Whether to lower-case text before tokenization.
  """
  if with_mlm:
    core_model, pretrainer = _create_model(
        bert_config=bert_config,
        encoder_config=encoder_config,
        with_mlm=with_mlm)
    encoder = pretrainer.encoder_network
    # It supports both the new pretrainer checkpoint produced by TF-NLP and
    # the checkpoint converted from TF1 (original BERT, SmallBERTs).
    checkpoint_items = pretrainer.checkpoint_items
    checkpoint = tf.train.Checkpoint(**checkpoint_items)
  else:
    core_model, encoder = _create_model(
        bert_config=bert_config,
        encoder_config=encoder_config,
        with_mlm=with_mlm)
    checkpoint = tf.train.Checkpoint(
        model=encoder,  # Legacy checkpoints.
        encoder=encoder)
  checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()

  if copy_pooler_dense_to_encoder:
    logging.info("Copy pooler's dense layer to the encoder.")
    pooler_checkpoint = tf.train.Checkpoint(
        **{"next_sentence.pooler_dense": encoder.pooler_layer})
    pooler_checkpoint.restore(
        model_checkpoint_path).assert_existing_objects_matched()

  # Before SavedModels for preprocessing appeared in Oct 2020, the encoders
  # provided this information to let users do preprocessing themselves.
  # We keep doing that for now. It helps users to upgrade incrementally.
  # Moreover, it offers an escape hatch for advanced users who want the
  # full vocab, not the high-level operations from the preprocessing model.
  if vocab_file:
    core_model.vocab_file = tf.saved_model.Asset(vocab_file)
    if do_lower_case is None:
      raise ValueError("Must pass do_lower_case if passing vocab_file.")
    core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
  elif sp_model_file:
    # This was used by ALBERT, with implied values of do_lower_case=True
    # and strip_diacritics=True.
    core_model.sp_model_file = tf.saved_model.Asset(sp_model_file)
  else:
    raise ValueError("Must set vocab_file or sp_model_file")

  core_model.save(export_path, include_optimizer=False, save_format="tf")
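
# Example usage (a minimal sketch; all paths are hypothetical):
#
#   export_model(
#       "/tmp/exported_encoder",
#       bert_config=configs.BertConfig.from_json_file("bert_config.json"),
#       model_checkpoint_path="/path/to/bert_model.ckpt",
#       with_mlm=True,
#       vocab_file="/path/to/vocab.txt",
#       do_lower_case=True)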


class BertPackInputsSavedModelWrapper(tf.train.Checkpoint):
  """Wraps a BertPackInputs layer for export to SavedModel.

  The wrapper object is suitable for use with `tf.saved_model.save()` and
  `.load()`. The wrapper object is callable with inputs and outputs like the
  BertPackInputs layer, but differs from saving an unwrapped Keras object:

  - The inputs can be a list of 1 or 2 RaggedTensors of dtype int32 and
    ragged rank 1 or 2. (In Keras, saving to a tf.function in a SavedModel
    would fix the number of RaggedTensors and their ragged rank.)
  - The call accepts an optional keyword argument `seq_length=` to override
    the layer's .seq_length hyperparameter. (In Keras, a hyperparameter
    could not be changed after saving to a tf.function in a SavedModel.)
  """
  def __init__(self, bert_pack_inputs: layers.BertPackInputs):
    super().__init__()

    # Preserve the layer's configured seq_length as a default but make it
    # overridable. Having this dynamically determined default argument
    # requires self.__call__ to be defined in this indirect way.
    default_seq_length = bert_pack_inputs.seq_length

    # The tf.function decorator is required so that concrete functions can be
    # traced below via get_concrete_function().
    @tf.function(autograph=False)
    def call(inputs, seq_length=default_seq_length):
      return layers.BertPackInputs.bert_pack_inputs(
          inputs,
          seq_length=seq_length,
          start_of_sequence_id=bert_pack_inputs.start_of_sequence_id,
          end_of_segment_id=bert_pack_inputs.end_of_segment_id,
          padding_id=bert_pack_inputs.padding_id)

    self.__call__ = call

    for ragged_rank in range(1, 3):
      for num_segments in range(1, 3):
        _ = self.__call__.get_concrete_function(
            [tf.RaggedTensorSpec([None] * (ragged_rank + 1), dtype=tf.int32)
             for _ in range(num_segments)],
            seq_length=tf.TensorSpec([], tf.int32))
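
# Illustrative sketch of the wrapper's calling convention (assumes `tokenize`
# is a tokenizer layer as in create_preprocessing() below, and `token_ids` is
# an int32 RaggedTensor produced by it):
#
#   pack = layers.BertPackInputs(
#       seq_length=128, special_tokens_dict=tokenize.get_special_tokens_dict())
#   wrapper = BertPackInputsSavedModelWrapper(pack)
#   packed = wrapper([token_ids], seq_length=64)  # Overrides the default 128.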


def create_preprocessing(*,
                         vocab_file: Optional[str] = None,
                         sp_model_file: Optional[str] = None,
                         do_lower_case: bool,
                         tokenize_with_offsets: bool,
                         default_seq_length: int) -> tf_keras.Model:
  """Returns a preprocessing Model for given tokenization parameters.

  This function builds a Keras Model with attached subobjects suitable for
  saving to a SavedModel. The resulting SavedModel implements the Preprocessor
  API for Text embeddings with Transformer Encoders described at
  https://www.tensorflow.org/hub/common_saved_model_apis/text.

  Args:
    vocab_file: The path to the wordpiece vocab file, or None.
    sp_model_file: The path to the sentencepiece model file, or None. Exactly
      one of vocab_file and sp_model_file must be set. This determines the
      type of tokenizer that is used.
    do_lower_case: Whether to do lower case.
    tokenize_with_offsets: Whether to include the .tokenize_with_offsets
      subobject.
    default_seq_length: The sequence length of preprocessing results from the
      root callable. This is also the default sequence length for the
      bert_pack_inputs subobject.

  Returns:
    A tf_keras.Model object with several attached subobjects, suitable for
    saving as a preprocessing SavedModel.
  """
  # Select tokenizer.
  if bool(vocab_file) == bool(sp_model_file):
    raise ValueError("Must set exactly one of vocab_file, sp_model_file")
  if vocab_file:
    tokenize = layers.BertTokenizer(
        vocab_file=vocab_file,
        lower_case=do_lower_case,
        tokenize_with_offsets=tokenize_with_offsets)
  else:
    tokenize = layers.SentencepieceTokenizer(
        model_file_path=sp_model_file,
        lower_case=do_lower_case,
        strip_diacritics=True,  # Strip diacritics to follow ALBERT model.
        tokenize_with_offsets=tokenize_with_offsets)

  # The root object of the preprocessing model can be called to do
  # one-shot preprocessing for users with single-sentence inputs.
  sentences = tf_keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
  if tokenize_with_offsets:
    tokens, start_offsets, limit_offsets = tokenize(sentences)
  else:
    tokens = tokenize(sentences)
  pack = layers.BertPackInputs(
      seq_length=default_seq_length,
      special_tokens_dict=tokenize.get_special_tokens_dict())
  model_inputs = pack(tokens)
  preprocessing = tf_keras.Model(sentences, model_inputs)

  # Individual steps of preprocessing are made available as named subobjects
  # to enable more general preprocessing. For saving, they need to be Models
  # in their own right.
  preprocessing.tokenize = tf_keras.Model(sentences, tokens)
  # Provide an equivalent to tokenize.get_special_tokens_dict().
  preprocessing.tokenize.get_special_tokens_dict = tf.train.Checkpoint()
  preprocessing.tokenize.get_special_tokens_dict.__call__ = tf.function(
      lambda: tokenize.get_special_tokens_dict(),  # pylint: disable=[unnecessary-lambda]
      input_signature=[])
  if tokenize_with_offsets:
    preprocessing.tokenize_with_offsets = tf_keras.Model(
        sentences, [tokens, start_offsets, limit_offsets])
    preprocessing.tokenize_with_offsets.get_special_tokens_dict = (
        preprocessing.tokenize.get_special_tokens_dict)
  # Conceptually, this should be
  #   preprocessing.bert_pack_inputs = tf_keras.Model(tokens, model_inputs)
  # but technicalities require us to use a wrapper (see comments there).
  # In particular, seq_length can be overridden when calling this.
  preprocessing.bert_pack_inputs = BertPackInputsSavedModelWrapper(pack)

  return preprocessing
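
# Example usage (a minimal sketch; the vocab path is hypothetical):
#
#   preprocessing = create_preprocessing(
#       vocab_file="/path/to/vocab.txt",
#       do_lower_case=True,
#       tokenize_with_offsets=False,
#       default_seq_length=128)
#   encoder_inputs = preprocessing(tf.constant(["hello world"]))
#   # encoder_inputs is a dict with keys "input_word_ids", "input_mask",
#   # and "input_type_ids", each of shape [batch, 128].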


def _move_to_tmpdir(file_path: Optional[Text], tmpdir: Text) -> Optional[Text]:
  """Returns new path with same basename and hash of original path."""
  if file_path is None:
    return None
  olddir, filename = os.path.split(file_path)
  hasher = hashlib.sha1()
  hasher.update(olddir.encode("utf-8"))
  target_dir = os.path.join(tmpdir, hasher.hexdigest())
  target_file = os.path.join(target_dir, filename)
  tf.io.gfile.mkdir(target_dir)
  tf.io.gfile.copy(file_path, target_file)
  return target_file


def export_preprocessing(export_path: Text,
                         *,
                         vocab_file: Optional[Text] = None,
                         sp_model_file: Optional[Text] = None,
                         do_lower_case: bool,
                         tokenize_with_offsets: bool,
                         default_seq_length: int,
                         experimental_disable_assert: bool = False) -> None:
  """Exports preprocessing to a SavedModel for TF Hub."""
  with tempfile.TemporaryDirectory() as tmpdir:
    # TODO(b/175369555): Remove experimental_disable_assert and its use.
    with _maybe_disable_assert(experimental_disable_assert):
      preprocessing = create_preprocessing(
          vocab_file=_move_to_tmpdir(vocab_file, tmpdir),
          sp_model_file=_move_to_tmpdir(sp_model_file, tmpdir),
          do_lower_case=do_lower_case,
          tokenize_with_offsets=tokenize_with_offsets,
          default_seq_length=default_seq_length)
      preprocessing.save(export_path, include_optimizer=False,
                         save_format="tf")
    if experimental_disable_assert:
      _check_no_assert(export_path)
  # It helps the unit test to prevent stray copies of the vocab file.
  if tf.io.gfile.exists(tmpdir):
    raise IOError("Failed to clean up TemporaryDirectory")
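
# Example usage (a minimal sketch; all paths are hypothetical):
#
#   export_preprocessing(
#       "/tmp/exported_preprocessing",
#       vocab_file="/path/to/vocab.txt",
#       do_lower_case=True,
#       tokenize_with_offsets=True,
#       default_seq_length=128)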


# TODO(b/175369555): Remove all workarounds for this bug of TensorFlow 2.4
# when this bug is no longer a concern for publishing new models.
# TensorFlow 2.4 has a placement issue with Assert ops in tf.functions called
# from Dataset.map() on a TPU worker. They end up on the TPU coordinator,
# and invoking them from the TPU worker is either inefficient (when possible)
# or impossible (notably when using "headless" TPU workers on Cloud that do
# not have a channel to the coordinator). The bug has been fixed in time for
# TF 2.5. To work around this, the following code avoids Assert ops in the
# exported SavedModels. It monkey-patches calls to tf.Assert from inside
# TensorFlow and replaces them by a no-op while building the exported model.
# This is fragile, so _check_no_assert() validates the result. The resulting
# model should be fine to read on future versions of TF, even if this
# workaround at export time may break eventually. (Failing unit tests will
# tell.)


def _dont_assert(condition, data, summarize=None, name="Assert"):
  """The no-op version of tf.Assert installed by _maybe_disable_assert."""
  del condition, data, summarize  # Unused.
  if tf.executing_eagerly():
    return
  with tf.name_scope(name):
    return tf.no_op(name="dont_assert")


# The @contextlib.contextmanager decorator is required for the `with`
# statement in export_preprocessing() to work.
@contextlib.contextmanager
def _maybe_disable_assert(disable_assert):
  """Scoped monkey patch of control_flow_assert.Assert to a no-op."""
  if not disable_assert:
    yield
    return

  original_assert = control_flow_assert.Assert
  control_flow_assert.Assert = _dont_assert
  yield
  control_flow_assert.Assert = original_assert


def _check_no_assert(saved_model_path):
  """Raises AssertionError if SavedModel contains Assert ops."""
  saved_model_filename = os.path.join(saved_model_path, "saved_model.pb")
  with tf.io.gfile.GFile(saved_model_filename, "rb") as f:
    saved_model = saved_model_pb2.SavedModel.FromString(f.read())

  assert_nodes = []
  graph_def = saved_model.meta_graphs[0].graph_def
  assert_nodes += [
      "node '{}' in global graph".format(n.name)
      for n in graph_def.node
      if n.op == "Assert"
  ]
  for fdef in graph_def.library.function:
    assert_nodes += [
        "node '{}' in function '{}'".format(n.name, fdef.signature.name)
        for n in fdef.node_def
        if n.op == "Assert"
    ]
  if assert_nodes:
    raise AssertionError(
        "Internal tool error: "
        "failed to suppress {} Assert ops in SavedModel:\n{}".format(
            len(assert_nodes), "\n".join(assert_nodes[:10])))