Spaces:
Sleeping
Sleeping
| # Copyright 2024 The TensorFlow Authors. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """Tests export_tfhub_lib.""" | |
| import os | |
| import tempfile | |
| from absl.testing import parameterized | |
| import numpy as np | |
| import tensorflow as tf, tf_keras | |
| from tensorflow import estimator as tf_estimator | |
| import tensorflow_hub as hub | |
| import tensorflow_text as text | |
| from sentencepiece import SentencePieceTrainer | |
| from official.legacy.bert import configs | |
| from official.modeling import tf_utils | |
| from official.nlp.configs import encoders | |
| from official.nlp.modeling import layers | |
| from official.nlp.modeling import models | |
| from official.nlp.tools import export_tfhub_lib | |
def _get_bert_config_or_encoder_config(use_bert_config,
                                       hidden_size,
                                       num_hidden_layers,
                                       encoder_type="albert",
                                       vocab_size=100):
  """Generates config args for export_tfhub_lib._create_model().

  Args:
    use_bert_config: bool. If True, returns legacy BertConfig.
    hidden_size: int.
    num_hidden_layers: int.
    encoder_type: str. Can be ['albert', 'bert', 'bert_v2']. If use_bert_config
      == True, then model_type is not used.
    vocab_size: int.

  Returns:
    bert_config, encoder_config. Only one is not None. If
    `use_bert_config` == True, the first config is valid. Otherwise
    `bert_config` == None.
  """
  if use_bert_config:
    # Legacy path: build a BertConfig and leave the encoder config empty.
    legacy_config = configs.BertConfig(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        intermediate_size=32,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=num_hidden_layers)
    return legacy_config, None

  # Modern path: no BertConfig; pick the encoder config by encoder_type.
  if encoder_type == "albert":
    new_config = encoders.EncoderConfig(
        type="albert",
        albert=encoders.AlbertEncoderConfig(
            vocab_size=vocab_size,
            embedding_width=16,
            hidden_size=hidden_size,
            intermediate_size=32,
            max_position_embeddings=128,
            num_attention_heads=2,
            num_layers=num_hidden_layers,
            dropout_rate=0.1))
  else:
    # encoder_type can be 'bert' or 'bert_v2'; both share BertEncoderConfig.
    sub_config = encoders.BertEncoderConfig(
        vocab_size=vocab_size,
        embedding_size=16,
        hidden_size=hidden_size,
        intermediate_size=32,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_layers=num_hidden_layers,
        dropout_rate=0.1)
    new_config = encoders.EncoderConfig(
        **{"type": encoder_type, encoder_type: sub_config})
  return None, new_config
def _get_vocab_or_sp_model_dummy(temp_dir, use_sp_model):
  """Returns tokenizer asset args for export_tfhub_lib.export_model().

  Writes a placeholder asset file and returns it in exactly one of the two
  slots (vocab_file, sp_model_file), the other being None.
  """
  dummy_file = os.path.join(temp_dir, "dummy_file.txt")
  with tf.io.gfile.GFile(dummy_file, "w") as f:
    f.write("dummy content")
  # Exactly one of the two return slots carries the dummy asset.
  if use_sp_model:
    return None, dummy_file  # (vocab_file, sp_model_file)
  return dummy_file, None
def _read_asset(asset: tf.saved_model.Asset):
  """Returns the full text content of a SavedModel asset file."""
  path = asset.asset_path.numpy()
  return tf.io.gfile.GFile(path).read()
def _find_lambda_layers(layer):
  """Returns list of all Lambda layers in a Keras model."""
  if isinstance(layer, tf_keras.layers.Lambda):
    return [layer]
  if hasattr(layer, "layers"):  # It's nested, like a Model.
    # Recurse into each sublayer and flatten the results.
    return [
        found
        for sublayer in layer.layers
        for found in _find_lambda_layers(sublayer)
    ]
  return []
class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from an Encoder checkpoint to a SavedModel without
  the .mlm subobject. This is no longer preferred, but still useful
  for models like Electra that are trained without the MLM task.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
    - BERT built from a legacy BertConfig, for use with BertTokenizer.
    - ALBERT built from an EncoderConfig (as a representative of all other
      choices beyond BERT, for use with SentencepieceTokenizer (the one
      alternative to BertTokenizer).
  """

  # NOTE(review): the method takes (use_bert, encoder_type) but had no
  # parameterized decorator, so the test runner could not invoke it. Restored
  # a decorator covering the cases described in the class docstring — confirm
  # the exact case list against upstream.
  @parameterized.parameters(
      (True, "bert"),
      (False, "bert_v2"),
      (False, "albert"),
  )
  def test_export_model(self, use_bert, encoder_type):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 1
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        encoder_type=encoder_type)
    bert_model, encoder = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(encoder=encoder)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=False,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight.numpy(), hub_weight.numpy())

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_output = hub_layer(input_dict)
    source_output = bert_model(input_dict)
    encoder_output = encoder(input_dict)
    self.assertEqual(hub_output["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_output["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    self.assertLen(hub_output["encoder_outputs"], num_hidden_layers)

    for key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_output[key], hub_output[key])
      self.assertAllClose(source_output[key], encoder_output[key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_output["pooled_output"], hub_output["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    output_dict = hub_layer(input_dict)
    pooled_output = output_dict["pooled_output"]
    sequence_output = output_dict["sequence_output"]
    encoder_outputs = output_dict["encoder_outputs"]

    self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
    self.assertEqual(sequence_output.shape.as_list(),
                     [None, seq_length, hidden_size])
    self.assertLen(encoder_outputs, num_hidden_layers)
class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from a Pretrainer checkpoint to a SavedModel including
  the .mlm subobject, which is the preferred way since 2020.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
    - BERT built from a legacy BertConfig, for use with BertTokenizer.
    - ALBERT built from an EncoderConfig (as a representative of all other
      choices beyond BERT, for use with SentencepieceTokenizer (the one
      alternative to BertTokenizer).
  """

  def test_copy_pooler_dense_to_encoder(self):
    encoder_config = encoders.EncoderConfig(
        type="bert",
        bert=encoders.BertEncoderConfig(
            hidden_size=24, intermediate_size=48, num_layers=2))
    cls_heads = [
        layers.ClassificationHead(
            inner_dim=24, num_classes=2, name="next_sentence")
    ]
    encoder = encoders.build_encoder(encoder_config)
    pretrainer = models.BertPretrainerV2(
        encoder_network=encoder,
        classification_heads=cls_heads,
        mlm_activation=tf_utils.get_activation(
            encoder_config.get().hidden_activation))
    # Makes sure the pretrainer variables are created.
    _ = pretrainer(pretrainer.inputs)
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=True)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        encoder_config=encoder_config,
        model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
        with_mlm=True,
        copy_pooler_dense_to_encoder=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    # Restores a hub KerasLayer.
    hub_layer = hub.KerasLayer(export_path, trainable=True)
    dummy_ids = np.zeros((2, 10), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_pooled_output = hub_layer(input_dict)["pooled_output"]
    encoder_outputs = encoder(input_dict)
    # Verify that hub_layer's pooled_output is the same as the output of next
    # sentence prediction's dense layer.
    pretrained_pooled_output = cls_heads[0].dense(
        (encoder_outputs["sequence_output"][:, 0, :]))
    self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
    # But the pooled_output between encoder and hub_layer are not the same.
    encoder_pooled_output = encoder_outputs["pooled_output"]
    self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)

  # NOTE(review): the method takes a use_bert argument but had no parameterized
  # decorator, so the test runner could not invoke it. Restored a decorator
  # covering both cases described in the class docstring.
  @parameterized.parameters(True, False)
  def test_export_model_with_mlm(self, use_bert):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert, hidden_size, num_hidden_layers)
    bert_model, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    bert_model_with_mlm = bert_model.mlm
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")

    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)

    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    # Note that we set `_auto_track_sub_layers` to False when exporting the
    # SavedModel, so hub_layer has the same number of weights as bert_model;
    # otherwise, hub_layer will have extra weights from its `mlm` subobject.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_outputs_dict = hub_layer(input_dict)
    source_outputs_dict = bert_model(input_dict)
    encoder_outputs_dict = pretrainer.encoder_network(
        [dummy_ids, dummy_ids, dummy_ids])
    self.assertEqual(hub_outputs_dict["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_outputs_dict["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    for output_key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_outputs_dict[output_key],
                          hub_outputs_dict[output_key])
      self.assertAllClose(source_outputs_dict[output_key],
                          encoder_outputs_dict[output_key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_outputs_dict["pooled_output"],
                        hub_outputs_dict["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Checks sub-object `mlm`.
    self.assertTrue(hasattr(hub_layer.resolved_object, "mlm"))

    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(bert_model_with_mlm.trainable_weights))
    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(pretrainer.trainable_weights))
    for source_weight, hub_weight, pretrainer_weight in zip(
        bert_model_with_mlm.trainable_weights,
        hub_layer.resolved_object.mlm.trainable_variables,
        pretrainer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)
      self.assertAllClose(source_weight, pretrainer_weight)

    max_predictions_per_seq = 4
    mlm_positions = np.zeros((2, max_predictions_per_seq), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids,
        masked_lm_positions=mlm_positions)
    hub_mlm_outputs_dict = hub_layer.resolved_object.mlm(input_dict)
    source_mlm_outputs_dict = bert_model_with_mlm(input_dict)
    for output_key in ("pooled_output", "sequence_output", "mlm_logits",
                       "encoder_outputs"):
      self.assertAllClose(hub_mlm_outputs_dict[output_key],
                          source_mlm_outputs_dict[output_key])

    pretrainer_mlm_logits_output = pretrainer(input_dict)["mlm_logits"]
    self.assertAllClose(hub_mlm_outputs_dict["mlm_logits"],
                        pretrainer_mlm_logits_output)

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev_mlm(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      mlm_position_ids = np.array([[1, 2, 3, 4]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids),
          masked_lm_positions=mlm_position_ids)
      outputs = np.concatenate([
          hub_layer.resolved_object.mlm(input_dict,
                                        training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev_mlm(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev_mlm(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    hub_outputs_dict = hub_layer(input_dict)
    self.assertEqual(hub_outputs_dict["pooled_output"].shape.as_list(),
                     [None, hidden_size])
    self.assertEqual(hub_outputs_dict["sequence_output"].shape.as_list(),
                     [None, seq_length, hidden_size])
# Marker embedded in temp-dir names for tokenizer assets; test_no_leaks asserts
# this string does not end up inside the exported saved_model.pb.
_STRING_NOT_TO_LEAK = "private_path_component_"
| class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): | |
| def _make_vocab_file(self, vocab, filename="vocab.txt", add_mask_token=False): | |
| """Creates wordpiece vocab file with given words plus special tokens. | |
| The tokens of the resulting model are, in this order: | |
| [PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab... | |
| *=if requested by args. | |
| This function also accepts wordpieces that start with the ## continuation | |
| marker, but avoiding those makes this function interchangeable with | |
| _make_sp_model_file(), up to the extra dimension returned by BertTokenizer. | |
| Args: | |
| vocab: a list of strings with the words or wordpieces to put into the | |
| model's vocabulary. Do not include special tokens here. | |
| filename: Optionally, a filename (relative to the temporary directory | |
| created by this function). | |
| add_mask_token: an optional bool, whether to include a [MASK] token. | |
| Returns: | |
| The absolute filename of the created vocab file. | |
| """ | |
| full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]" | |
| ] + ["[MASK]"] * add_mask_token + vocab | |
| path = os.path.join( | |
| tempfile.mkdtemp( | |
| dir=self.get_temp_dir(), # New subdir each time. | |
| prefix=_STRING_NOT_TO_LEAK), | |
| filename) | |
| with tf.io.gfile.GFile(path, "w") as f: | |
| f.write("\n".join(full_vocab + [""])) | |
| return path | |
| def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False): | |
| """Creates Sentencepiece word model with given words plus special tokens. | |
| The tokens of the resulting model are, in this order: | |
| <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s> | |
| *=if requested by args. | |
| The words in the input vocab are plain text, without the whitespace marker. | |
| That makes this function interchangeable with _make_vocab_file(). | |
| Args: | |
| vocab: a list of strings with the words to put into the model's | |
| vocabulary. Do not include special tokens here. | |
| prefix: an optional string, to change the filename prefix for the model | |
| (relative to the temporary directory created by this function). | |
| add_mask_token: an optional bool, whether to include a [MASK] token. | |
| Returns: | |
| The absolute filename of the created Sentencepiece model file. | |
| """ | |
| model_prefix = os.path.join( | |
| tempfile.mkdtemp(dir=self.get_temp_dir()), # New subdir each time. | |
| prefix) | |
| input_file = model_prefix + "_train_input.txt" | |
| # Create input text for training the sp model from the tokens provided. | |
| # Repeat tokens, the earlier the more, because they are sorted by frequency. | |
| input_text = [] | |
| for i, token in enumerate(vocab): | |
| input_text.append(" ".join([token] * (len(vocab) - i))) | |
| with tf.io.gfile.GFile(input_file, "w") as f: | |
| f.write("\n".join(input_text + [""])) | |
| control_symbols = "[CLS],[SEP]" | |
| full_vocab_size = len(vocab) + 6 # <pad>, <unk>, [CLS], [SEP], <s>, </s>. | |
| if add_mask_token: | |
| control_symbols += ",[MASK]" | |
| full_vocab_size += 1 | |
| flags = dict( | |
| model_prefix=model_prefix, | |
| model_type="word", | |
| input=input_file, | |
| pad_id=0, | |
| unk_id=1, | |
| control_symbols=control_symbols, | |
| vocab_size=full_vocab_size, | |
| bos_id=full_vocab_size - 2, | |
| eos_id=full_vocab_size - 1) | |
| SentencePieceTrainer.Train(" ".join( | |
| ["--{}={}".format(k, v) for k, v in flags.items()])) | |
| return model_prefix + ".model" | |
| def _do_export(self, | |
| vocab, | |
| do_lower_case, | |
| default_seq_length=128, | |
| tokenize_with_offsets=True, | |
| use_sp_model=False, | |
| experimental_disable_assert=False, | |
| add_mask_token=False): | |
| """Runs SavedModel export and returns the export_path.""" | |
| export_path = tempfile.mkdtemp(dir=self.get_temp_dir()) | |
| vocab_file = sp_model_file = None | |
| if use_sp_model: | |
| sp_model_file = self._make_sp_model_file( | |
| vocab, add_mask_token=add_mask_token) | |
| else: | |
| vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token) | |
| export_tfhub_lib.export_preprocessing( | |
| export_path, | |
| vocab_file=vocab_file, | |
| sp_model_file=sp_model_file, | |
| do_lower_case=do_lower_case, | |
| tokenize_with_offsets=tokenize_with_offsets, | |
| default_seq_length=default_seq_length, | |
| experimental_disable_assert=experimental_disable_assert) | |
| # Invalidate the original filename to verify loading from the SavedModel. | |
| tf.io.gfile.remove(sp_model_file or vocab_file) | |
| return export_path | |
| def test_no_leaks(self): | |
| """Tests not leaking the path to the original vocab file.""" | |
| path = self._do_export(["d", "ef", "abc", "xy"], | |
| do_lower_case=True, | |
| use_sp_model=False) | |
| with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f: | |
| self.assertFalse( # pylint: disable=g-generic-assert | |
| _STRING_NOT_TO_LEAK.encode("ascii") in f.read()) | |
| def test_exported_callables(self, use_sp_model): | |
| preprocess = tf.saved_model.load( | |
| self._do_export( | |
| ["d", "ef", "abc", "xy"], | |
| do_lower_case=True, | |
| # TODO(b/181866850): drop this. | |
| tokenize_with_offsets=not use_sp_model, | |
| # TODO(b/175369555): drop this. | |
| experimental_disable_assert=True, | |
| use_sp_model=use_sp_model)) | |
| def fold_dim(rt): | |
| """Removes the word/subword distinction of BertTokenizer.""" | |
| return rt if use_sp_model else rt.merge_dims(1, 2) | |
| # .tokenize() | |
| inputs = tf.constant(["abc d ef", "ABC D EF d"]) | |
| token_ids = preprocess.tokenize(inputs) | |
| self.assertAllEqual( | |
| fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]])) | |
| special_tokens_dict = { | |
| k: v.numpy().item() # Expecting eager Tensor, converting to Python. | |
| for k, v in preprocess.tokenize.get_special_tokens_dict().items() | |
| } | |
| self.assertDictEqual( | |
| special_tokens_dict, | |
| dict( | |
| padding_id=0, | |
| start_of_sequence_id=2, | |
| end_of_segment_id=3, | |
| vocab_size=4 + 6 if use_sp_model else 4 + 4)) | |
| # .tokenize_with_offsets() | |
| if use_sp_model: | |
| # TODO(b/181866850): Enable tokenize_with_offsets when it works and test. | |
| self.assertFalse(hasattr(preprocess, "tokenize_with_offsets")) | |
| else: | |
| token_ids, start_offsets, limit_offsets = ( | |
| preprocess.tokenize_with_offsets(inputs)) | |
| self.assertAllEqual( | |
| fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]])) | |
| self.assertAllEqual( | |
| fold_dim(start_offsets), tf.ragged.constant([[0, 4, 6], [0, 4, 6, | |
| 9]])) | |
| self.assertAllEqual( | |
| fold_dim(limit_offsets), tf.ragged.constant([[3, 5, 8], [3, 5, 8, | |
| 10]])) | |
| self.assertIs(preprocess.tokenize.get_special_tokens_dict, | |
| preprocess.tokenize_with_offsets.get_special_tokens_dict) | |
| # Root callable. | |
| bert_inputs = preprocess(inputs) | |
| self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128]) | |
| self.assertAllEqual( | |
| bert_inputs["input_word_ids"][:, :10], | |
| tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0], | |
| [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]])) | |
| self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128]) | |
| self.assertAllEqual( | |
| bert_inputs["input_mask"][:, :10], | |
| tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], | |
| [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])) | |
| self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128]) | |
| self.assertAllEqual( | |
| bert_inputs["input_type_ids"][:, :10], | |
| tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], | |
| [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) | |
| # .bert_pack_inputs() | |
| inputs_2 = tf.constant(["d xy", "xy abc"]) | |
| token_ids_2 = preprocess.tokenize(inputs_2) | |
| bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2], | |
| seq_length=256) | |
| self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256]) | |
| self.assertAllEqual( | |
| bert_inputs["input_word_ids"][:, :10], | |
| tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0], | |
| [2, 6, 4, 5, 4, 3, 7, 6, 3, 0]])) | |
| self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256]) | |
| self.assertAllEqual( | |
| bert_inputs["input_mask"][:, :10], | |
| tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], | |
| [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])) | |
| self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256]) | |
| self.assertAllEqual( | |
| bert_inputs["input_type_ids"][:, :10], | |
| tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0], | |
| [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]])) | |
| # For BertTokenizer only: repeat relevant parts for do_lower_case=False, | |
| # default_seq_length=10, experimental_disable_assert=False, | |
| # tokenize_with_offsets=False, and without folding the word/subword dimension. | |
  def test_cased_length10(self):
    """BertTokenizer variant: cased, seq_length=10, no offsets, asserts on."""
    preprocess = tf.saved_model.load(
        self._do_export(["d", "##ef", "abc", "ABC"],
                        do_lower_case=False,
                        default_seq_length=10,
                        tokenize_with_offsets=False,
                        use_sp_model=False,
                        experimental_disable_assert=False))
    inputs = tf.constant(["abc def", "ABC DEF"])
    # Cased model: "ABC" is in-vocab (id 7) but "DEF" maps to [UNK] (id 1).
    token_ids = preprocess.tokenize(inputs)
    self.assertAllEqual(token_ids,
                        tf.ragged.constant([[[6], [4, 5]], [[7], [1]]]))

    # tokenize_with_offsets=False must omit the callable entirely.
    self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))

    # Root callable pads/truncates to the default seq_length of 10.
    bert_inputs = preprocess(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    inputs_2 = tf.constant(["d ABC", "ABC abc"])
    token_ids_2 = preprocess.tokenize(inputs_2)
    bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
    # Test default seq_length=10.
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
                     [2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                     [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))
| # XLA requires fixed shapes for tensors found in graph mode. | |
| # Statically known shapes in Python are a particularly firm way to | |
| # guarantee that, and they are generally more convenient to work with. | |
| # We test that the exported SavedModel plays well with TF's shape | |
| # inference when applied to fully or partially known input shapes. | |
| def test_shapes(self, use_sp_model): | |
| preprocess = tf.saved_model.load( | |
| self._do_export( | |
| ["abc", "def"], | |
| do_lower_case=True, | |
| # TODO(b/181866850): drop this. | |
| tokenize_with_offsets=not use_sp_model, | |
| # TODO(b/175369555): drop this. | |
| experimental_disable_assert=True, | |
| use_sp_model=use_sp_model)) | |
| def expected_bert_input_shapes(batch_size, seq_length): | |
| return dict( | |
| input_word_ids=[batch_size, seq_length], | |
| input_mask=[batch_size, seq_length], | |
| input_type_ids=[batch_size, seq_length]) | |
| for batch_size in [7, None]: | |
| if use_sp_model: | |
| token_out_shape = [batch_size, None] # No word/subword distinction. | |
| else: | |
| token_out_shape = [batch_size, None, None] | |
| self.assertEqual( | |
| _result_shapes_in_tf_function(preprocess.tokenize, | |
| tf.TensorSpec([batch_size], tf.string)), | |
| token_out_shape, "with batch_size=%s" % batch_size) | |
| # TODO(b/181866850): Enable tokenize_with_offsets when it works and test. | |
| if use_sp_model: | |
| self.assertFalse(hasattr(preprocess, "tokenize_with_offsets")) | |
| else: | |
| self.assertEqual( | |
| _result_shapes_in_tf_function( | |
| preprocess.tokenize_with_offsets, | |
| tf.TensorSpec([batch_size], tf.string)), [token_out_shape] * 3, | |
| "with batch_size=%s" % batch_size) | |
| self.assertEqual( | |
| _result_shapes_in_tf_function( | |
| preprocess.bert_pack_inputs, | |
| [tf.RaggedTensorSpec([batch_size, None, None], tf.int32)] * 2, | |
| seq_length=256), expected_bert_input_shapes(batch_size, 256), | |
| "with batch_size=%s" % batch_size) | |
| self.assertEqual( | |
| _result_shapes_in_tf_function(preprocess, | |
| tf.TensorSpec([batch_size], tf.string)), | |
| expected_bert_input_shapes(batch_size, 128), | |
| "with batch_size=%s" % batch_size) | |
  def test_reexport(self, use_sp_model):
    """Test that preprocess keeps working after another save/load cycle."""
    path1 = self._do_export(
        ["d", "ef", "abc", "xy"],
        do_lower_case=True,
        default_seq_length=10,
        tokenize_with_offsets=False,
        experimental_disable_assert=True,  # TODO(b/175369555): drop this.
        use_sp_model=use_sp_model)
    path2 = path1.rstrip("/") + ".2"
    model1 = tf.saved_model.load(path1)
    tf.saved_model.save(model1, path2)
    # Delete the first SavedModel to test that the second one loads by itself.
    # https://github.com/tensorflow/tensorflow/issues/46456 reports such a
    # failure case for BertTokenizer.
    tf.io.gfile.rmtree(path1)
    model2 = tf.saved_model.load(path2)

    # The re-exported model must still pack the same ids/mask/type_ids as the
    # original export did for these inputs.
    inputs = tf.constant(["abc d ef", "ABC D EF d"])
    bert_inputs = model2(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
  def test_preprocessing_for_mlm(self, use_bert):
    """Combines both SavedModel types and TF.text helpers for MLM."""
    # Create the preprocessing SavedModel with a [MASK] token.
    non_special_tokens = [
        "hello", "world", "nice", "movie", "great", "actors", "quick", "fox",
        "lazy", "dog"
    ]
    preprocess = tf.saved_model.load(
        self._do_export(
            non_special_tokens,
            do_lower_case=True,
            tokenize_with_offsets=use_bert,  # TODO(b/181866850): drop this.
            experimental_disable_assert=True,  # TODO(b/175369555): drop this.
            add_mask_token=True,
            use_sp_model=not use_bert))
    # BERT vocabs add 5 special tokens; SentencePiece models add 7.
    vocab_size = len(non_special_tokens) + (5 if use_bert else 7)

    # Create the encoder SavedModel with an .mlm subobject.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert_config=use_bert,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        vocab_size=vocab_size)
    _, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(  # Not used below.
        self.get_temp_dir(), use_sp_model=not use_bert)
    encoder_export_path = os.path.join(self.get_temp_dir(), "encoder_export")
    export_tfhub_lib.export_model(
        export_path=encoder_export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    encoder = tf.saved_model.load(encoder_export_path)

    # Get special tokens from the vocab (and vocab size).
    special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
    self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
    padding_id = int(special_tokens_dict["padding_id"])
    self.assertEqual(padding_id, 0)
    start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])
    self.assertEqual(start_of_sequence_id, 2)
    end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
    self.assertEqual(end_of_segment_id, 3)
    mask_id = int(special_tokens_dict["mask_id"])
    self.assertEqual(mask_id, 4)

    # A batch of 3 segment pairs.
    raw_segments = [
        tf.constant(["hello", "nice movie", "quick fox"]),
        tf.constant(["world", "great actors", "lazy dog"])
    ]
    batch_size = 3

    # Misc hyperparameters.
    seq_length = 10
    max_selections_per_seq = 2

    # Tokenize inputs.
    tokenized_segments = [preprocess.tokenize(s) for s in raw_segments]
    # Trim inputs to eventually fit seq_length.
    # One [CLS] plus one [SEP] per segment.
    num_special_tokens = len(raw_segments) + 1
    trimmed_segments = text.WaterfallTrimmer(
        seq_length - num_special_tokens).trim(tokenized_segments)
    # Combine input segments into one input sequence.
    input_ids, segment_ids = text.combine_segments(
        trimmed_segments,
        start_of_sequence_id=start_of_sequence_id,
        end_of_segment_id=end_of_segment_id)
    # Apply random masking controlled by policy objects.
    (masked_input_ids, masked_lm_positions,
     masked_ids) = text.mask_language_model(
         input_ids=input_ids,
         item_selector=text.RandomItemSelector(
             max_selections_per_seq,
             selection_rate=0.5,  # Adjusted for the short test examples.
             unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
         mask_values_chooser=text.MaskValuesChooser(
             vocab_size=vocab_size,
             mask_token=mask_id,
             # Always put [MASK] to have a predictable result.
             mask_token_rate=1.0,
             random_token_rate=0.0))
    # Pad to fixed-length Transformer encoder inputs.
    input_word_ids, _ = text.pad_model_inputs(
        masked_input_ids, seq_length, pad_value=padding_id)
    input_type_ids, input_mask = text.pad_model_inputs(
        segment_ids, seq_length, pad_value=0)
    masked_lm_positions, _ = text.pad_model_inputs(
        masked_lm_positions, max_selections_per_seq, pad_value=0)
    masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
    num_predictions = int(tf.shape(masked_lm_positions)[1])

    # Test transformer inputs.
    self.assertEqual(num_predictions, max_selections_per_seq)
    expected_word_ids = np.array([
        # [CLS] hello [SEP] world [SEP]
        [2, 5, 3, 6, 3, 0, 0, 0, 0, 0],
        # [CLS] nice movie [SEP] great actors [SEP]
        [2, 7, 8, 3, 9, 10, 3, 0, 0, 0],
        # [CLS] quick fox [SEP] lazy dog [SEP]
        [2, 11, 12, 3, 13, 14, 3, 0, 0, 0]
    ])
    # Overlay the randomly chosen [MASK] positions on the expected ids.
    for i in range(batch_size):
      for j in range(num_predictions):
        k = int(masked_lm_positions[i, j])
        if k != 0:
          expected_word_ids[i, k] = 4  # [MASK]
    self.assertAllEqual(input_word_ids, expected_word_ids)

    # Call the MLM head of the Transformer encoder.
    mlm_inputs = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids,
        masked_lm_positions=masked_lm_positions,
    )
    mlm_outputs = encoder.mlm(mlm_inputs)
    self.assertEqual(mlm_outputs["pooled_output"].shape,
                     (batch_size, hidden_size))
    self.assertEqual(mlm_outputs["sequence_output"].shape,
                     (batch_size, seq_length, hidden_size))
    self.assertEqual(mlm_outputs["mlm_logits"].shape,
                     (batch_size, num_predictions, vocab_size))
    self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)

    # A real trainer would now compute the loss of mlm_logits
    # trying to predict the masked_ids.
    del masked_ids  # Unused.
| def test_special_tokens_in_estimator(self, use_sp_model): | |
| """Tests getting special tokens without an Eager init context.""" | |
| preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"], | |
| do_lower_case=True, | |
| use_sp_model=use_sp_model, | |
| tokenize_with_offsets=False) | |
| def _get_special_tokens_dict(obj): | |
| """Returns special tokens of restored tokenizer as Python values.""" | |
| if tf.executing_eagerly(): | |
| special_tokens_numpy = { | |
| k: v.numpy() for k, v in obj.get_special_tokens_dict() | |
| } | |
| else: | |
| with tf.Graph().as_default(): | |
| # This code expects `get_special_tokens_dict()` to be a tf.function | |
| # with no dependencies (bound args) from the context it was loaded in, | |
| # and boldly assumes that it can just be called in a dfferent context. | |
| special_tokens_tensors = obj.get_special_tokens_dict() | |
| with tf.compat.v1.Session() as sess: | |
| special_tokens_numpy = sess.run(special_tokens_tensors) | |
| return { | |
| k: v.item() # Numpy to Python. | |
| for k, v in special_tokens_numpy.items() | |
| } | |
| def input_fn(): | |
| self.assertFalse(tf.executing_eagerly()) | |
| # Build a preprocessing Model. | |
| sentences = tf_keras.layers.Input(shape=[], dtype=tf.string) | |
| preprocess = tf.saved_model.load(preprocess_export_path) | |
| tokenize = hub.KerasLayer(preprocess.tokenize) | |
| special_tokens_dict = _get_special_tokens_dict(tokenize.resolved_object) | |
| for k, v in special_tokens_dict.items(): | |
| self.assertIsInstance(v, int, "Unexpected type for {}".format(k)) | |
| tokens = tokenize(sentences) | |
| packed_inputs = layers.BertPackInputs( | |
| 4, special_tokens_dict=special_tokens_dict)( | |
| tokens) | |
| preprocessing = tf_keras.Model(sentences, packed_inputs) | |
| # Map the dataset. | |
| ds = tf.data.Dataset.from_tensors( | |
| (tf.constant(["abc", "D EF"]), tf.constant([0, 1]))) | |
| ds = ds.map(lambda features, labels: (preprocessing(features), labels)) | |
| return ds | |
| def model_fn(features, labels, mode): | |
| del labels # Unused. | |
| return tf_estimator.EstimatorSpec( | |
| mode=mode, predictions=features["input_word_ids"]) | |
| estimator = tf_estimator.Estimator(model_fn=model_fn) | |
| outputs = list(estimator.predict(input_fn)) | |
| self.assertAllEqual(outputs, np.array([[2, 6, 3, 0], [2, 4, 5, 3]])) | |
| # TODO(b/175369555): Remove that code and its test. | |
| def test_check_no_assert(self, use_sp_model): | |
| """Tests the self-check during export without assertions.""" | |
| preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"], | |
| do_lower_case=True, | |
| use_sp_model=use_sp_model, | |
| tokenize_with_offsets=False, | |
| experimental_disable_assert=False) | |
| with self.assertRaisesRegex(AssertionError, | |
| r"failed to suppress \d+ Assert ops"): | |
| export_tfhub_lib._check_no_assert(preprocess_export_path) | |
def _result_shapes_in_tf_function(fn, *args, **kwargs):
  """Returns shapes (as lists) observed on the result of `fn`.

  Args:
    fn: A callable.
    *args: TensorSpecs for Tensor-valued arguments and actual values for
      Python-valued arguments to fn.
    **kwargs: Same for keyword arguments.

  Returns:
    The nest of partial tensor shapes (as lists) that is statically known inside
    tf.function(fn)(*args, **kwargs) for the nest of its results.
  """
  # Use a captured mutable container for a side output from the wrapper.
  uninitialized = "uninitialized!"
  result_shapes_container = [uninitialized]
  assert result_shapes_container[0] is uninitialized

  # Bug fix: the wrapper must be a tf.function — plain Python functions have
  # no get_concrete_function() method, so the call below would raise
  # AttributeError.
  @tf.function
  def shape_reporting_wrapper(*args, **kwargs):
    result = fn(*args, **kwargs)
    result_shapes_container[0] = tf.nest.map_structure(
        lambda x: x.shape.as_list(), result)
    return result

  # Tracing with the given TensorSpecs records the statically inferred shapes
  # as a side effect; the traced function itself is never executed.
  shape_reporting_wrapper.get_concrete_function(*args, **kwargs)
  assert result_shapes_container[0] is not uninitialized
  return result_shapes_container[0]
if __name__ == "__main__":
  # Discovers and runs all tf.test.TestCase tests defined in this module.
  tf.test.main()