# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.create_pretraining_data."""
import random

# NOTE(review): `tf_keras` is not referenced below — presumably imported for
# its import-time side effects (Keras 2 registration); confirm before removing.
import tensorflow as tf, tf_keras

from official.nlp.data import create_pretraining_data as cpd

# Tiny stand-in vocabulary: masking may replace a token with a random word
# drawn from this list, so the assertions below treat membership here as valid.
_VOCAB_WORDS = ["vocab_1", "vocab_2"]
class CreatePretrainingDataTest(tf.test.TestCase):
  """Unit tests for masked-LM example generation in create_pretraining_data."""

  def assertTokens(self, input_tokens, output_tokens, masked_positions,
                   masked_labels):
    """Asserts that the masking output is a valid masking of the input.

    Args:
      input_tokens: The original, unmasked token sequence.
      output_tokens: The token sequence after masking.
      masked_positions: Indices into `output_tokens` that were masked.
      masked_labels: The original tokens at those positions.
    """
    # Ensure the masked positions are unique.
    self.assertCountEqual(masked_positions, set(masked_positions))

    # Ensure we can reconstruct the input from the output.
    # BUG FIX: work on a copy. The original code did
    # `reconstructed_tokens = output_tokens`, which aliases the caller's
    # list; writing the labels back restored `output_tokens` in place, so
    # the per-slot validity check below compared each slot against itself
    # and could never fail.
    reconstructed_tokens = list(output_tokens)
    for pos, label in zip(masked_positions, masked_labels):
      reconstructed_tokens[pos] = label
    self.assertEqual(input_tokens, reconstructed_tokens)

    # Ensure each masked slot holds a valid value: the [MASK] token, a
    # random vocabulary word, or the unchanged original token.
    for pos in masked_positions:
      output_token = output_tokens[pos]
      if (output_token == "[MASK]" or output_token in _VOCAB_WORDS or
          output_token == input_tokens[pos]):
        continue
      self.fail("invalid mask value: {}".format(output_token))

  def test_tokens_to_grams(self):
    """_tokens_to_grams groups word pieces (##-prefixed continuations) into
    [start, end) word spans, skipping special tokens like [CLS]/[SEP]."""
    tests = [
        (["That", "cone"], [(0, 1), (1, 2)]),
        (["That", "cone", "##s"], [(0, 1), (1, 3)]),
        (["Swit", "##zer", "##land"], [(0, 3)]),
        (["[CLS]", "Up", "##dog"], [(1, 3)]),
        (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
    ]
    for inp, expected in tests:
      output = cpd._tokens_to_grams(inp)
      self.assertEqual(expected, output)

  def test_window(self):
    """_window yields every contiguous run of the given length; a window
    longer than the input yields nothing."""
    input_list = [1, 2, 3, 4]
    window_outputs = [
        (1, [[1], [2], [3], [4]]),
        (2, [[1, 2], [2, 3], [3, 4]]),
        (3, [[1, 2, 3], [2, 3, 4]]),
        (4, [[1, 2, 3, 4]]),
        (5, []),
    ]
    for window, expected in window_outputs:
      output = cpd._window(input_list, window)
      self.assertEqual(expected, list(output))

  def test_create_masked_lm_predictions(self):
    """Per-token masking honors max_predictions_per_seq and produces a
    reconstructable, valid masking."""
    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
    rng = random.Random(123)  # Fixed seed keeps the test deterministic.
    for _ in range(0, 5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=3,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=False,
              max_ngram_size=None))
      self.assertLen(masked_positions, 3)
      self.assertLen(masked_labels, 3)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)

  def test_create_masked_lm_predictions_whole_word(self):
    """Whole-word masking never splits a word across the mask boundary."""
    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
    rng = random.Random(345)  # Fixed seed keeps the test deterministic.
    for _ in range(0, 5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=3,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=True,
              max_ngram_size=None))
      # Since we can't get exactly three tokens without breaking a word we
      # only take two.
      self.assertLen(masked_positions, 2)
      self.assertLen(masked_labels, 2)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
      # Ensure that we took an entire word.
      self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])

  def test_create_masked_lm_predictions_ngram(self):
    """N-gram masking on a long sequence fills the full prediction budget."""
    tokens = ["[CLS]"] + ["tok{}".format(i) for i in range(0, 512)] + ["[SEP]"]
    rng = random.Random(345)  # Fixed seed keeps the test deterministic.
    for _ in range(0, 5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=76,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=True,
              max_ngram_size=3))
      self.assertLen(masked_positions, 76)
      self.assertLen(masked_labels, 76)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
# Run the tests when executed as a script (tf.test.main wraps unittest.main).
if __name__ == "__main__":
  tf.test.main()