|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import tempfile |
|
|
import unittest |
|
|
from tempfile import TemporaryDirectory |
|
|
|
|
|
from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast |
|
|
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow |
|
|
from transformers.utils import is_torch_available |
|
|
|
|
|
from ...test_tokenization_common import TokenizerTesterMixin |
|
|
|
|
|
|
|
|
# SentencePiece fixture models shipped with the test suite:
# a small unigram model used by most tests, and a BPE model used to check
# slow/fast tokenizer parity on BPE vocabularies.
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
SAMPLE_BPE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")

# Tensor framework for `return_tensors`: prefer PyTorch when it is installed,
# otherwise fall back to TensorFlow.
FRAMEWORK = "pt" if is_torch_available() else "tf"
|
|
|
|
|
|
|
|
@require_sentencepiece
@require_tokenizers
class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    """Test suite for the CamemBERT tokenizers.

    Runs the common ``TokenizerTesterMixin`` checks plus CamemBERT-specific
    tests against both the slow (SentencePiece-based) ``CamembertTokenizer``
    and the fast (Rust-based) ``CamembertTokenizerFast``.
    """

    from_pretrained_id = "almanach/camembert-base"
    tokenizer_class = CamembertTokenizer
    rust_tokenizer_class = CamembertTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    @classmethod
    def setUpClass(cls):
        super().setUpClass()

        # Build a tokenizer from the small SentencePiece fixture and save it to
        # the mixin's temp dir so the shared tests can reload it from disk.
        tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(cls.tmpdirname)

    @unittest.skip(
        "Token maps are not equal because someone set the probability of ('<unk>NOTUSED', -100), so it's never encoded for fast"
    )
    def test_special_tokens_map_equal(self):
        return

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
        token = "<pad>"
        token_id = 1

        # Build the tokenizer once and check both directions of the mapping.
        tokenizer = self.get_tokenizer()
        self.assertEqual(tokenizer.convert_tokens_to_ids(token), token_id)
        self.assertEqual(tokenizer.convert_ids_to_tokens(token_id), token)

    def test_get_vocab(self):
        """The fixture vocab must expose CamemBERT's special tokens at the expected slots."""
        vocab_keys = list(self.get_tokenizer().get_vocab().keys())

        self.assertEqual(vocab_keys[0], "<s>NOTUSED")
        self.assertEqual(vocab_keys[1], "<pad>")
        self.assertEqual(vocab_keys[-1], "<mask>")
        # 1000 SentencePiece pieces + 5 CamemBERT-specific added tokens.
        self.assertEqual(len(vocab_keys), 1_005)

    def test_vocab_size(self):
        """``vocab_size`` reflects only the SentencePiece model, not added tokens."""
        self.assertEqual(self.get_tokenizer().vocab_size, 1_000)

    def test_rust_and_python_bpe_tokenizers(self):
        """Slow and fast tokenizers built from the same BPE model must produce identical encodings."""
        tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
        # Round-trip through disk so the fast tokenizer is converted from the
        # saved slow files.  (Use tempfile.TemporaryDirectory for consistency
        # with the rest of this class.)
        with tempfile.TemporaryDirectory() as tmpdirname:
            tokenizer.save_pretrained(tmpdirname)
            rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname)

        sequence = "I was born in 92000, and this is falsé."

        # With special tokens.
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

        # Without special tokens.
        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        # Token strings must match as well, not just the ids.
        tokens = tokenizer.convert_ids_to_tokens(ids)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

    def test_rust_and_python_full_tokenizers(self):
        """Slow and fast tokenizers loaded from the unigram fixture must agree on tokens and ids."""
        if not self.test_rust_tokenizer:
            self.skipTest(reason="test_rust_tokenizer is set to False")

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "I was born in 92000, and this is falsé."

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        # Fresh fast tokenizer: check encoding *with* special tokens too.
        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

    @slow
    def test_tokenizer_integration(self):
        """Pin the full encoding of two French sentences against a fixed hub revision."""
        expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

        sequences = [
            "Le transformeur est un modèle d'apprentissage profond introduit en 2017, "
            "utilisé principalement dans le domaine du traitement automatique des langues (TAL).",
            "À l'instar des réseaux de neurones récurrents (RNN), les transformeurs sont conçus "
            "pour gérer des données séquentielles, telles que le langage naturel, pour des tâches "
            "telles que la traduction et la synthèse de texte.",
        ]

        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding,
            model_name="almanach/camembert-base",
            revision="3a0641d9a1aeb7e848a74299e7e4c4bca216b4cf",
            sequences=sequences,
        )

    def test_added_tokens_serialization(self):
        """Check that a custom ``eos_token`` survives every save/load round trip.

        Exercises the slow and fast classes in all combinations:
        hub -> slow -> slow/fast, and hub -> fast -> fast/slow.
        """
        self.maxDiff = None

        def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir):
            # Reload from ``temp_dir`` and verify the new EOS token replaced the
            # old one and the added-vocab mapping was preserved.
            tokenizer = tokenizer_class.from_pretrained(temp_dir)
            # assertNotIn gives a readable failure message (was: assertTrue(... not in ...)).
            self.assertNotIn(str(expected_eos), tokenizer.additional_special_tokens)
            self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
            self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
            return tokenizer

        new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                # Load a slow tokenizer from the hub with the overridden EOS token.
                tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
                EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
                    self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
                    self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))

                with tempfile.TemporaryDirectory() as tmp_dir_2:
                    tokenizer.save_pretrained(tmp_dir_2)
                    with self.subTest(
                        "Hub -> Slow -> Slow: Test saving this slow tokenizer and reloading it in the fast class"
                    ):
                        _test_added_vocab_and_eos(
                            EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_2
                        )

                    if self.rust_tokenizer_class is not None:
                        with self.subTest(
                            "Hub -> Slow -> Fast: Test saving this slow tokenizer and reloading it in the fast class"
                        ):
                            tokenizer_fast = _test_added_vocab_and_eos(
                                EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_2
                            )
                            with tempfile.TemporaryDirectory() as tmp_dir_3:
                                tokenizer_fast.save_pretrained(tmp_dir_3)
                                with self.subTest(
                                    "Hub -> Slow -> Fast -> Fast: Test saving this fast tokenizer and reloading it in the fast class"
                                ):
                                    _test_added_vocab_and_eos(
                                        EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
                                    )

                                with self.subTest(
                                    "Hub -> Slow -> Fast -> Slow: Test saving this slow tokenizer and reloading it in the slow class"
                                ):
                                    _test_added_vocab_and_eos(
                                        EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
                                    )

                with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
                    if self.rust_tokenizer_class is not None:
                        tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True)
                        self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                        self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))

                        # Fixed: this subTest was nested twice with the identical
                        # title, which duplicated it in test reports; one level
                        # is enough.
                        with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
                            self.assertTrue(
                                all(
                                    item in tokenizer.added_tokens_decoder.items()
                                    for item in EXPECTED_ADDED_TOKENS_DECODER.items()
                                )
                            )

                        EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
                        with tempfile.TemporaryDirectory() as tmp_dir_4:
                            tokenizer_fast.save_pretrained(tmp_dir_4)
                            with self.subTest("Hub -> Fast -> Fast: saving Fast1 locally and loading"):
                                _test_added_vocab_and_eos(
                                    EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_4
                                )

                            with self.subTest("Hub -> Fast -> Slow: saving Fast1 locally and loading"):
                                _test_added_vocab_and_eos(
                                    EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_4
                                )
|
|
|