# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
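"""Unit tests for nemo.export.sentencepiece_tokenizer.SentencePieceTokenizer.

Run with ``python -m unittest`` or by executing this file directly.
"""
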
import os
import shutil
import tempfile
import unittest
from unittest.mock import MagicMock

import numpy as np
import sentencepiece
import torch
from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer


class TestSentencePieceTokenizer(unittest.TestCase):
@classmethod
def setUpClass(cls):
# Create a temporary directory for test files
cls.test_dir = tempfile.mkdtemp()
# Create a simple sentencepiece model for testing
with open(os.path.join(cls.test_dir, "test.txt"), "w") as f:
f.write("Hello world\nThis is a test\n")
        # Train a tiny BPE sentencepiece model; with such a small corpus the
        # requested vocab size may be unreachable, so treat it as a soft limit
        sentencepiece.SentencePieceTrainer.Train(
            f'--input={os.path.join(cls.test_dir, "test.txt")} '
            f'--model_prefix={os.path.join(cls.test_dir, "test_model")} '
            '--vocab_size=55 --model_type=bpe --hard_vocab_limit=false'
        )
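        # Training writes test_model.model and test_model.vocab into test_dir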
cls.model_path = os.path.join(cls.test_dir, "test_model.model")

    @classmethod
    def tearDownClass(cls):
        # Clean up temporary files
        shutil.rmtree(cls.test_dir)

    def setUp(self):
self.tokenizer = SentencePieceTokenizer(model_path=self.model_path)

    def test_initialization(self):
# Test initialization with model path
tokenizer = SentencePieceTokenizer(model_path=self.model_path)
self.assertIsNotNone(tokenizer.tokenizer)
self.assertEqual(tokenizer.original_vocab_size, tokenizer.vocab_size)
# Test initialization with invalid model path
with self.assertRaises(ValueError):
SentencePieceTokenizer(model_path="nonexistent.model")
# Test initialization with both model_path and tokenizer
mock_tokenizer = MagicMock()
with self.assertRaises(ValueError):
SentencePieceTokenizer(model_path=self.model_path, tokenizer=mock_tokenizer)
# Test initialization with neither model_path nor tokenizer
with self.assertRaises(ValueError):
SentencePieceTokenizer()

    def test_text_to_tokens(self):
text = "Hello world"
tokens = self.tokenizer.text_to_tokens(text)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))

    def test_encode(self):
text = "Hello world"
ids = self.tokenizer.encode(text)
self.assertIsInstance(ids, list)
self.assertTrue(all(isinstance(i, int) for i in ids))

    def test_tokens_to_text(self):
text = "Hello world"
tokens = self.tokenizer.text_to_tokens(text)
reconstructed_text = self.tokenizer.tokens_to_text(tokens)
self.assertIsInstance(reconstructed_text, str)
self.assertNotEqual(reconstructed_text, "") # Should not be empty

    def test_batch_decode(self):
text = "Hello world"
ids = self.tokenizer.encode(text)
# Test with list
decoded_text = self.tokenizer.batch_decode(ids)
self.assertIsInstance(decoded_text, str)
# Test with numpy array
ids_np = np.array(ids)
decoded_text_np = self.tokenizer.batch_decode(ids_np)
self.assertIsInstance(decoded_text_np, str)
# Test with torch tensor
ids_torch = torch.tensor(ids)
decoded_text_torch = self.tokenizer.batch_decode(ids_torch)
self.assertIsInstance(decoded_text_torch, str)

    def test_token_to_id(self):
text = "Hello"
tokens = self.tokenizer.text_to_tokens(text)
token_id = self.tokenizer.token_to_id(tokens[0])
self.assertIsInstance(token_id, int)

    def test_ids_to_tokens(self):
text = "Hello world"
ids = self.tokenizer.encode(text)
tokens = self.tokenizer.ids_to_tokens(ids)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))

    def test_tokens_to_ids(self):
text = "Hello"
tokens = self.tokenizer.text_to_tokens(text)
ids = self.tokenizer.tokens_to_ids(tokens)
self.assertIsInstance(ids, list)
self.assertTrue(all(isinstance(i, int) for i in ids))
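
    def test_pieces_and_ids_agree(self):
        # Sketch, assuming deterministic (non-sampled) encoding: converting
        # the pieces from text_to_tokens() should yield the same ids as
        # calling encode() on the raw text.
        text = "Hello world"
        tokens = self.tokenizer.text_to_tokens(text)
        self.assertEqual(self.tokenizer.tokens_to_ids(tokens), self.tokenizer.encode(text))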

    def test_legacy_mode(self):
special_tokens = ["[PAD]", "[BOS]", "[EOS]"]
tokenizer = SentencePieceTokenizer(model_path=self.model_path, special_tokens=special_tokens, legacy=True)
# Test adding special tokens
self.assertGreater(tokenizer.vocab_size, tokenizer.original_vocab_size)
# Test special token encoding
text = "Hello [PAD] world"
tokens = tokenizer.text_to_tokens(text)
self.assertIn("[PAD]", tokens)
# Test special token decoding
ids = tokenizer.encode(text)
decoded_text = tokenizer.batch_decode(ids)
self.assertIn("[PAD]", decoded_text)

    def test_properties(self):
# Test pad_id property
self.assertIsInstance(self.tokenizer.pad_id, int)
# Test bos_token_id property
self.assertIsInstance(self.tokenizer.bos_token_id, int)
# Test eos_token_id property
self.assertIsInstance(self.tokenizer.eos_token_id, int)
# Test unk_id property
self.assertIsInstance(self.tokenizer.unk_id, int)

    def test_vocab_property(self):
vocab = self.tokenizer.vocab
self.assertIsInstance(vocab, list)
self.assertTrue(all(isinstance(t, str) for t in vocab))
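
    def test_tokens_are_in_vocab(self):
        # Sketch (hypothetical extra check): every piece produced for plain
        # text is assumed to appear in the exported vocab list.
        vocab = set(self.tokenizer.vocab)
        for token in self.tokenizer.text_to_tokens("Hello world"):
            self.assertIn(token, vocab)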

    def test_convert_ids_to_tokens(self):
text = "Hello world"
ids = self.tokenizer.encode(text)
tokens = self.tokenizer.convert_ids_to_tokens(ids)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))

    def test_convert_tokens_to_string(self):
text = "Hello world"
tokens = self.tokenizer.text_to_tokens(text)
string = self.tokenizer.convert_tokens_to_string(tokens)
self.assertIsInstance(string, str)
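
    def test_encode_decode_round_trip(self):
        # Sketch: decoding the ids produced by encode() should give back a
        # non-empty string; exact equality with the input is not asserted,
        # since detokenization is not guaranteed to be lossless in general.
        text = "Hello world"
        decoded = self.tokenizer.batch_decode(self.tokenizer.encode(text))
        self.assertIsInstance(decoded, str)
        self.assertNotEqual(decoded, "")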

    def test_len(self):
        self.assertEqual(len(self.tokenizer), self.tokenizer.vocab_size)

    def test_is_fast(self):
        self.assertTrue(self.tokenizer.is_fast)

    def test_get_added_vocab(self):
        self.assertIsNone(self.tokenizer.get_added_vocab())


if __name__ == '__main__':
unittest.main()