|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import base64 |
|
|
import json |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
|
|
|
import pytest |
|
|
|
|
|
from nemo.export.tiktoken_tokenizer import TiktokenTokenizer, reload_mergeable_ranks |
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_vocab_file():
    """Create a temporary tiktoken-style vocabulary JSON file.

    The vocab holds 256 single-byte tokens (ranks 0-255, so any byte
    sequence is encodable) plus two multi-byte tokens: ``b"Hello"``
    (rank 256) and ``b"World"`` (rank 257).

    Yields:
        str: Path to the JSON file; the file is removed on teardown.
    """
    # One entry per byte value so the tokenizer can fall back to bytes.
    vocab_data = [
        {"rank": i, "token_bytes": base64.b64encode(bytes([i])).decode('utf-8'), "token_str": f"token_{i}"}
        for i in range(256)
    ]

    vocab_data.extend(
        [
            {"rank": 256, "token_bytes": base64.b64encode(b"Hello").decode('utf-8'), "token_str": "Hello"},
            {"rank": 257, "token_bytes": base64.b64encode(b"World").decode('utf-8'), "token_str": "World"},
        ]
    )

    # delete=False so the file outlives the context manager and can be
    # reopened by name from the code under test.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
        json.dump(vocab_data, f)
        temp_path = f.name

    yield temp_path
    # missing_ok: don't let teardown raise (and mask a test failure) if a
    # test already removed the file.
    Path(temp_path).unlink(missing_ok=True)
|
|
|
|
|
|
|
|
def test_reload_mergeable_ranks(sample_vocab_file):
    """The vocab file loads into a bytes->rank mapping of the expected size."""
    mergeable_ranks = reload_mergeable_ranks(sample_vocab_file)
    # 256 single-byte tokens plus the two merged tokens.
    assert len(mergeable_ranks) == 258
    assert mergeable_ranks[b"Hello"] == 256
    assert mergeable_ranks[b"World"] == 257
|
|
|
|
|
|
|
|
def test_tokenizer_initialization(sample_vocab_file):
    """Special-token ids take their expected default values."""
    tok = TiktokenTokenizer(sample_vocab_file)
    expected_ids = {"bos_token_id": 1, "eos_token_id": 2, "pad_id": 2}
    for attr, value in expected_ids.items():
        assert getattr(tok, attr) == value
|
|
|
|
|
|
|
|
def test_encode_decode(sample_vocab_file):
    """encode() returns a list of ints; decode() returns a string."""
    tok = TiktokenTokenizer(sample_vocab_file)
    token_ids = tok.encode("Hello World")
    round_trip = tok.decode(token_ids)

    assert isinstance(token_ids, list)
    for token_id in token_ids:
        assert isinstance(token_id, int)
    assert isinstance(round_trip, str)
|
|
|
|
|
|
|
|
def test_batch_decode(sample_vocab_file):
    """batch_decode() on a nested token list yields a string."""
    tok = TiktokenTokenizer(sample_vocab_file)
    result = tok.batch_decode([[1000, 1001, 1002]])
    assert isinstance(result, str)
|
|
|
|
|
|
|
|
def test_special_token_handling(sample_vocab_file):
    """Decoding a sequence wrapped in BOS/EOS produces non-empty text."""
    tok = TiktokenTokenizer(sample_vocab_file)

    wrapped = [tok.bos_token_id, 1000, 1001, tok.eos_token_id]
    assert tok.decode(wrapped) != ""
|
|
|
|
|
|
|
|
def test_empty_decode(sample_vocab_file):
    """Decoding only the special tokens yields the empty string."""
    tok = TiktokenTokenizer(sample_vocab_file)

    only_specials = [tok.bos_token_id, tok.eos_token_id]
    assert tok.decode(only_specials) == ""
|
|
|
|
|
|
|
|
def test_batch_decode_numpy_tensor(sample_vocab_file):
    """batch_decode() accepts numpy arrays and torch tensors identically."""
    import numpy as np
    import torch

    tok = TiktokenTokenizer(sample_vocab_file)
    ids = [[1000, 1001, 1002]]

    from_numpy = tok.batch_decode(np.array(ids))
    from_torch = tok.batch_decode(torch.tensor(ids))

    assert isinstance(from_numpy, str)
    assert isinstance(from_torch, str)
    # Same token ids must decode the same regardless of container type.
    assert from_numpy == from_torch
|
|
|