|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import string |
|
|
|
|
|
import numpy as np |
|
|
import pytest |
|
|
|
|
|
from nemo.collections.common.tokenizers.column_coder import ColumnCodes |
|
|
from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer |
|
|
|
|
|
|
|
|
class TestTabularTokenizer:
    """Unit tests for ``TabularTokenizer`` on a four-column table.

    The table has two float columns (``col_a``, ``col_b``), one int column
    (``col_c``) and one category column (``col_d``).
    """

    def setup_method(self, test_method):
        """Build ``self.cc`` from seeded random example columns before each test.

        The NumPy global seed is fixed and the random draws happen in a fixed
        order so the example data is reproducible; the hard-coded ids asserted
        in the tests presumably depend on this exact data.
        """
        float_col_a = {
            "name": "col_a",
            "code_type": "float",
            "args": {"code_len": 4, "base": 16, "fillall": False, "hasnan": True, "transform": 'yeo-johnson'},
        }
        float_col_b = {
            "name": "col_b",
            "code_type": "float",
            "args": {"code_len": 4, "base": 177, "fillall": True, "hasnan": True, "transform": 'quantile'},
        }
        int_col_c = {
            "name": "col_c",
            "code_type": "int",
            "args": {"code_len": 3, "base": 12, "fillall": True, "hasnan": True},
        }
        category_col_d = {"name": "col_d", "code_type": "category"}
        column_configs = [float_col_a, float_col_b, int_col_c, category_col_d]

        np.random.seed(1234)

        # Draw order matters for reproducibility: col_a, col_b, col_c, then
        # the two character arrays that are concatenated into col_d.
        samples_a = np.random.random(100)
        samples_b = np.random.random(100)
        samples_c = np.random.randint(3, 1000, 100)

        letters = np.array(list(string.ascii_lowercase + ' '))
        first_chars = np.random.choice(letters, 1000)
        second_chars = np.random.choice(letters, 1000)
        samples_d = np.char.add(first_chars, second_chars)

        example_arrays = {
            'col_a': samples_a,
            'col_b': samples_b,
            'col_c': samples_c,
            'col_d': samples_d,
        }

        self.cc = ColumnCodes.get_column_codes(column_configs, example_arrays)

    @pytest.mark.unit
    def test_tabular_tokenizer(self):
        """Check token counts, ids and decoded text for full and partial rows.

        Five inputs are exercised: two complete rows, and a complete final row
        preceded by a partial row holding only the last one, zero, two or
        three columns. Each input is tokenized, encoded to ids, and decoded
        back to text.
        """
        tokenizer = TabularTokenizer(self.cc, delimiter=',')

        # Expected ids for each cell value, as pinned by the assertions below:
        # "0.323" -> col_a, "0.1" -> col_b, "232" -> col_c, "xy" -> col_d.
        ids_col_a = [49, 32, 29, 15]
        ids_col_b = [584, 417, 305, 76]
        ids_col_c = [787, 780, 773]
        ids_col_d = [1313]
        end_of_row = [1352]
        end_of_doc = [1351]
        last_full_row = ids_col_a + ids_col_b + ids_col_c + ids_col_d + end_of_doc

        # --- Two complete rows -------------------------------------------
        text = "0.323, 0.1, 232, xy\n0.323, 0.1, 232, xy<|endoftext|>"
        tokens = tokenizer.text_to_tokens(text)
        assert len(tokens) == 10
        assert tokenizer.eod == 1351
        assert tokenizer.eor == 1352
        assert tokenizer.num_columns == 4
        assert self.cc.vocab_size == 1351
        assert tokenizer.vocab_size == 1353
        ids = tokenizer.text_to_ids(text)
        assert (sum(self.cc.sizes) + 1) * 2 == len(ids)
        first_full_row = ids_col_a + ids_col_b + ids_col_c + ids_col_d + end_of_row
        assert np.array_equal(np.array(ids[0:13]), np.array(first_full_row))
        assert np.array_equal(np.array(ids[13:]), np.array(last_full_row))
        decoded = tokenizer.ids_to_text(ids)
        assert decoded == '0.3230,0.0999998,232,xy\n0.3230,0.0999998,232,xy<|endoftext|>'

        # --- Partial first row: only the last column ---------------------
        text = "xy\n0.323, 0.1, 232, xy<|endoftext|>"
        tokens = tokenizer.text_to_tokens(text)
        assert len(tokens) == 7
        ids = tokenizer.text_to_ids(text)
        assert sum(self.cc.sizes) + 1 + 2 == len(ids)
        assert np.array_equal(np.array(ids[0:2]), np.array(ids_col_d + end_of_row))
        assert np.array_equal(np.array(ids[2:15]), np.array(last_full_row))
        decoded = tokenizer.ids_to_text(ids)
        assert decoded == 'xy\n0.3230,0.0999998,232,xy<|endoftext|>'

        # --- Empty first row (text starts with the row separator) --------
        text = "\n0.323, 0.1, 232, xy<|endoftext|>"
        tokens = tokenizer.text_to_tokens(text)
        assert len(tokens) == 5
        ids = tokenizer.text_to_ids(text)
        assert sum(self.cc.sizes) + 1 == len(ids)
        assert np.array_equal(np.array(ids[0:13]), np.array(last_full_row))
        decoded = tokenizer.ids_to_text(ids)
        assert decoded == '0.3230,0.0999998,232,xy<|endoftext|>'

        # --- Partial first row: last two columns -------------------------
        text = "232, xy\n0.323, 0.1, 232, xy<|endoftext|>"
        tokens = tokenizer.text_to_tokens(text)
        assert len(tokens) == 8
        ids = tokenizer.text_to_ids(text)
        assert sum(self.cc.sizes) + 1 + 5 == len(ids)
        assert np.array_equal(np.array(ids[0:5]), np.array(ids_col_c + ids_col_d + end_of_row))
        assert np.array_equal(np.array(ids[5:18]), np.array(last_full_row))
        decoded = tokenizer.ids_to_text(ids)
        assert decoded == '232,xy\n0.3230,0.0999998,232,xy<|endoftext|>'

        # --- Partial first row: last three columns -----------------------
        text = "0.1, 232, xy\n0.323, 0.1, 232, xy<|endoftext|>"
        tokens = tokenizer.text_to_tokens(text)
        assert len(tokens) == 9
        ids = tokenizer.text_to_ids(text)
        assert sum(self.cc.sizes) + 1 + 9 == len(ids)
        assert np.array_equal(
            np.array(ids[0:9]), np.array(ids_col_b + ids_col_c + ids_col_d + end_of_row)
        )
        assert np.array_equal(np.array(ids[9:22]), np.array(last_full_row))
        decoded = tokenizer.ids_to_text(ids)
        assert decoded == '0.0999998,232,xy\n0.3230,0.0999998,232,xy<|endoftext|>'
|
|
|