# NeMo/tests/collections/nlp/test_tabular_tokenizer.py
# Revision 7934b29 (camenduru): "thanks to NVIDIA ❤"
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
import numpy as np
import pytest
from nemo.collections.common.tokenizers.column_coder import ColumnCodes
from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer
class TestTabularTokenizer:
    """Round-trip checks for ``TabularTokenizer`` on a four-column table (two float, one int, one category)."""

    def setup_method(self, test_method):
        """Build the ``ColumnCodes`` instance used by the test and store it as ``self.cc``.

        The codes are built from the four column configs below together with seeded random sample arrays.
        """
        float_yeo_johnson = {
            "name": "col_a",
            "code_type": "float",
            "args": {"code_len": 4, "base": 16, "fillall": False, "hasnan": True, "transform": 'yeo-johnson'},
        }
        float_quantile = {
            "name": "col_b",
            "code_type": "float",
            "args": {"code_len": 4, "base": 177, "fillall": True, "hasnan": True, "transform": 'quantile'},
        }
        integer_column = {
            "name": "col_c",
            "code_type": "int",
            "args": {"code_len": 3, "base": 12, "fillall": True, "hasnan": True},
        }
        category_column = {"name": "col_d", "code_type": "category"}
        configs = [float_yeo_johnson, float_quantile, integer_column, category_column]

        # Fixed seed: the test asserts hard-coded token ids, which presumably depend on these samples.
        # The order of the random draws below must therefore stay exactly as written.
        np.random.seed(1234)
        samples = {
            'col_a': np.random.random(100),
            'col_b': np.random.random(100),
            'col_c': np.random.randint(3, 1000, 100),
        }
        # Category samples are two-character strings drawn from lowercase letters plus space.
        letters = np.array(list(string.ascii_lowercase + ' '))
        first_chars = np.random.choice(letters, 1000)
        second_chars = np.random.choice(letters, 1000)
        samples['col_d'] = np.char.add(first_chars, second_chars)

        self.cc = ColumnCodes.get_column_codes(configs, samples)

    @pytest.mark.unit
    def test_tabular_tokenizer(self):
        """Check token counts, ids and decoded text for documents that start at different columns.

        Every document ends with the complete row ``0.323, 0.1, 232, xy<|endoftext|>``; what varies is the
        (possibly partial or empty) row placed before it.
        """
        tokenizer = TabularTokenizer(self.cc, delimiter=',')

        # Expected ids for the row "0.323, 0.1, 232, xy", without the special id that follows it.
        row_body = [49, 32, 29, 15, 584, 417, 305, 76, 787, 780, 773, 1313]
        # Same row followed by 1351, the value asserted for ``eod`` below.
        closing_row = np.array(row_body + [1351])

        # Two complete rows.
        doc = "0.323, 0.1, 232, xy\n0.323, 0.1, 232, xy<|endoftext|>"
        assert len(tokenizer.text_to_tokens(doc)) == 10
        assert tokenizer.eod == 1351
        assert tokenizer.eor == 1352
        assert tokenizer.num_columns == 4
        assert self.cc.vocab_size == 1351
        assert tokenizer.vocab_size == 1353
        ids = tokenizer.text_to_ids(doc)
        assert len(ids) == (sum(self.cc.sizes) + 1) * 2
        # The first row is followed by 1352, the value asserted for ``eor`` above.
        assert np.array_equal(np.array(ids[0:13]), np.array(row_body + [1352]))
        assert np.array_equal(np.array(ids[13:]), closing_row)
        assert tokenizer.ids_to_text(ids) == '0.3230,0.0999998,232,xy\n0.3230,0.0999998,232,xy<|endoftext|>'

        # Documents whose first row is partial or empty:
        # (text, expected token count, expected ids before the complete row, expected decoded text)
        partial_cases = [
            (
                "xy\n0.323, 0.1, 232, xy<|endoftext|>",
                7,
                [1313, 1352],
                'xy\n0.3230,0.0999998,232,xy<|endoftext|>',
            ),
            (
                "\n0.323, 0.1, 232, xy<|endoftext|>",
                5,
                [],
                '0.3230,0.0999998,232,xy<|endoftext|>',
            ),
            (
                "232, xy\n0.323, 0.1, 232, xy<|endoftext|>",
                8,
                [787, 780, 773, 1313, 1352],
                '232,xy\n0.3230,0.0999998,232,xy<|endoftext|>',
            ),
            (
                "0.1, 232, xy\n0.323, 0.1, 232, xy<|endoftext|>",
                9,
                [584, 417, 305, 76, 787, 780, 773, 1313, 1352],
                '0.0999998,232,xy\n0.3230,0.0999998,232,xy<|endoftext|>',
            ),
        ]
        for doc, token_count, leading_ids, decoded in partial_cases:
            assert len(tokenizer.text_to_tokens(doc)) == token_count
            ids = tokenizer.text_to_ids(doc)
            offset = len(leading_ids)
            assert len(ids) == sum(self.cc.sizes) + 1 + offset
            if leading_ids:
                assert np.array_equal(np.array(ids[0:offset]), np.array(leading_ids))
            assert np.array_equal(np.array(ids[offset : offset + 13]), closing_row)
            assert tokenizer.ids_to_text(ids) == decoded