File size: 5,283 Bytes

7934b29

# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import string

import numpy as np
import pytest

from nemo.collections.common.tokenizers.column_coder import ColumnCodes
from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer


class TestTabularTokenizer:
    """Unit test for ``TabularTokenizer`` built on a four-column ``ColumnCodes`` schema."""

    def setup_method(self, test_method):
        """Build ``self.cc`` from seeded random example data.

        Four columns are configured: two float columns (``col_a``, ``col_b``),
        one int column (``col_c``) and one category column (``col_d``).

        Args:
            test_method: unused in this body.
        """
        specs = [
            dict(
                name="col_a",
                code_type="float",
                args=dict(code_len=4, base=16, fillall=False, hasnan=True, transform="yeo-johnson"),
            ),
            dict(
                name="col_b",
                code_type="float",
                args=dict(code_len=4, base=177, fillall=True, hasnan=True, transform="quantile"),
            ),
            dict(name="col_c", code_type="int", args=dict(code_len=3, base=12, fillall=True, hasnan=True)),
            dict(name="col_d", code_type="category"),
        ]

        letters = np.array(list(string.ascii_lowercase + ' '))

        # Fixed seed; the draws below must stay in this exact order
        # (col_a, col_b, col_c, then the two halves of col_d) so the
        # generated samples do not change.
        np.random.seed(1234)
        samples = {
            'col_a': np.random.random(100),
            'col_b': np.random.random(100),
            'col_c': np.random.randint(3, 1000, 100),
            # two-character strings: element-wise concatenation of two random letter arrays
            'col_d': np.char.add(np.random.choice(letters, 1000), np.random.choice(letters, 1000)),
        }

        self.cc = ColumnCodes.get_column_codes(specs, samples)

    @pytest.mark.unit
    def test_tabular_tokenizer(self):
        """Tokenize, encode and decode full rows and rows preceded by a partial row.

        The first scenario checks two complete rows plus the tokenizer's
        special ids and vocabulary sizes. The remaining scenarios start with
        a trailing fragment of a row (or just a newline) followed by one
        complete row terminated by ``<|endoftext|>``.
        """
        tokenizer = TabularTokenizer(self.cc, delimiter=',')

        # Expected ids for the row "0.323, 0.1, 232, xy", split per column.
        col_a_ids = [49, 32, 29, 15]
        col_b_ids = [584, 417, 305, 76]
        col_c_ids = [787, 780, 773]
        col_d_ids = [1313]
        row_ids = col_a_ids + col_b_ids + col_c_ids + col_d_ids
        row_then_eor = row_ids + [1352]
        row_then_eod = row_ids + [1351]

        # --- two complete rows ---
        two_rows = "0.323, 0.1, 232, xy\n0.323, 0.1, 232, xy<|endoftext|>"
        tokens = tokenizer.text_to_tokens(two_rows)
        assert len(tokens) == 10
        assert tokenizer.eod == 1351
        assert tokenizer.eor == 1352
        assert tokenizer.num_columns == 4
        assert self.cc.vocab_size == 1351
        assert tokenizer.vocab_size == 1353
        ids = tokenizer.text_to_ids(two_rows)
        assert (sum(self.cc.sizes) + 1) * 2 == len(ids)
        assert np.array_equal(np.array(ids[0:13]), np.array(row_then_eor))
        assert np.array_equal(np.array(ids[13:]), np.array(row_then_eod))
        decoded = tokenizer.ids_to_text(ids)
        assert decoded == '0.3230,0.0999998,232,xy\n0.3230,0.0999998,232,xy<|endoftext|>'

        # --- leading fragment + one complete row ---
        # (input text, expected token count, expected ids of the leading fragment, expected decoded text)
        scenarios = [
            (
                "xy\n0.323, 0.1, 232, xy<|endoftext|>",
                7,
                col_d_ids + [1352],
                'xy\n0.3230,0.0999998,232,xy<|endoftext|>',
            ),
            (
                "\n0.323, 0.1, 232, xy<|endoftext|>",
                5,
                [],
                '0.3230,0.0999998,232,xy<|endoftext|>',
            ),
            (
                "232, xy\n0.323, 0.1, 232, xy<|endoftext|>",
                8,
                col_c_ids + col_d_ids + [1352],
                '232,xy\n0.3230,0.0999998,232,xy<|endoftext|>',
            ),
            (
                "0.1, 232, xy\n0.323, 0.1, 232, xy<|endoftext|>",
                9,
                col_b_ids + col_c_ids + col_d_ids + [1352],
                '0.0999998,232,xy\n0.3230,0.0999998,232,xy<|endoftext|>',
            ),
        ]
        for text, token_count, head_ids, expected_text in scenarios:
            tokens = tokenizer.text_to_tokens(text)
            assert len(tokens) == token_count
            ids = tokenizer.text_to_ids(text)
            head_len = len(head_ids)
            assert sum(self.cc.sizes) + 1 + head_len == len(ids)
            if head_ids:
                assert np.array_equal(np.array(ids[0:head_len]), np.array(head_ids))
            assert np.array_equal(np.array(ids[head_len : head_len + 13]), np.array(row_then_eod))
            decoded = tokenizer.ids_to_text(ids)
            assert decoded == expected_text