|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import string |
|
|
|
|
|
import numpy as np |
|
|
import pytest |
|
|
|
|
|
from nemo.collections.common.tokenizers.column_coder import CategoryCode, ColumnCodes, FloatCode, IntCode |
|
|
|
|
|
|
|
|
class TestColumnCoder:
    """Unit tests for the coders imported from ``column_coder``.

    Each test fits a coder on data drawn from NumPy's global RNG seeded with
    1234 and then checks hard-coded token ids and decoded strings. The
    expected values are presumably tied to that seed and to the exact order
    and size of the random draws, so keep both unchanged when editing.
    """

    @staticmethod
    def _assert_round_trip(coder, text, expected_ids, expected_text):
        """Encode ``text``, compare the ids, then compare the decoded string.

        Args:
            coder: object exposing ``encode(str)`` and ``decode(ids)``.
            text: the string handed to ``coder.encode``.
            expected_ids: token ids ``encode`` must return (order matters).
            expected_text: string ``decode`` must return for those ids.
        """
        token_ids = coder.encode(text)
        assert np.array_equal(np.array(token_ids), np.array(expected_ids))
        assert coder.decode(token_ids) == expected_text

    @pytest.mark.unit
    def test_float(self):
        """Check ``FloatCode`` ids, decoding, code ranges and NaN handling."""
        np.random.seed(1234)
        series = np.random.random(100)

        # NOTE(review): the positional arguments appear to be
        # (name, code_len, start_id, fillall, base, hasnan[, transform]);
        # confirm against the FloatCode signature.

        # Transform left at its default.
        float_coder = FloatCode('t', 5, 0, False, 10, True)
        float_coder.compute_code(data_series=series)
        self._assert_round_trip(float_coder, '0.323', [40, 32, 25, 17, 3], '0.32290')
        # 1.323 lies outside the fitted samples (np.random.random draws from
        # [0, 1)), and the expected decode is not the input value.
        self._assert_round_trip(float_coder, '1.323', [41, 30, 20, 10, 0], '0.99208')
        self._assert_round_trip(float_coder, 'nan', [42, 39, 29, 19, 9], 'nan')

        # 'yeo-johnson' transform.
        float_coder = FloatCode('t', 5, 0, False, 10, True, 'yeo-johnson')
        float_coder.compute_code(data_series=series)
        self._assert_round_trip(float_coder, '0.323', [41, 30, 25, 14, 5], '0.32300')
        self._assert_round_trip(float_coder, '1.323', [43, 39, 21, 16, 3], '1.08064')
        self._assert_round_trip(float_coder, 'nan', [44, 39, 29, 19, 9], 'nan')

        # 'robust' transform.
        float_coder = FloatCode('t', 5, 0, False, 10, True, 'robust')
        float_coder.compute_code(data_series=series)
        self._assert_round_trip(float_coder, '0.323', [40, 37, 24, 10, 8], '0.32299')
        self._assert_round_trip(float_coder, '1.323', [42, 30, 27, 19, 3], '0.89536')
        self._assert_round_trip(float_coder, 'nan', [43, 39, 29, 19, 9], 'nan')

        # Fourth argument True with base 377, NaN enabled: the leading code
        # position is expected to span one id more than the other positions.
        float_coder = FloatCode('t', 5, 0, True, 377, True)
        float_coder.compute_code(data_series=series)
        self._assert_round_trip(float_coder, '0.323', [1508, 1228, 765, 663, 194], '0.32299999994')
        self._assert_round_trip(float_coder, 'nan', [1885, 1507, 1130, 753, 376], 'nan')
        assert float_coder.end_id == 1886
        expected_ranges = [(1508, 1886), (1131, 1508), (754, 1131), (377, 754), (0, 377)]
        for position, expected in enumerate(expected_ranges):
            assert float_coder.code_range[position] == expected

        # Same configuration with the last argument False: the leading
        # position loses that extra id and encoding 'nan' must be rejected.
        float_coder = FloatCode('t', 5, 0, True, 377, False)
        float_coder.compute_code(data_series=series)
        assert float_coder.end_id == 1885
        expected_ranges = [(1508, 1885), (1131, 1508), (754, 1131), (377, 754), (0, 377)]
        for position, expected in enumerate(expected_ranges):
            assert float_coder.code_range[position] == expected

        # pytest.raises fails the test when nothing is raised; a plain
        # try/except would let a missing ValueError pass silently.
        with pytest.raises(ValueError) as exc_info:
            float_coder.encode('nan')
        assert str(exc_info.value) == 'colum t cannot handle nan, please set hasnan=True'

    @pytest.mark.unit
    def test_int(self):
        """Check ``IntCode`` round trips for an integer string and 'nan'."""
        np.random.seed(1234)
        array = np.random.randint(3, 1000, 100)
        int_coder = IntCode('i', 3, 0, False, 16, True)
        int_coder.compute_code(array)

        self._assert_round_trip(int_coder, '232', [32, 30, 2], '232')
        self._assert_round_trip(int_coder, 'nan', [36, 31, 15], 'nan')

    @pytest.mark.unit
    def test_category(self):
        """Check that ``CategoryCode`` maps a category to one id and back."""
        np.random.seed(1234)
        alphabet = np.array(list(string.ascii_lowercase + ' '))
        # Two-character categories built from two independent draws.
        array = np.char.add(np.random.choice(alphabet, 1000), np.random.choice(alphabet, 1000))

        category_coder = CategoryCode('c', 0)
        category_coder.compute_code(array)

        self._assert_round_trip(category_coder, 'xy', [509], 'xy')

    @pytest.mark.unit
    def test_column_coder(self):
        """Check ``ColumnCodes`` built from per-column configs and sample data."""
        column_configs = [
            {
                "name": "col_a",
                "code_type": "float",
                "args": {"code_len": 4, "base": 16, "fillall": False, "hasnan": True, "transform": 'yeo-johnson'},
            },
            {
                "name": "col_b",
                "code_type": "float",
                "args": {"code_len": 4, "base": 177, "fillall": True, "hasnan": True, "transform": 'quantile'},
            },
            {
                "name": "col_c",
                "code_type": "int",
                "args": {"code_len": 3, "base": 12, "fillall": True, "hasnan": True},
            },
            {"name": "col_d", "code_type": "category",},
        ]

        # The draws below share one seeded RNG stream, so their order is part
        # of the fixture: col_a, col_b, col_c, then the two col_d draws.
        np.random.seed(1234)
        example_arrays = {}
        example_arrays['col_a'] = np.random.random(100)
        example_arrays['col_b'] = np.random.random(100)
        example_arrays['col_c'] = np.random.randint(3, 1000, 100)
        alphabet = np.array(list(string.ascii_lowercase + ' '))
        example_arrays['col_d'] = np.char.add(
            np.random.choice(alphabet, 1000), np.random.choice(alphabet, 1000)
        )

        cc = ColumnCodes.get_column_codes(column_configs, example_arrays)

        # (column, input text, expected ids, expected decoded text)
        cases = [
            ('col_a', '0.323', [49, 32, 29, 15], '0.3230'),
            ('col_b', '0.323', [584, 457, 235, 110], '0.3229999'),
            ('col_c', '232', [787, 780, 773], '232'),
            ('col_d', 'xy', [1313], 'xy'),
        ]
        for column, text, expected_ids, expected_text in cases:
            token_ids = cc.encode(column, text)
            assert np.array_equal(np.array(token_ids), np.array(expected_ids))
            assert cc.decode(column, token_ids) == expected_text
|
|
|
|
|
|
|
|
|