|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pytest |
|
|
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import ( |
|
|
any_locale_word_tokenize, |
|
|
english_word_tokenize, |
|
|
) |
|
|
|
|
|
|
|
|
class TestTokenizerUtils:
    """Unit tests for the TTS word-tokenization utilities
    ``english_word_tokenize`` and ``any_locale_word_tokenize``.

    Both tokenizers return a list of ``([tokens...], is_escaped)`` tuples;
    the helper below builds that shape for the common non-escaped case.
    """

    @staticmethod
    def _create_expected_output(words):
        # Wrap every token as a single-element token list flagged as
        # not escaped — the shape both tokenizers produce for plain text.
        wrap = lambda token: ([token], False)
        return list(map(wrap, words))

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_english_word_tokenize(self):
        """Space-separated words split into alternating words and separators."""
        text = "apple banana pear"
        expected = self._create_expected_output(["apple", " ", "banana", " ", "pear"])
        assert english_word_tokenize(text) == expected

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_english_word_tokenize_with_punctuation(self):
        """Punctuation is kept as separator tokens and words are lowercased."""
        text = "Hello, world!"
        expected = self._create_expected_output(["hello", ", ", "world", "!"])
        assert english_word_tokenize(text) == expected

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_english_word_tokenize_with_contractions(self):
        """Apostrophes stay inside words so contractions are single tokens."""
        text = "It's a c'ntr'ction."
        expected = self._create_expected_output(["it's", " ", "a", " ", "c'ntr'ction", "."])
        assert english_word_tokenize(text) == expected

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_english_word_tokenize_with_compound_words(self):
        """Hyphenated compounds are kept together as single tokens."""
        text = "Forty-two is no run-off-the-mill number."
        compound_tokens = ["forty-two", " ", "is", " ", "no", " ", "run-off-the-mill", " ", "number", "."]
        expected = self._create_expected_output(compound_tokens)
        assert english_word_tokenize(text) == expected

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_english_word_tokenize_with_escaped(self):
        """Text wrapped in ``|...|`` is grouped into one escaped token tuple
        (flag ``True``) and its case is preserved."""
        text = "Leave |this part UNCHANGED|."
        expected = [
            (["leave"], False),
            ([" "], False),
            (["this", "part", "UNCHANGED"], True),
            (["."], False),
        ]
        assert english_word_tokenize(text) == expected

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize(self):
        """Space-separated words split into alternating words and separators."""
        text = "apple banana pear"
        expected = self._create_expected_output(["apple", " ", "banana", " ", "pear"])
        assert any_locale_word_tokenize(text) == expected

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_with_accents(self):
        """Accented characters stay inside words and case is preserved."""
        text = "The naïve piñata at the café..."
        accented_tokens = ["The", " ", "naïve", " ", "piñata", " ", "at", " ", "the", " ", "café", "..."]
        expected = self._create_expected_output(accented_tokens)
        assert any_locale_word_tokenize(text) == expected

    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_any_locale_word_tokenize_with_numbers(self):
        """Symbols and brackets are emitted as separator tokens adjoining
        the surrounding whitespace."""
        text = r"Three times× four^teen ÷divided by [movies] on \slash."
        symbol_tokens = [
            "Three",
            " ",
            "times",
            "× ",
            "four",
            "^",
            "teen",
            " ÷",
            "divided",
            " ",
            "by",
            " [",
            "movies",
            "] ",
            "on",
            " \\",
            "slash",
            ".",
        ]
        expected = self._create_expected_output(symbol_tokens)
        assert any_locale_word_tokenize(text) == expected
|
|
|