| | |
| | import sys |
| | from os.path import dirname, abspath |
| | sys.path.append(dirname(dirname(abspath(__file__)))) |
| | from nose.tools import raises |
| | from torchmoji.word_generator import WordGenerator |
| |
|
# True when running under a Python 2 interpreter. Read from the structured
# sys.version_info tuple rather than parsing the free-form sys.version string,
# whose first character is not a reliable major-version indicator.
IS_PYTHON2 = sys.version_info[0] == 2
| |
|
@raises(ValueError)
def test_only_unicode_accepted():
    """ Feeding a non-Unicode sentence to WordGenerator must end in a ValueError.

    Python 3 has no non-Unicode ``str`` literal, so on that interpreter the
    ValueError is raised by hand to satisfy the ``@raises`` decorator.
    """
    if not IS_PYTHON2:
        raise ValueError("You are using python 3 so this test should always pass")

    unicode_sentences = [u'Hello world', u'I am unicode']
    plain_sentence = 'I am not unicode'  # a byte string on Python 2
    generator = WordGenerator(unicode_sentences + [plain_sentence])

    # Drain the generator; the yielded items are irrelevant, only the
    # exception expected by the decorator matters.
    for _ in generator:
        pass
| |
|
| |
|
def test_unicode_sentences_ignored_if_set():
    """ With allow_unicode_text=False, get_words returns an empty list for a
        sentence containing non-ASCII characters.
    """
    czech_text = u'Dobrý den, jak se máš?'
    generator = WordGenerator([czech_text], allow_unicode_text=False)
    tokens = generator.get_words(czech_text)
    assert tokens == []
| |
|
| |
|
def test_check_ascii():
    """ check_ascii is truthy for an ASCII word and falsy for non-ASCII text.

    Only exercised on Python 2; on Python 3 the test returns without
    checking anything.
    """
    if IS_PYTHON2:
        generator = WordGenerator([])
        assert generator.check_ascii('ASCII')
        # Accented letters and symbol characters must both be rejected.
        for non_ascii_text in ('ščřžýá', '❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢'):
            assert not generator.check_ascii(non_ascii_text)
| |
|
| |
|
def test_convert_unicode_word():
    """ With Unicode text allowed, convert_unicode_word returns (True, word)
        for a single accented character.
    """
    generator = WordGenerator([], allow_unicode_text=True)
    expected = (True, u'\u010d')

    result = generator.convert_unicode_word(u'č')

    # On failure, show the actual return value.
    assert result == expected, '{}'.format(result)
| |
|
| |
|
def test_convert_unicode_word_ignores_if_set():
    """ With Unicode text disallowed, convert_unicode_word returns (False, '')
        for an accented character.
    """
    generator = WordGenerator([], allow_unicode_text=False)
    expected = (False, '')

    result = generator.convert_unicode_word(u'č')

    # On failure, show the actual return value.
    assert result == expected, '{}'.format(result)
| |
|
| |
|
def test_convert_unicode_chars():
    """ With Unicode text allowed, convert_unicode_word returns a word made
        of several accented characters unchanged, flagged True.
    """
    generator = WordGenerator([], allow_unicode_text=True)
    expected = (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9')

    result = generator.convert_unicode_word(u'ěščřžýáíé')

    # On failure, show the actual return value.
    assert result == expected, '{}'.format(result)
| |
|