| | |
| | """ Tokenization tests. |
| | """ |
| | from __future__ import absolute_import, print_function, division, unicode_literals |
| |
|
| | import sys |
| | from nose.tools import nottest |
| | from os.path import dirname, abspath |
| | sys.path.append(dirname(dirname(abspath(__file__)))) |
| | from torchmoji.tokenizer import tokenize |
| |
|
# General-purpose cases. Every entry is (input text, expected token list).
TESTS_NORMAL = [
    # Digits, letters and trailing punctuation are expected as separate tokens.
    ('200K words!', ['200', 'K', 'words', '!']),
]
| |
|
# Emoji, emoticon and decoration cases. Every entry is
# (input text, expected token list).
TESTS_EMOJIS = [
    # Emoji surrounded by spaces.
    (
        'i \U0001f496 you to the moon and back',
        ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back'],
    ),
    # Emoji and decoration glued to neighbouring words.
    (
        "i\U0001f496you to the \u2605's and back",
        ['i', '\U0001f496', 'you', 'to', 'the', '\u2605', "'", 's', 'and',
         'back'],
    ),
    # ASCII hearts and emoticons.
    ('~<3~', ['~', '<3', '~']),
    ('<333', ['<333']),
    (':-)', [':-)']),
    ('>:-(', ['>:-(']),
    # A run of decoration symbols is expected to split one symbol per token.
    (
        '\u266b\u266a\u2605\u2606\u2665\u2764\u2661',
        ['\u266b', '\u266a', '\u2605', '\u2606', '\u2665', '\u2764', '\u2661'],
    ),
]
| |
|
# URL cases. Every entry is (input text, expected token list); each URL is
# expected to come back as one unbroken token.
TESTS_URLS = [
    # No scheme.
    ('www.sample.com', ['www.sample.com']),
    # With http / https scheme.
    ('http://endless.horse', ['http://endless.horse']),
    ('https://github.mit.ed', ['https://github.mit.ed']),
]
| |
|
# Hashtag, mention and e-mail cases. Every entry is
# (input text, expected token list).
TESTS_TWITTER = [
    # Hashtags.
    ('#blacklivesmatter', ['#blacklivesmatter']),
    ('#99_percent.', ['#99_percent', '.']),
    ('the#99%', ['the', '#99', '%']),
    # Mentions.
    ('@golden_zenith', ['@golden_zenith']),
    ('@99_percent', ['@99_percent']),
    # E-mail address, expected as a single token.
    ('latte-express@mit.ed', ['latte-express@mit.ed']),
]
| |
|
# Phone-number-like cases. Every entry is (input text, expected token list);
# digit groups and their separators are expected as individual tokens.
TESTS_PHONE_NUMS = [
    # Parenthesis and dash separators.
    ('518)528-0252', ['518', ')', '528', '-', '0252']),
    # Dash-only and dot-only separators.
    ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']),
    ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']),
]
| |
|
# Date and time cases. Every entry is (input text, expected token list).
TESTS_DATETIME = [
    # Clock times, with and without an am/pm suffix.
    ('15:00', ['15', ':', '00']),
    ('2:00pm', ['2', ':', '00', 'pm']),
    # Slash-separated date.
    ('9/14/16', ['9', '/', '14', '/', '16']),
]
| |
|
# Currency and decimal-number cases. Every entry is
# (input text, expected token list).
TESTS_CURRENCIES = [
    # Currency symbol after / before the amount.
    ('517.933\xa3', ['517', '.', '933', '\xa3']),
    ('$517.87', ['$', '517', '.', '87']),
    # Bare numbers with dot or comma separators.
    ('1201.6598', ['1201', '.', '6598']),
    ('120,6', ['120', ',', '6']),
    ('10,00\u20ac', ['10', ',', '00', '\u20ac']),
    ('1,000', ['1', ',', '000']),
    # Currency spelled out as a word directly after the digits.
    ('1200pesos', ['1200', 'pesos']),
]
| |
|
# Cases mixing numbers with letters and symbols. Every entry is
# (input text, expected token list).
TESTS_NUM_SYM = [
    # Digits adjacent to letters.
    ('5162f', ['5162', 'f']),
    ('f5162', ['f', '5162']),
    # Digits adjacent to brackets and operators.
    ('1203(', ['1203', '(']),
    ('(1203)', ['(', '1203', ')']),
    ('1200/', ['1200', '/']),
    ('1200+', ['1200', '+']),
    # Digits followed by words.
    ('1202o-east', ['1202', 'o-east']),
    ('1200r', ['1200', 'r']),
    # Ranges and slash-separated combinations.
    ('1200-1400', ['1200', '-', '1400']),
    ('120/today', ['120', '/', 'today']),
    ('today/120', ['today', '/', '120']),
    ('120/5', ['120', '/', '5']),
    ("120'/5", ['120', "'", '/', '5']),
    ('120/5pro', ['120', '/', '5', 'pro']),
    # Apostrophe plus trailing punctuation.
    ("1200's,)", ['1200', "'", 's', ',', ')']),
    # Dotted quad.
    ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']),
]
| |
|
# Punctuation and contraction cases. Every entry is
# (input text, expected token list).
TESTS_PUNCTUATION = [
    # Apostrophes inside words.
    ("don''t", ['don', "''", 't']),
    ("don'tcha", ["don'tcha"]),
    # Alternating vs. repeated punctuation marks.
    ('no?!?!;', ['no', '?', '!', '?', '!', ';']),
    ('no??!!..', ['no', '??', '!!', '..']),
    # Dots in and around short words.
    ('a.m.', ['a.m.']),
    ('.s.u', ['.', 's', '.', 'u']),
    ('!!i..n__', ['!!', 'i', '..', 'n', '__']),
    # Brackets, hearts and an abbreviation in a single string.
    (
        'lv(<3)w(3>)u Mr.!',
        ['lv', '(', '<3', ')', 'w', '(', '3', '>', ')', 'u', 'Mr.', '!'],
    ),
    # Arrow-like sequences.
    ('-->', ['--', '>']),
    ('->', ['-', '>']),
    ('<-', ['<', '-']),
    ('<--', ['<', '--']),
    # Mention wrapped in parentheses.
    ('hello (@person)', ['hello', '(', '@person', ')']),
]
| |
|
| |
|
def test_normal():
    """ Check the general-purpose cases listed in TESTS_NORMAL.
    """
    test_base(tests=TESTS_NORMAL)
| |
|
| |
|
def test_emojis():
    """ Check the emoji, emoticon and decoration cases in TESTS_EMOJIS.
    """
    test_base(tests=TESTS_EMOJIS)
| |
|
| |
|
def test_urls():
    """ Check the URL cases in TESTS_URLS.
    """
    test_base(tests=TESTS_URLS)
| |
|
| |
|
def test_twitter():
    """ Check the hashtag, mention and e-mail cases in TESTS_TWITTER.
    """
    test_base(tests=TESTS_TWITTER)
| |
|
| |
|
def test_phone_nums():
    """ Check the phone-number cases in TESTS_PHONE_NUMS.
    """
    test_base(tests=TESTS_PHONE_NUMS)
| |
|
| |
|
def test_datetime():
    """ Check the date and time cases in TESTS_DATETIME.
    """
    test_base(tests=TESTS_DATETIME)
| |
|
| |
|
def test_currencies():
    """ Check the currency cases in TESTS_CURRENCIES.
    """
    test_base(tests=TESTS_CURRENCIES)
| |
|
| |
|
def test_num_sym():
    """ Check the mixed number/symbol cases in TESTS_NUM_SYM.
    """
    test_base(tests=TESTS_NUM_SYM)
| |
|
| |
|
def test_punctuation():
    """ Check the punctuation and contraction cases in TESTS_PUNCTUATION.
    """
    test_base(tests=TESTS_PUNCTUATION)
| |
|
| |
|
@nottest
def test_base(tests):
    """ Shared driver used by every test function in this module.

    `tests` is an iterable of (input text, expected tokens) pairs. Each input
    is passed to `tokenize` and the result is compared with the expected
    tokens; the first mismatch fails with a message naming the input, the
    expected tokens and the actual tokens.
    """
    failure_template = \
        "Tokenization of \'{}\' failed, expected: {}, actual: {}"
    for sentence, expected_tokens in tests:
        produced_tokens = tokenize(sentence)
        # The message is only built when the comparison fails.
        assert produced_tokens == expected_tokens, \
            failure_template.format(sentence, expected_tokens,
                                    produced_tokens)
| |
|