| """ | |
| Check to make sure long tokens are replaced with "UNK" by the tokenization processor | |
| """ | |
| import pytest | |
| import stanza | |
| from stanza.pipeline import tokenize_processor | |
| from stanza.tests import TEST_MODELS_DIR | |
| pytestmark = [pytest.mark.pipeline, pytest.mark.travis] | |

def test_replace_long_tokens():
    """A 10000 character token should come back as the replacement token."""
    nlp = stanza.Pipeline(lang="en", download_method=None, model_dir=TEST_MODELS_DIR, processors="tokenize")
    test_str = "foo " + "x" * 10000 + " bar"
    res = nlp(test_str)
    # "foo" is words[0]; the absurdly long token is words[1]
    assert res.sentences[0].words[1].text == tokenize_processor.TOKEN_TOO_LONG_REPLACEMENT

def test_set_max_len():
    """Setting tokenize_max_seqlen should control the replacement threshold."""
    nlp = stanza.Pipeline(**{'processors': 'tokenize',
                             'dir': TEST_MODELS_DIR,
                             'lang': 'en',
                             'download_method': None,
                             'tokenize_max_seqlen': 20})
    doc = nlp("This is a doc withaverylongtokenthatshouldbereplaced")
    assert len(doc.sentences) == 1
    assert len(doc.sentences[0].words) == 5
    # the final word is longer than 20 characters, so it gets replaced
    assert doc.sentences[0].words[-1].text == tokenize_processor.TOKEN_TOO_LONG_REPLACEMENT
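

if __name__ == "__main__":
    # A minimal standalone sketch of the behavior the tests above exercise,
    # assuming the TEST_MODELS_DIR models are available locally: any token
    # longer than tokenize_max_seqlen characters is swapped for the
    # replacement token.  The threshold of 50 here is arbitrary.
    nlp = stanza.Pipeline(lang="en", download_method=None, model_dir=TEST_MODELS_DIR,
                          processors="tokenize", tokenize_max_seqlen=50)
    doc = nlp("short " + "z" * 200)
    print(doc.sentences[0].words[-1].text == tokenize_processor.TOKEN_TOO_LONG_REPLACEMENT)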