Albin Thörn Cleland
Clean initial commit with LFS
19b8775
import lzma
import os
import tempfile
import pytest
import stanza
import stanza.models.common.utils as utils
from stanza.tests import *
# Module-level pytest marks: pytest applies every mark listed in `pytestmark`
# to each test in this file, so all tests here carry `travis` and `pipeline`.
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
def test_wordvec_not_found():
    """
    With nothing in the wordvec directory, get_wordvec_file should raise FileNotFoundError
    """
    # the temporary directory is created empty, so there are no vectors to find
    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as empty_dir, \
         pytest.raises(FileNotFoundError):
        utils.get_wordvec_file(wordvec_dir=empty_dir, shorthand='en_foo')
def test_word2vec_xz():
    """
    Test searching for word2vec and xz files

    Creates an empty word2vec/English/en.vectors.xz under a temporary
    directory and checks that get_wordvec_file, called with shorthand
    'en_foo', returns the path of that file.
    """
    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
        # make a fake directory for English word vectors
        word2vec_dir = os.path.join(temp_dir, 'word2vec', 'English')
        os.makedirs(word2vec_dir)

        # make a fake (empty) English word vector file
        # the context manager guarantees the handle is closed even on error
        fake_file = os.path.join(word2vec_dir, 'en.vectors.xz')
        with open(fake_file, 'w'):
            pass

        # get_wordvec_file should now find this fake file
        filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
        assert filename == fake_file
def test_fasttext_txt():
    """
    Test searching for fasttext and txt files

    Creates an empty fasttext/English/en.vectors.txt under a temporary
    directory and checks that get_wordvec_file, called with shorthand
    'en_foo', returns the path of that file.
    """
    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
        # make a fake directory for English word vectors
        fasttext_dir = os.path.join(temp_dir, 'fasttext', 'English')
        os.makedirs(fasttext_dir)

        # make a fake (empty) English word vector file
        # the context manager guarantees the handle is closed even on error
        fake_file = os.path.join(fasttext_dir, 'en.vectors.txt')
        with open(fake_file, 'w'):
            pass

        # get_wordvec_file should now find this fake file
        filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
        assert filename == fake_file
def test_wordvec_type():
    """
    If we supply our own wordvec type, get_wordvec_file should find that

    Creates an empty google/English/en.vectors.txt under a temporary
    directory.  The file should be returned when wordvec_type='google'
    is passed, and FileNotFoundError should be raised when it is not.
    """
    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
        # make a fake directory for English word vectors
        google_dir = os.path.join(temp_dir, 'google', 'English')
        os.makedirs(google_dir)

        # make a fake (empty) English word vector file
        # the context manager guarantees the handle is closed even on error
        fake_file = os.path.join(google_dir, 'en.vectors.txt')
        with open(fake_file, 'w'):
            pass

        # get_wordvec_file should now find this fake file
        filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo', wordvec_type='google')
        assert filename == fake_file

        # this file won't be found using the normal defaults
        with pytest.raises(FileNotFoundError):
            utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
def test_sort_with_indices():
    """
    Sort a few lists by length, check the order and indices, then undo the sort
    """
    original = [[1, 2, 3], [4, 5], [6]]

    by_length, positions = utils.sort_with_indices(original, key=len)
    assert by_length == ([6], [4, 5], [1, 2, 3])
    assert positions == (2, 1, 0)

    # unsort should give back the items in their starting order
    assert original == utils.unsort(by_length, positions)
def test_empty_sort_with_indices():
    """
    Sorting an empty list should produce empty results which unsort to an empty list
    """
    sorted_items, positions = utils.sort_with_indices([])
    assert len(sorted_items) == 0
    assert len(positions) == 0

    assert [] == utils.unsort(sorted_items, positions)
def test_split_into_batches():
    """
    Check the intervals returned by split_into_batches for several batch sizes

    Also exercises sort_with_indices with key & reverse, and unsort.
    """
    # five sentences of length 4, then one of length 10, then one of length 2
    data = [["Unban", "mox", "opal", str(i)] for i in range(5)]
    data.append(["Do", "n't", "ban", "Urza", "'s", "Saga", "that", "card", "is", "great"])
    data.append(["Ban", "Ragavan"])

    # small batches will put one element in each interval
    singles = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]
    assert utils.split_into_batches(data, 5) == singles

    # this one has a batch interrupted in the middle by a large element
    interrupted = utils.split_into_batches(data, 8)
    assert interrupted == [(0, 2), (2, 4), (4, 5), (5, 6), (6, 7)]

    # this one has the large element at the start of its own batch
    shifted = utils.split_into_batches(data[1:], 8)
    assert shifted == [(0, 2), (2, 4), (4, 5), (5, 6)]

    # overloading the test!  assert that the key & reverse is working
    longest_first, positions = utils.sort_with_indices(data, key=len, reverse=True)
    lengths = [len(sentence) for sentence in longest_first]
    assert lengths == [10, 4, 4, 4, 4, 4, 2]

    # this has the large element at the start
    assert utils.split_into_batches(longest_first, 8) == [(0, 1), (1, 3), (3, 5), (5, 7)]

    # double check that unsort is working as expected
    assert data == utils.unsort(longest_first, positions)
def test_find_missing_tags():
    """
    Check find_missing_tags on flat tag lists and on lists of tag lists
    """
    # each case: (first argument, second argument, expected result)
    cases = [
        (["O", "PER", "LOC"], ["O", "PER", "LOC"], []),
        (["O", "PER", "LOC"], ["O", "PER", "LOC", "ORG"], ['ORG']),
        ([["O", "PER"], ["O", "LOC"]], [["O", "PER"], ["LOC", "ORG"]], ['ORG']),
    ]
    for first_tags, second_tags, expected in cases:
        assert utils.find_missing_tags(first_tags, second_tags) == expected
def test_open_read_text():
    """
    test that we can read either .xz or regular txt

    For each kind of file, checks that open_read_text yields the written
    text, and that the handle is closed after the context exits, both
    normally and when an exception is raised inside the context.
    """
    TEXT = "this is a test"

    def check_read_and_close(path):
        # reading through the context gives back the text, then closes the file
        with utils.open_read_text(path) as reader:
            contents = reader.read()
            assert TEXT == contents
        assert reader.closed

        # the context should close the file when we throw an exception!
        try:
            with utils.open_read_text(path) as failing_reader:
                assert not failing_reader.closed
                raise ValueError("unban mox opal!")
        except ValueError:
            pass
        assert failing_reader.closed

    with tempfile.TemporaryDirectory() as tempdir:
        # test text file
        txt_path = os.path.join(tempdir, "foo.txt")
        with open(txt_path, "w") as fout:
            fout.write(TEXT)
        check_read_and_close(txt_path)

        # test xz file
        xz_path = os.path.join(tempdir, "foo.txt.xz")
        with lzma.open(xz_path, "wt") as fout:
            fout.write(TEXT)
        check_read_and_close(xz_path)
def test_checkpoint_name():
    """
    Test some expected results for the checkpoint names
    """
    # each case: (save name, checkpoint name argument, expected file name)
    cases = [
        ("kk_oscar_forward_charlm.pt", None, "kk_oscar_forward_charlm_checkpoint.pt"),
        ("kk_oscar_forward_charlm", None, "kk_oscar_forward_charlm_checkpoint"),
        ("kk_oscar_forward_charlm", "othername.pt", "othername.pt"),
    ]
    for save_name, checkpoint_arg, expected_file in cases:
        result = utils.checkpoint_name("saved_models", save_name, checkpoint_arg)
        # use os.path.split so that the test is agnostic of file separator on Linux or Windows
        assert os.path.split(result) == ("saved_models", expected_file)