stanza-digphil / stanza /tests /common /test_utils.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

7.02 kB

	import lzma
	import os
	import tempfile

	import pytest

	import stanza
	import stanza.models.common.utils as utils
	from stanza.tests import *

	# module-level pytest marks: pytest applies these to every test in this file,
	# so the whole module can be selected with `-m travis` or `-m pipeline`
	pytestmark = [pytest.mark.travis, pytest.mark.pipeline]

	def test_wordvec_not_found():
	    """
	    With neither a word2vec nor a fasttext file available, get_wordvec_file must raise
	    """
	    # the directory is created empty, so there is nothing for the lookup to find
	    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as empty_dir, \
	            pytest.raises(FileNotFoundError):
	        utils.get_wordvec_file(wordvec_dir=empty_dir, shorthand='en_foo')


	def test_word2vec_xz():
	    """
	    Test searching for word2vec and xz files

	    Creates an empty word2vec/English/en.vectors.xz under a temporary
	    wordvec directory and checks that get_wordvec_file returns exactly
	    that path for the shorthand en_foo.
	    """
	    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
	        # make a fake directory for English word vectors
	        word2vec_dir = os.path.join(temp_dir, 'word2vec', 'English')
	        os.makedirs(word2vec_dir)

	        # make a fake (empty) English word vector file
	        # the context manager closes the handle for us, no manual close() needed
	        fake_file = os.path.join(word2vec_dir, 'en.vectors.xz')
	        with open(fake_file, 'w'):
	            pass

	        # get_wordvec_file should now find this fake file
	        filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
	        assert filename == fake_file

	def test_fasttext_txt():
	    """
	    Test searching for fasttext and txt files

	    Creates an empty fasttext/English/en.vectors.txt under a temporary
	    wordvec directory and checks that get_wordvec_file returns exactly
	    that path for the shorthand en_foo.
	    """
	    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
	        # make a fake directory for English word vectors
	        fasttext_dir = os.path.join(temp_dir, 'fasttext', 'English')
	        os.makedirs(fasttext_dir)

	        # make a fake (empty) English word vector file
	        # the context manager closes the handle for us, no manual close() needed
	        fake_file = os.path.join(fasttext_dir, 'en.vectors.txt')
	        with open(fake_file, 'w'):
	            pass

	        # get_wordvec_file should now find this fake file
	        filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
	        assert filename == fake_file

	def test_wordvec_type():
	    """
	    If we supply our own wordvec type, get_wordvec_file should find that

	    The fake vectors live under google/English, so they are only found
	    when wordvec_type='google' is passed; the default lookup must still
	    raise FileNotFoundError.
	    """
	    with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
	        # make a fake directory for English word vectors
	        google_dir = os.path.join(temp_dir, 'google', 'English')
	        os.makedirs(google_dir)

	        # make a fake (empty) English word vector file
	        # the context manager closes the handle for us, no manual close() needed
	        fake_file = os.path.join(google_dir, 'en.vectors.txt')
	        with open(fake_file, 'w'):
	            pass

	        # get_wordvec_file should now find this fake file
	        filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo', wordvec_type='google')
	        assert filename == fake_file

	        # this file won't be found using the normal defaults
	        with pytest.raises(FileNotFoundError):
	            utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')

	def test_sort_with_indices():
	    """
	    Sort three lists by length, check the order and the reported indices, then undo it
	    """
	    original = [[1, 2, 3], [4, 5], [6]]
	    by_length, positions = utils.sort_with_indices(original, key=len)
	    # shortest first, with each item's index in the original list
	    assert by_length == ([6], [4, 5], [1, 2, 3])
	    assert positions == (2, 1, 0)

	    # unsort should give back the original ordering
	    restored = utils.unsort(by_length, positions)
	    assert original == restored

	def test_empty_sort_with_indices():
	    """
	    Sorting an empty list should give empty results which unsort back to an empty list
	    """
	    sorted_items, positions = utils.sort_with_indices([])
	    assert len(sorted_items) == 0
	    assert len(positions) == 0

	    # the round trip through unsort must also cope with nothing to do
	    restored = utils.unsort(sorted_items, positions)
	    assert [] == restored


	def test_split_into_batches():
	    """
	    Check the batch intervals produced for several batch sizes and item orderings
	    """
	    # five 4-word items, then one 10-word item, then one 2-word item
	    sentences = [["Unban", "mox", "opal", str(idx)] for idx in range(5)]
	    sentences += [
	        ["Do", "n't", "ban", "Urza", "'s", "Saga", "that", "card", "is", "great"],
	        ["Ban", "Ragavan"],
	    ]

	    # small batches will put one element in each interval
	    intervals = utils.split_into_batches(sentences, 5)
	    assert intervals == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]

	    # this one has a batch interrupted in the middle by a large element
	    intervals = utils.split_into_batches(sentences, 8)
	    assert intervals == [(0, 2), (2, 4), (4, 5), (5, 6), (6, 7)]

	    # this one has the large element at the start of its own batch
	    intervals = utils.split_into_batches(sentences[1:], 8)
	    assert intervals == [(0, 2), (2, 4), (4, 5), (5, 6)]

	    # overloading the test! assert that the key & reverse is working
	    longest_first, positions = utils.sort_with_indices(sentences, key=len, reverse=True)
	    assert [len(item) for item in longest_first] == [10, 4, 4, 4, 4, 4, 2]

	    # this has the large element at the start
	    intervals = utils.split_into_batches(longest_first, 8)
	    assert intervals == [(0, 1), (1, 3), (3, 5), (5, 7)]

	    # double check that unsort is working as expected
	    assert sentences == utils.unsort(longest_first, positions)


	def test_find_missing_tags():
	    """
	    Tags which only occur in the second argument should be reported, for flat or nested lists
	    """
	    # identical tag sets: nothing is missing
	    missing = utils.find_missing_tags(["O", "PER", "LOC"], ["O", "PER", "LOC"])
	    assert missing == []

	    # ORG only shows up in the second list
	    missing = utils.find_missing_tags(["O", "PER", "LOC"], ["O", "PER", "LOC", "ORG"])
	    assert missing == ['ORG']

	    # same idea, but with lists of lists
	    missing = utils.find_missing_tags([["O", "PER"], ["O", "LOC"]], [["O", "PER"], ["LOC", "ORG"]])
	    assert missing == ['ORG']


	def test_open_read_text():
	    """
	    open_read_text should read both plain text and .xz files, and close them on exit

	    For each file type, checks the text read back, that the handle is closed
	    after a normal exit, and that it is also closed when the body raises.
	    """
	    expected = "this is a test"
	    with tempfile.TemporaryDirectory() as work_dir:
	        # plain text file
	        txt_path = os.path.join(work_dir, "foo.txt")
	        with open(txt_path, "w") as writer:
	            writer.write(expected)
	        with utils.open_read_text(txt_path) as txt_in:
	            contents = txt_in.read()
	            assert expected == contents

	        assert txt_in.closed

	        # the context should close the file when we throw an exception!
	        with pytest.raises(ValueError):
	            with utils.open_read_text(txt_path) as txt_err:
	                assert not txt_err.closed
	                raise ValueError("unban mox opal!")
	        assert txt_err.closed

	        # xz compressed file
	        xz_path = os.path.join(work_dir, "foo.txt.xz")
	        with lzma.open(xz_path, "wt") as writer:
	            writer.write(expected)
	        with utils.open_read_text(xz_path) as xz_in:
	            contents = xz_in.read()
	            assert expected == contents

	        assert xz_in.closed

	        # the context should close the file when we throw an exception!
	        with pytest.raises(ValueError):
	            with utils.open_read_text(xz_path) as xz_err:
	                assert not xz_err.closed
	                raise ValueError("unban mox opal!")
	        assert xz_err.closed


	def test_checkpoint_name():
	    """
	    Check the checkpoint path built for a few save names and explicit checkpoint names
	    """
	    # (save name, explicit checkpoint name, expected checkpoint file name)
	    cases = [
	        ("kk_oscar_forward_charlm.pt", None, "kk_oscar_forward_charlm_checkpoint.pt"),
	        ("kk_oscar_forward_charlm", None, "kk_oscar_forward_charlm_checkpoint"),
	        ("kk_oscar_forward_charlm", "othername.pt", "othername.pt"),
	    ]
	    for save_name, explicit_name, expected_file in cases:
	        result = utils.checkpoint_name("saved_models", save_name, explicit_name)
	        # use os.path.split so that the test is agnostic of file separator on Linux or Windows
	        assert os.path.split(result) == ("saved_models", expected_file)