stanza-digphil / stanza /tests /ner /test_ner_utils.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

6.64 kB

	import pytest

	from stanza.tests import *

	from stanza.models.common.vocab import EMPTY
	from stanza.models.ner import utils

	pytestmark = [pytest.mark.travis, pytest.mark.pipeline]

	WORDS = [["Unban", "Mox", "Opal"], ["Ragavan", "is", "red"], ["Urza", "Lord", "High", "Artificer", "goes", "infinite", "with", "Thopter", "Sword"]]
	BIO_TAGS = [["O", "B-ART", "I-ART"], ["B-MONKEY", "O", "B-COLOR"], ["B-PER", "I-PER", "I-PER", "I-PER", "O", "O", "O", "B-WEAPON", "B-WEAPON"]]
	BIO_U_TAGS = [["O", "B_ART", "I_ART"], ["B_MONKEY", "O", "B_COLOR"], ["B_PER", "I_PER", "I_PER", "I_PER", "O", "O", "O", "B_WEAPON", "B_WEAPON"]]
	BIOES_TAGS = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "S-WEAPON", "S-WEAPON"]]
	# note the problem with not using BIO tags - the consecutive tags for thopter/sword get treated as one item
	BASIC_TAGS = [["O", "ART", "ART"], ["MONKEY", "O", "COLOR"], [ "PER", "PER", "PER", "PER", "O", "O", "O", "WEAPON", "WEAPON"]]
	BASIC_BIOES = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "B-WEAPON", "E-WEAPON"]]
	ALT_BIO = [["O", "B-MANA", "I-MANA"], ["B-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
	ALT_BIOES = [["O", "B-MANA", "E-MANA"], ["S-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]]
	NONE_BIO = [["O", "B-MANA", "I-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
	NONE_BIOES = [["O", "B-MANA", "E-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]]
	EMPTY_BIO = [["O", "B-MANA", "I-MANA"], [EMPTY, EMPTY, EMPTY], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]

	def test_normalize_empty_tags():
	sentences = [[(word[0], (word[1],)) for word in zip(*sentence)] for sentence in zip(WORDS, NONE_BIO)]
	new_sentences = utils.normalize_empty_tags(sentences)
	expected = [[(word[0], (word[1],)) for word in zip(*sentence)] for sentence in zip(WORDS, EMPTY_BIO)]
	assert new_sentences == expected

	def check_reprocessed_tags(words, input_tags, expected_tags):
	sentences = [list(zip(x, y)) for x, y in zip(words, input_tags)]
	retagged = utils.process_tags(sentences=sentences, scheme="bioes")
	# process_tags selectively returns tuples or strings based on the input
	# so we don't need to fiddle with the expected output format here
	expected_retagged = [list(zip(x, y)) for x, y in zip(words, expected_tags)]
	assert retagged == expected_retagged

	def test_process_tags_bio():
	check_reprocessed_tags(WORDS, BIO_TAGS, BIOES_TAGS)
	# check that the alternate version is correct as well
	# that way we can independently check the two layer version
	check_reprocessed_tags(WORDS, ALT_BIO, ALT_BIOES)

	def test_process_tags_with_none():
	# if there is a block of tags with None in them, the Nones should be skipped over
	check_reprocessed_tags(WORDS, NONE_BIO, NONE_BIOES)

	def merge_tags(*tags):
	merged_tags = [[tuple(x) for x in zip(*sentences)] # combine tags such as ("O", "O"), ("B-ART", "B-MANA"), ...
	for sentences in zip(*tags)] # ... for each set of sentences
	return merged_tags

	def test_combined_tags_bio():
	bio_tags = merge_tags(BIO_TAGS, ALT_BIO)
	expected = merge_tags(BIOES_TAGS, ALT_BIOES)
	check_reprocessed_tags(WORDS, bio_tags, expected)

	def test_combined_tags_mixed():
	bio_tags = merge_tags(BIO_TAGS, ALT_BIOES)
	expected = merge_tags(BIOES_TAGS, ALT_BIOES)
	check_reprocessed_tags(WORDS, bio_tags, expected)

	def test_process_tags_basic():
	check_reprocessed_tags(WORDS, BASIC_TAGS, BASIC_BIOES)

	def test_process_tags_bioes():
	"""
	This one should not change, naturally
	"""
	check_reprocessed_tags(WORDS, BIOES_TAGS, BIOES_TAGS)
	check_reprocessed_tags(WORDS, BASIC_BIOES, BASIC_BIOES)

	def run_flattened(fn, tags):
	return fn([x for x in y for y in tags])

	def test_check_bio():
	assert utils.is_bio_scheme([x for y in BIO_TAGS for x in y])
	assert not utils.is_bio_scheme([x for y in BIOES_TAGS for x in y])
	assert not utils.is_bio_scheme([x for y in BASIC_TAGS for x in y])
	assert not utils.is_bio_scheme([x for y in BASIC_BIOES for x in y])

	def test_check_basic():
	assert not utils.is_basic_scheme([x for y in BIO_TAGS for x in y])
	assert not utils.is_basic_scheme([x for y in BIOES_TAGS for x in y])
	assert utils.is_basic_scheme([x for y in BASIC_TAGS for x in y])
	assert not utils.is_basic_scheme([x for y in BASIC_BIOES for x in y])

	def test_underscores():
	"""
	Check that the methods work if the inputs are underscores instead of dashes
	"""
	assert not utils.is_basic_scheme([x for y in BIO_U_TAGS for x in y])
	check_reprocessed_tags(WORDS, BIO_U_TAGS, BIOES_TAGS)

	def test_merge_tags():
	"""
	Check a few versions of the tag sequence merging
	"""
	seq1 = [ "O", "O", "O", "B-FOO", "E-FOO", "O"]
	seq2 = [ "S-FOO", "O", "B-FOO", "E-FOO", "O", "O"]
	seq3 = [ "B-FOO", "E-FOO", "B-FOO", "E-FOO", "O", "O"]
	seq_err = [ "O", "B-FOO", "O", "B-FOO", "E-FOO", "O"]
	seq_err2 = [ "O", "B-FOO", "O", "B-FOO", "B-FOO", "O"]
	seq_err3 = [ "O", "B-FOO", "O", "B-FOO", "I-FOO", "O"]
	seq_err4 = [ "O", "B-FOO", "O", "B-FOO", "I-FOO", "I-FOO"]

	result = utils.merge_tags(seq1, seq2)
	expected = [ "S-FOO", "O", "O", "B-FOO", "E-FOO", "O"]
	assert result == expected

	result = utils.merge_tags(seq2, seq1)
	expected = [ "S-FOO", "O", "B-FOO", "E-FOO", "O", "O"]
	assert result == expected

	result = utils.merge_tags(seq1, seq3)
	expected = [ "B-FOO", "E-FOO", "O", "B-FOO", "E-FOO", "O"]
	assert result == expected

	with pytest.raises(ValueError):
	result = utils.merge_tags(seq1, seq_err)

	with pytest.raises(ValueError):
	result = utils.merge_tags(seq1, seq_err2)

	with pytest.raises(ValueError):
	result = utils.merge_tags(seq1, seq_err3)

	with pytest.raises(ValueError):
	result = utils.merge_tags(seq1, seq_err4)