stanza-digphil / stanza /tests /ner /test_ner_utils.py
Albin Thörn Cleland
Clean initial commit with LFS
19b8775
import pytest
from stanza.tests import *
from stanza.models.common.vocab import EMPTY
from stanza.models.ner import utils
# Module-level pytest marks: every test in this file is tagged "travis" and "pipeline"
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
# Three sample sentences.  Every tag fixture below is parallel to this:
# one list per sentence, one tag per word.
WORDS = [["Unban", "Mox", "Opal"], ["Ragavan", "is", "red"], ["Urza", "Lord", "High", "Artificer", "goes", "infinite", "with", "Thopter", "Sword"]]
# BIO tagging of WORDS
BIO_TAGS = [["O", "B-ART", "I-ART"], ["B-MONKEY", "O", "B-COLOR"], ["B-PER", "I-PER", "I-PER", "I-PER", "O", "O", "O", "B-WEAPON", "B-WEAPON"]]
# The same BIO tagging, written with "_" instead of "-" between the prefix and the label
BIO_U_TAGS = [["O", "B_ART", "I_ART"], ["B_MONKEY", "O", "B_COLOR"], ["B_PER", "I_PER", "I_PER", "I_PER", "O", "O", "O", "B_WEAPON", "B_WEAPON"]]
# The BIOES tagging expected after converting BIO_TAGS (or BIO_U_TAGS)
BIOES_TAGS = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "S-WEAPON", "S-WEAPON"]]
# Tags with no B-/I- prefix at all.
# Note the drawback of not using BIO tags: the consecutive WEAPON tags for
# Thopter/Sword get treated as one item (compare BASIC_BIOES with BIOES_TAGS)
BASIC_TAGS = [["O", "ART", "ART"], ["MONKEY", "O", "COLOR"], [ "PER", "PER", "PER", "PER", "O", "O", "O", "WEAPON", "WEAPON"]]
# The BIOES tagging expected after converting BASIC_TAGS
BASIC_BIOES = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "B-WEAPON", "E-WEAPON"]]
# A second, independent tagging of WORDS with different labels, in BIO and in BIOES.
# Used on its own and as the second layer of the combined (two layer) tag tests.
ALT_BIO = [["O", "B-MANA", "I-MANA"], ["B-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
ALT_BIOES = [["O", "B-MANA", "E-MANA"], ["S-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]]
# ALT_BIO / ALT_BIOES with every tag of the middle sentence replaced by None
NONE_BIO = [["O", "B-MANA", "I-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
NONE_BIOES = [["O", "B-MANA", "E-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]]
# NONE_BIO with each None replaced by the vocab EMPTY marker
EMPTY_BIO = [["O", "B-MANA", "I-MANA"], [EMPTY, EMPTY, EMPTY], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
def test_normalize_empty_tags():
    """
    Sentences built from NONE_BIO should come back looking like EMPTY_BIO
    """
    def as_sentences(tag_sentences):
        # each word becomes (text, (tag,)) - the single tag is wrapped in a 1-tuple
        return [[(text, (tag,)) for text, tag in zip(words, tags)]
                for words, tags in zip(WORDS, tag_sentences)]

    normalized = utils.normalize_empty_tags(as_sentences(NONE_BIO))
    assert normalized == as_sentences(EMPTY_BIO)
def check_reprocessed_tags(words, input_tags, expected_tags):
    """
    Run process_tags with the "bioes" scheme and compare against expected_tags

    words, input_tags, and expected_tags are parallel lists, one entry per sentence.
    Each sentence is turned into a list of (word, tag) pairs before processing.
    """
    def pair_up(tag_sentences):
        return [list(zip(sentence_words, sentence_tags))
                for sentence_words, sentence_tags in zip(words, tag_sentences)]

    retagged = utils.process_tags(sentences=pair_up(input_tags), scheme="bioes")
    # process_tags gives back tuples or strings depending on what it was given,
    # so the expected output can be paired up exactly the same way as the input
    assert retagged == pair_up(expected_tags)
def test_process_tags_bio():
    """
    BIO input should be converted to the matching BIOES tags
    """
    # the alternate tagging is checked on its own as well,
    # so that the two layer version can be checked independently
    bio_cases = ((BIO_TAGS, BIOES_TAGS), (ALT_BIO, ALT_BIOES))
    for bio_tags, bioes_tags in bio_cases:
        check_reprocessed_tags(WORDS, bio_tags, bioes_tags)
def test_process_tags_with_none():
    """
    If there is a block of tags with None in them, the Nones should be skipped over
    """
    check_reprocessed_tags(words=WORDS, input_tags=NONE_BIO, expected_tags=NONE_BIOES)
def merge_tags(*tags):
    """
    Combine several parallel tag layers into a single layer of tag tuples

    Each argument is a list of sentences, each sentence a list of tags.
    The result has one list per sentence, with one tuple per word holding
    that word's tag from each layer, such as ("O", "O"), ("B-ART", "B-MANA"), ...
    """
    merged_tags = []
    # walk the layers sentence by sentence ...
    for sentence_layers in zip(*tags):
        # ... and within each sentence, zip yields one tuple of tags per word
        merged_tags.append(list(zip(*sentence_layers)))
    return merged_tags
def test_combined_tags_bio():
    """
    Two BIO layers merged together should each be converted to BIOES
    """
    check_reprocessed_tags(words=WORDS,
                           input_tags=merge_tags(BIO_TAGS, ALT_BIO),
                           expected_tags=merge_tags(BIOES_TAGS, ALT_BIOES))
def test_combined_tags_mixed():
    """
    A BIO layer merged with an already BIOES layer should come out as BIOES in both layers
    """
    check_reprocessed_tags(words=WORDS,
                           input_tags=merge_tags(BIO_TAGS, ALT_BIOES),
                           expected_tags=merge_tags(BIOES_TAGS, ALT_BIOES))
def test_process_tags_basic():
    """
    Tags with no B-/I- prefix should be converted to BASIC_BIOES
    """
    check_reprocessed_tags(words=WORDS, input_tags=BASIC_TAGS, expected_tags=BASIC_BIOES)
def test_process_tags_bioes():
    """
    Input which is already BIOES should come back unchanged, naturally
    """
    for bioes_tags in (BIOES_TAGS, BASIC_BIOES):
        check_reprocessed_tags(WORDS, bioes_tags, bioes_tags)
def run_flattened(fn, tags):
    """
    Flatten a list of tag sentences into a single list of tags and call fn on it

    fn: a callable taking one flat list
    tags: an iterable of sentences, each an iterable of tags

    Returns whatever fn returns.
    """
    # The outer loop (over sentences) must come first in the comprehension.
    # With the clauses reversed, the sentence variable is read before it is bound
    # and the call fails with a NameError.
    return fn([tag for sentence in tags for tag in sentence])
def test_check_bio():
    """
    Of the tag fixtures, only BIO_TAGS should be identified as a BIO scheme
    """
    def flatten(tag_sentences):
        return [tag for sentence in tag_sentences for tag in sentence]

    assert utils.is_bio_scheme(flatten(BIO_TAGS))
    for other_tags in (BIOES_TAGS, BASIC_TAGS, BASIC_BIOES):
        assert not utils.is_bio_scheme(flatten(other_tags))
def test_check_basic():
    """
    Of the tag fixtures, only BASIC_TAGS should be identified as a basic scheme
    """
    cases = (
        (BIO_TAGS, False),
        (BIOES_TAGS, False),
        (BASIC_TAGS, True),
        (BASIC_BIOES, False),
    )
    for tag_sentences, expect_basic in cases:
        flat_tags = [tag for sentence in tag_sentences for tag in sentence]
        if expect_basic:
            assert utils.is_basic_scheme(flat_tags)
        else:
            assert not utils.is_basic_scheme(flat_tags)
def test_underscores():
    """
    Check that the methods work if the inputs are underscores instead of dashes
    """
    flat_tags = []
    for sentence in BIO_U_TAGS:
        flat_tags.extend(sentence)
    assert not utils.is_basic_scheme(flat_tags)

    # the underscore tags should convert to the same BIOES tags as the dash version
    check_reprocessed_tags(words=WORDS, input_tags=BIO_U_TAGS, expected_tags=BIOES_TAGS)
def test_merge_tags():
    """
    Check a few versions of the tag sequence merging

    The first group of cases checks the merged result for pairs of sequences.
    The second group checks that merging with each of the seq_err sequences
    raises a ValueError.
    """
    seq1 = ["O", "O", "O", "B-FOO", "E-FOO", "O"]
    seq2 = ["S-FOO", "O", "B-FOO", "E-FOO", "O", "O"]
    seq3 = ["B-FOO", "E-FOO", "B-FOO", "E-FOO", "O", "O"]
    seq_err = ["O", "B-FOO", "O", "B-FOO", "E-FOO", "O"]
    seq_err2 = ["O", "B-FOO", "O", "B-FOO", "B-FOO", "O"]
    seq_err3 = ["O", "B-FOO", "O", "B-FOO", "I-FOO", "O"]
    seq_err4 = ["O", "B-FOO", "O", "B-FOO", "I-FOO", "I-FOO"]

    # (first sequence, second sequence, expected merge)
    merge_cases = (
        (seq1, seq2, ["S-FOO", "O", "O", "B-FOO", "E-FOO", "O"]),
        (seq2, seq1, ["S-FOO", "O", "B-FOO", "E-FOO", "O", "O"]),
        (seq1, seq3, ["B-FOO", "E-FOO", "O", "B-FOO", "E-FOO", "O"]),
    )
    for first, second, expected in merge_cases:
        assert utils.merge_tags(first, second) == expected

    for bad_seq in (seq_err, seq_err2, seq_err3, seq_err4):
        with pytest.raises(ValueError):
            utils.merge_tags(seq1, bad_seq)