import pytest from stanza.tests import * from stanza.models.common.vocab import EMPTY from stanza.models.ner import utils pytestmark = [pytest.mark.travis, pytest.mark.pipeline] WORDS = [["Unban", "Mox", "Opal"], ["Ragavan", "is", "red"], ["Urza", "Lord", "High", "Artificer", "goes", "infinite", "with", "Thopter", "Sword"]] BIO_TAGS = [["O", "B-ART", "I-ART"], ["B-MONKEY", "O", "B-COLOR"], ["B-PER", "I-PER", "I-PER", "I-PER", "O", "O", "O", "B-WEAPON", "B-WEAPON"]] BIO_U_TAGS = [["O", "B_ART", "I_ART"], ["B_MONKEY", "O", "B_COLOR"], ["B_PER", "I_PER", "I_PER", "I_PER", "O", "O", "O", "B_WEAPON", "B_WEAPON"]] BIOES_TAGS = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "S-WEAPON", "S-WEAPON"]] # note the problem with not using BIO tags - the consecutive tags for thopter/sword get treated as one item BASIC_TAGS = [["O", "ART", "ART"], ["MONKEY", "O", "COLOR"], [ "PER", "PER", "PER", "PER", "O", "O", "O", "WEAPON", "WEAPON"]] BASIC_BIOES = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "B-WEAPON", "E-WEAPON"]] ALT_BIO = [["O", "B-MANA", "I-MANA"], ["B-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]] ALT_BIOES = [["O", "B-MANA", "E-MANA"], ["S-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]] NONE_BIO = [["O", "B-MANA", "I-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]] NONE_BIOES = [["O", "B-MANA", "E-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]] EMPTY_BIO = [["O", "B-MANA", "I-MANA"], [EMPTY, EMPTY, EMPTY], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]] def test_normalize_empty_tags(): sentences = [[(word[0], (word[1],)) for word in zip(*sentence)] for sentence in zip(WORDS, NONE_BIO)] new_sentences = utils.normalize_empty_tags(sentences) expected = [[(word[0], (word[1],)) for word in zip(*sentence)] for sentence in zip(WORDS, EMPTY_BIO)] assert new_sentences == expected def check_reprocessed_tags(words, input_tags, expected_tags): sentences = [list(zip(x, y)) for x, y in zip(words, input_tags)] retagged = utils.process_tags(sentences=sentences, scheme="bioes") # process_tags selectively returns tuples or strings based on the input # so we don't need to fiddle with the expected output format here expected_retagged = [list(zip(x, y)) for x, y in zip(words, expected_tags)] assert retagged == expected_retagged def test_process_tags_bio(): check_reprocessed_tags(WORDS, BIO_TAGS, BIOES_TAGS) # check that the alternate version is correct as well # that way we can independently check the two layer version check_reprocessed_tags(WORDS, ALT_BIO, ALT_BIOES) def test_process_tags_with_none(): # if there is a block of tags with None in them, the Nones should be skipped over check_reprocessed_tags(WORDS, NONE_BIO, NONE_BIOES) def merge_tags(*tags): merged_tags = [[tuple(x) for x in zip(*sentences)] # combine tags such as ("O", "O"), ("B-ART", "B-MANA"), ... for sentences in zip(*tags)] # ... for each set of sentences return merged_tags def test_combined_tags_bio(): bio_tags = merge_tags(BIO_TAGS, ALT_BIO) expected = merge_tags(BIOES_TAGS, ALT_BIOES) check_reprocessed_tags(WORDS, bio_tags, expected) def test_combined_tags_mixed(): bio_tags = merge_tags(BIO_TAGS, ALT_BIOES) expected = merge_tags(BIOES_TAGS, ALT_BIOES) check_reprocessed_tags(WORDS, bio_tags, expected) def test_process_tags_basic(): check_reprocessed_tags(WORDS, BASIC_TAGS, BASIC_BIOES) def test_process_tags_bioes(): """ This one should not change, naturally """ check_reprocessed_tags(WORDS, BIOES_TAGS, BIOES_TAGS) check_reprocessed_tags(WORDS, BASIC_BIOES, BASIC_BIOES) def run_flattened(fn, tags): return fn([x for x in y for y in tags]) def test_check_bio(): assert utils.is_bio_scheme([x for y in BIO_TAGS for x in y]) assert not utils.is_bio_scheme([x for y in BIOES_TAGS for x in y]) assert not utils.is_bio_scheme([x for y in BASIC_TAGS for x in y]) assert not utils.is_bio_scheme([x for y in BASIC_BIOES for x in y]) def test_check_basic(): assert not utils.is_basic_scheme([x for y in BIO_TAGS for x in y]) assert not utils.is_basic_scheme([x for y in BIOES_TAGS for x in y]) assert utils.is_basic_scheme([x for y in BASIC_TAGS for x in y]) assert not utils.is_basic_scheme([x for y in BASIC_BIOES for x in y]) def test_underscores(): """ Check that the methods work if the inputs are underscores instead of dashes """ assert not utils.is_basic_scheme([x for y in BIO_U_TAGS for x in y]) check_reprocessed_tags(WORDS, BIO_U_TAGS, BIOES_TAGS) def test_merge_tags(): """ Check a few versions of the tag sequence merging """ seq1 = [ "O", "O", "O", "B-FOO", "E-FOO", "O"] seq2 = [ "S-FOO", "O", "B-FOO", "E-FOO", "O", "O"] seq3 = [ "B-FOO", "E-FOO", "B-FOO", "E-FOO", "O", "O"] seq_err = [ "O", "B-FOO", "O", "B-FOO", "E-FOO", "O"] seq_err2 = [ "O", "B-FOO", "O", "B-FOO", "B-FOO", "O"] seq_err3 = [ "O", "B-FOO", "O", "B-FOO", "I-FOO", "O"] seq_err4 = [ "O", "B-FOO", "O", "B-FOO", "I-FOO", "I-FOO"] result = utils.merge_tags(seq1, seq2) expected = [ "S-FOO", "O", "O", "B-FOO", "E-FOO", "O"] assert result == expected result = utils.merge_tags(seq2, seq1) expected = [ "S-FOO", "O", "B-FOO", "E-FOO", "O", "O"] assert result == expected result = utils.merge_tags(seq1, seq3) expected = [ "B-FOO", "E-FOO", "O", "B-FOO", "E-FOO", "O"] assert result == expected with pytest.raises(ValueError): result = utils.merge_tags(seq1, seq_err) with pytest.raises(ValueError): result = utils.merge_tags(seq1, seq_err2) with pytest.raises(ValueError): result = utils.merge_tags(seq1, seq_err3) with pytest.raises(ValueError): result = utils.merge_tags(seq1, seq_err4)