|
|
import pytest |
|
|
|
|
|
from stanza.tests import * |
|
|
|
|
|
from stanza.models.common.vocab import EMPTY |
|
|
from stanza.models.ner import utils |
|
|
|
|
|
# Module-level pytest marks: every test in this file carries the
# "travis" and "pipeline" markers.
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
|
|
|
|
|
# Three toy sentences; every tag fixture below has one tag list per sentence,
# aligned word-for-word with WORDS.
WORDS = [
    ["Unban", "Mox", "Opal"],
    ["Ragavan", "is", "red"],
    ["Urza", "Lord", "High", "Artificer", "goes", "infinite", "with", "Thopter", "Sword"],
]

# BIO tags with a dash between the prefix and the entity type.
BIO_TAGS = [
    ["O", "B-ART", "I-ART"],
    ["B-MONKEY", "O", "B-COLOR"],
    ["B-PER", "I-PER", "I-PER", "I-PER", "O", "O", "O", "B-WEAPON", "B-WEAPON"],
]

# Same as BIO_TAGS, but with an underscore instead of a dash.
BIO_U_TAGS = [
    ["O", "B_ART", "I_ART"],
    ["B_MONKEY", "O", "B_COLOR"],
    ["B_PER", "I_PER", "I_PER", "I_PER", "O", "O", "O", "B_WEAPON", "B_WEAPON"],
]

# Expected BIOES form of BIO_TAGS (and of BIO_U_TAGS).
BIOES_TAGS = [
    ["O", "B-ART", "E-ART"],
    ["S-MONKEY", "O", "S-COLOR"],
    ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "S-WEAPON", "S-WEAPON"],
]

# Bare entity types with no positional prefix.
BASIC_TAGS = [
    ["O", "ART", "ART"],
    ["MONKEY", "O", "COLOR"],
    ["PER", "PER", "PER", "PER", "O", "O", "O", "WEAPON", "WEAPON"],
]

# Expected BIOES form of BASIC_TAGS; adjacent identical types become one span,
# hence B-WEAPON / E-WEAPON at the end rather than two S-WEAPON tags.
BASIC_BIOES = [
    ["O", "B-ART", "E-ART"],
    ["S-MONKEY", "O", "S-COLOR"],
    ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "B-WEAPON", "E-WEAPON"],
]

# A second, independent BIO annotation layer over the same words.
ALT_BIO = [
    ["O", "B-MANA", "I-MANA"],
    ["B-CRE", "O", "O"],
    ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"],
]

# Expected BIOES form of ALT_BIO.
ALT_BIOES = [
    ["O", "B-MANA", "E-MANA"],
    ["S-CRE", "O", "O"],
    ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"],
]

# ALT_BIO with the middle sentence's tags replaced by None.
NONE_BIO = [
    ["O", "B-MANA", "I-MANA"],
    [None, None, None],
    ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"],
]

# Expected BIOES form of NONE_BIO; the None tags stay None.
NONE_BIOES = [
    ["O", "B-MANA", "E-MANA"],
    [None, None, None],
    ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"],
]
|
|
# NONE_BIO with each None replaced by the vocab EMPTY marker.
EMPTY_BIO = [
    ["O", "B-MANA", "I-MANA"],
    [EMPTY, EMPTY, EMPTY],
    ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"],
]
|
|
|
|
|
def test_normalize_empty_tags():
    """
    None tags given to normalize_empty_tags should come back as EMPTY
    """
    def with_single_tag_layer(tag_rows):
        # Each word becomes (text, (tag,)): a one-element tuple of tags per word
        return [[(text, (tag,)) for text, tag in zip(sentence_words, sentence_tags)]
                for sentence_words, sentence_tags in zip(WORDS, tag_rows)]

    none_tagged = with_single_tag_layer(NONE_BIO)
    normalized = utils.normalize_empty_tags(none_tagged)
    empty_tagged = with_single_tag_layer(EMPTY_BIO)
    assert normalized == empty_tagged
|
|
|
|
|
def check_reprocessed_tags(words, input_tags, expected_tags):
    """
    Run process_tags with the bioes scheme and compare against the expected tags

    words, input_tags and expected_tags are parallel lists of sentences;
    each sentence is paired up word-by-word with its tags before processing.
    """
    def attach(tag_rows):
        # one list of (word, tag) pairs per sentence
        return [list(zip(sentence_words, sentence_tags))
                for sentence_words, sentence_tags in zip(words, tag_rows)]

    tagged_input = attach(input_tags)
    actual = utils.process_tags(sentences=tagged_input, scheme="bioes")
    wanted = attach(expected_tags)
    assert actual == wanted
|
|
|
|
|
def test_process_tags_bio():
    """
    BIO input should be converted to the matching BIOES tags
    """
    bio_cases = (
        (BIO_TAGS, BIOES_TAGS),
        (ALT_BIO, ALT_BIOES),
    )
    for bio_input, bioes_expected in bio_cases:
        check_reprocessed_tags(WORDS, bio_input, bioes_expected)
|
|
|
|
|
def test_process_tags_with_none():
    """
    A sentence tagged entirely with None should keep its None tags
    while the other sentences are converted to BIOES
    """
    check_reprocessed_tags(words=WORDS, input_tags=NONE_BIO, expected_tags=NONE_BIOES)
|
|
|
|
|
def merge_tags(*tags):
    """
    Combine several parallel tag layers into one tuple of tags per word

    Each argument is a list of sentences of tags.  The result has one list
    per sentence, and each word in it is a tuple holding that word's tag
    from every layer, in argument order.
    """
    combined = []
    # walk the layers sentence by sentence ...
    for sentence_layers in zip(*tags):
        per_word = []
        # ... and within a sentence, word by word
        for word_tags in zip(*sentence_layers):
            per_word.append(tuple(word_tags))
        combined.append(per_word)
    return combined
|
|
|
|
|
def test_combined_tags_bio():
    """
    Two BIO layers per word should both come out as BIOES
    """
    check_reprocessed_tags(
        WORDS,
        merge_tags(BIO_TAGS, ALT_BIO),
        merge_tags(BIOES_TAGS, ALT_BIOES),
    )
|
|
|
|
|
def test_combined_tags_mixed():
    """
    One BIO layer and one layer that is already BIOES should both come out as BIOES
    """
    check_reprocessed_tags(
        WORDS,
        merge_tags(BIO_TAGS, ALT_BIOES),
        merge_tags(BIOES_TAGS, ALT_BIOES),
    )
|
|
|
|
|
def test_process_tags_basic():
    """
    Prefix-less tags should be converted to BIOES
    """
    check_reprocessed_tags(words=WORDS, input_tags=BASIC_TAGS, expected_tags=BASIC_BIOES)
|
|
|
|
|
def test_process_tags_bioes():
    """
    Input which is already BIOES should come back unchanged
    """
    for already_bioes in (BIOES_TAGS, BASIC_BIOES):
        check_reprocessed_tags(WORDS, already_bioes, already_bioes)
|
|
|
|
|
def run_flattened(fn, tags):
    """
    Call fn on the tags of all sentences flattened into a single list

    fn: a callable which takes one flat list of tags
    tags: an iterable of sentences, each an iterable of tags

    Returns whatever fn returns for the flattened list.
    """
    # Comprehension clauses read left to right like nested for loops:
    # sentences must be the outer clause so that `sentence` is bound
    # before the inner clause iterates over it.
    return fn([tag for sentence in tags for tag in sentence])
|
|
|
|
|
def test_check_bio():
    """
    Of the tag fixtures, only BIO_TAGS should be recognized as a BIO scheme
    """
    def flat(tag_rows):
        # is_bio_scheme is given one flat list of tags, not a list of sentences
        return [tag for row in tag_rows for tag in row]

    assert utils.is_bio_scheme(flat(BIO_TAGS))
    for not_bio in (BIOES_TAGS, BASIC_TAGS, BASIC_BIOES):
        assert not utils.is_bio_scheme(flat(not_bio))
|
|
|
|
|
def test_check_basic():
    """
    Of the tag fixtures, only BASIC_TAGS should be recognized as a basic scheme
    """
    cases = (
        (BIO_TAGS, False),
        (BIOES_TAGS, False),
        (BASIC_TAGS, True),
        (BASIC_BIOES, False),
    )
    for tag_rows, should_be_basic in cases:
        flat = [tag for row in tag_rows for tag in row]
        # compare truthiness only, same as a plain assert / assert not
        assert bool(utils.is_basic_scheme(flat)) == should_be_basic
|
|
|
|
|
def test_underscores():
    """
    The methods should treat underscore-separated tags the same as dash-separated tags
    """
    flat_underscore_tags = [tag for row in BIO_U_TAGS for tag in row]
    assert not utils.is_basic_scheme(flat_underscore_tags)

    # the expected output uses dashes even though the input used underscores
    check_reprocessed_tags(WORDS, BIO_U_TAGS, BIOES_TAGS)
|
|
|
|
|
def test_merge_tags():
    """
    Exercise utils.merge_tags on a few pairs of tag sequences

    The first three pairs have a defined merged result.  The remaining
    four second sequences are each expected to raise a ValueError.
    """
    base         = [    "O",     "O",     "O", "B-FOO", "E-FOO",     "O"]
    single_first = ["S-FOO",     "O", "B-FOO", "E-FOO",     "O",     "O"]
    two_spans    = ["B-FOO", "E-FOO", "B-FOO", "E-FOO",     "O",     "O"]
    bad_inputs = (
        [    "O", "B-FOO",     "O", "B-FOO", "E-FOO",     "O"],
        [    "O", "B-FOO",     "O", "B-FOO", "B-FOO",     "O"],
        [    "O", "B-FOO",     "O", "B-FOO", "I-FOO",     "O"],
        [    "O", "B-FOO",     "O", "B-FOO", "I-FOO", "I-FOO"],
    )

    # (first argument, second argument, expected merge); argument order matters
    good_cases = (
        (base, single_first, ["S-FOO",     "O",     "O", "B-FOO", "E-FOO",     "O"]),
        (single_first, base, ["S-FOO",     "O", "B-FOO", "E-FOO",     "O",     "O"]),
        (base, two_spans,    ["B-FOO", "E-FOO",     "O", "B-FOO", "E-FOO",     "O"]),
    )
    for first, second, merged in good_cases:
        assert utils.merge_tags(first, second) == merged

    for bad in bad_inputs:
        with pytest.raises(ValueError):
            utils.merge_tags(base, bad)
|
|
|
|
|
|