# stanza-digphil / stanza /tests /ner /test_split_wikiner.py
# Albin Thörn Cleland
# Clean initial commit with LFS
# 19b8775
"""
Runs a few tests on the split_wikiner file
"""
import os
import tempfile
import pytest
from stanza.utils.datasets.ner import split_wikiner
from stanza.tests import *
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
# Sample text from the Italian dataset, split into many pieces
# to test the splitting functionality.
# Each non-blank line holds a token followed by its NER tag (O, PER or LOC).
# The tests below write this text to a temporary file and parse it with
# split_wikiner.read_sentences; test_read_sentences expects 20 sentences.
# NOTE(review): test_read_sentences rebuilds the text with a tab between
# token and tag and a blank line between sentences, so this literal must use
# exactly that whitespace - confirm it has not been altered by copy/paste.
FBK_SAMPLE = """
Il O
Papa O
si O
aggrava O
Le O
condizioni O
di O
Papa O
Giovanni PER
Paolo PER
II PER
si O
sono O
aggravate O
in O
il O
corso O
di O
la O
giornata O
di O
giovedì O
. O
Il O
portavoce O
Navarro PER
Valls PER
ha O
dichiarato O
che O
il O
Santo O
Padre O
in O
la O
giornata O
di O
oggi O
è O
stato O
colpito O
da O
una O
affezione O
altamente O
febbrile O
provocata O
da O
una O
infezione O
documentata O
di O
le O
vie O
urinarie O
. O
A O
il O
momento O
non O
è O
previsto O
il O
ricovero O
a O
il O
Policlinico LOC
Gemelli LOC
, O
come O
ha O
precisato O
il O
responsabile O
di O
il O
dipartimento O
di O
emergenza O
professor O
Rodolfo PER
Proietti PER
. O
"""
def test_read_sentences():
    """
    Check that read_sentences splits the sample into 20 sentences without losing text

    The sample is written to a temporary file, read back, and re-serialized
    with a tab between the fields of a word, a newline between words and a
    blank line between sentences.  The result must equal the stripped sample.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        raw_filename = os.path.join(tempdir, "raw.tsv")
        # The sample contains non-ASCII characters and is read back as utf-8,
        # so write it as utf-8 instead of the platform's default encoding.
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")
        assert len(sentences) == 20

        # Rebuild the file contents from the parsed sentences
        rebuilt_sentences = ["\n".join("\t".join(word) for word in sent) for sent in sentences]
        text = "\n\n".join(rebuilt_sentences)
        assert FBK_SAMPLE.strip() == text
def test_write_sentences():
    """
    Check that sentences written by write_sentences_to_file are read back unchanged

    The sample is parsed, written to a second file, and that second file is
    parsed again.  Both parses must be equal.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        raw_filename = os.path.join(tempdir, "raw.tsv")
        # The sample contains non-ASCII characters and is read back as utf-8,
        # so write it as utf-8 instead of the platform's default encoding.
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")

        copy_filename = os.path.join(tempdir, "copy.tsv")
        split_wikiner.write_sentences_to_file(sentences, copy_filename)

        # Read the copy rather than the raw file: re-reading the raw file
        # would pass even if write_sentences_to_file wrote nothing useful.
        sent2 = split_wikiner.read_sentences(copy_filename, "utf-8")
        assert sent2 == sentences
def run_split_wikiner(expected_train=14, expected_dev=3, expected_test=3, **kwargs):
    """
    Runs a test using various parameters to check the results of the splitting process

    The sample is written to a temporary file and split into a temporary
    output directory.  The helper then checks which output files exist, how
    many sentences each one holds, and that the pieces together contain
    exactly the sentences of the input.

    expected_train, expected_dev, expected_test: number of sentences expected
      in each output file.  expected_test is only checked if a test section
      was requested.
    kwargs: passed on to split_wikiner.split_wikiner.  Must include
      "test_section" and "shuffle", which this helper also reads (a missing
      key raises KeyError).  The output file names checked here are hard-coded
      to start with "it_fbk", so callers are expected to pass prefix="it_fbk".
    """
    # Read the flags once, up front, so a missing key fails immediately
    test_section = kwargs["test_section"]
    shuffle = kwargs["shuffle"]

    with tempfile.TemporaryDirectory() as indir:
        raw_filename = os.path.join(indir, "raw.tsv")
        # The sample contains non-ASCII characters and is read back as utf-8,
        # so write it as utf-8 instead of the platform's default encoding.
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        with tempfile.TemporaryDirectory() as outdir:
            split_wikiner.split_wikiner(outdir, raw_filename, **kwargs)

            train_file = os.path.join(outdir, "it_fbk.train.bio")
            dev_file = os.path.join(outdir, "it_fbk.dev.bio")
            test_file = os.path.join(outdir, "it_fbk.test.bio")

            assert os.path.exists(train_file)
            assert os.path.exists(dev_file)
            # The test file must exist exactly when a test section was requested
            if test_section:
                assert os.path.exists(test_file)
            else:
                assert not os.path.exists(test_file)

            train_sent = split_wikiner.read_sentences(train_file, "utf-8")
            dev_sent = split_wikiner.read_sentences(dev_file, "utf-8")
            assert len(train_sent) == expected_train
            assert len(dev_sent) == expected_dev
            if test_section:
                test_sent = split_wikiner.read_sentences(test_file, "utf-8")
                assert len(test_sent) == expected_test
            else:
                test_sent = []

            orig_sents = split_wikiner.read_sentences(raw_filename, "utf-8")
            split_sents = train_sent + dev_sent + test_sent
            if shuffle:
                # The order is not preserved when shuffling, so compare
                # the sentences in sorted order instead
                orig_sents = sorted(orig_sents)
                split_sents = sorted(split_sents)
            assert orig_sents == split_sents
def test_no_shuffle_split():
    """
    Split without shuffling, with a test section, using the default expected sizes
    """
    options = {"prefix": "it_fbk", "shuffle": False, "test_section": True}
    run_split_wikiner(**options)
def test_shuffle_split():
    """
    Split with shuffling, with a test section, using the default expected sizes
    """
    options = {"prefix": "it_fbk", "shuffle": True, "test_section": True}
    run_split_wikiner(**options)
def test_resize():
    """
    Split with non-default train and dev fractions and check the resulting sizes (12 / 2 / 6)
    """
    expected_sizes = {"expected_train": 12, "expected_dev": 2, "expected_test": 6}
    split_options = {
        "train_fraction": 0.6,
        "dev_fraction": 0.1,
        "prefix": "it_fbk",
        "shuffle": True,
        "test_section": True,
    }
    run_split_wikiner(**expected_sizes, **split_options)
def test_no_test_split():
    """
    Split without a test section: 17 train sentences are expected and no test file
    """
    split_options = {
        "train_fraction": 0.85,
        "prefix": "it_fbk",
        "shuffle": False,
        "test_section": False,
    }
    run_split_wikiner(expected_train=17, **split_options)