|
|
""" |
|
|
Runs a few tests on the split_wikiner file |
|
|
""" |
|
|
|
|
|
import os |
|
|
import tempfile |
|
|
|
|
|
import pytest |
|
|
|
|
|
from stanza.utils.datasets.ner import split_wikiner |
|
|
|
|
|
from stanza.tests import * |
|
|
|
|
|
# Marks applied by pytest to every test collected from this module.
pytestmark = [
    pytest.mark.pipeline,
    pytest.mark.travis,
]
|
|
|
|
|
|
|
|
|
|
|
# Small Italian NER fixture shared by the tests in this file.
#
# Format: one token per line as "<word>\t<tag>", with a blank line between
# sentences.  The tests below expect exactly 20 sentences and expect that
# joining the parsed fields with tabs reproduces FBK_SAMPLE.strip().
#
# The tab separators are spelled as explicit \t escapes so that they cannot
# be silently turned into spaces by an editor or a copy/paste.
FBK_SAMPLE = """
Il\tO
Papa\tO
si\tO
aggrava\tO

Le\tO
condizioni\tO
di\tO

Papa\tO
Giovanni\tPER
Paolo\tPER
II\tPER
si\tO

sono\tO
aggravate\tO
in\tO
il\tO
corso\tO

di\tO
la\tO
giornata\tO
di\tO
giovedì\tO
.\tO

Il\tO
portavoce\tO
Navarro\tPER
Valls\tPER

ha\tO
dichiarato\tO
che\tO

il\tO
Santo\tO
Padre\tO

in\tO
la\tO
giornata\tO

di\tO
oggi\tO
è\tO
stato\tO

colpito\tO
da\tO
una\tO
affezione\tO

altamente\tO
febbrile\tO
provocata\tO
da\tO
una\tO

infezione\tO
documentata\tO

di\tO
le\tO
vie\tO
urinarie\tO
.\tO

A\tO
il\tO
momento\tO

non\tO
è\tO
previsto\tO
il\tO
ricovero\tO

a\tO
il\tO
Policlinico\tLOC
Gemelli\tLOC
,\tO

come\tO
ha\tO
precisato\tO
il\tO

responsabile\tO
di\tO
il\tO
dipartimento\tO

di\tO
emergenza\tO
professor\tO
Rodolfo\tPER
Proietti\tPER
.\tO
"""
|
|
|
|
|
|
|
|
def test_read_sentences():
    """
    Check that read_sentences recovers every sentence and token of FBK_SAMPLE

    The sample is written to a temporary file, read back, and re-serialized
    (tab between the fields of a word, newline between words, blank line
    between sentences).  The result must equal the stripped sample text.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        raw_filename = os.path.join(tempdir, "raw.tsv")
        # The sample contains non-ASCII characters and is read back as utf-8,
        # so write it as utf-8 instead of the platform default encoding.
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")
        assert len(sentences) == 20

        text = "\n\n".join(
            "\n".join("\t".join(word) for word in sent)
            for sent in sentences
        )
        assert FBK_SAMPLE.strip() == text
|
|
|
|
|
def test_write_sentences():
    """
    Check that sentences survive a write_sentences_to_file / read_sentences round trip

    The sample is read from a temporary file, written to a second file with
    write_sentences_to_file, and that second file is read back.  The two
    lists of sentences must be equal.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        raw_filename = os.path.join(tempdir, "raw.tsv")
        # The sample contains non-ASCII characters and is read back as utf-8,
        # so write it as utf-8 instead of the platform default encoding.
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")
        copy_filename = os.path.join(tempdir, "copy.tsv")
        split_wikiner.write_sentences_to_file(sentences, copy_filename)

        # Read the file that was just written.  Re-reading raw_filename here
        # would compare the input with itself and never exercise the writer.
        sent2 = split_wikiner.read_sentences(copy_filename, "utf-8")
        assert sent2 == sentences
|
|
|
|
|
def run_split_wikiner(expected_train=14, expected_dev=3, expected_test=3, **kwargs):
    """
    Runs a test using various parameters to check the results of the splitting process

    FBK_SAMPLE is written to a temporary input file, split_wikiner.split_wikiner
    is run on it into a temporary output directory, and the resulting files are
    checked for existence, for the number of sentences they hold, and for
    containing between them exactly the sentences of the input.

    expected_train, expected_dev, expected_test: number of sentences expected
      in the train, dev and test files.  expected_test is only checked when a
      test section is requested.
    kwargs: forwarded unchanged to split_wikiner.split_wikiner.  Must contain
      "shuffle" and "test_section" (a KeyError is raised otherwise), since this
      helper needs them to know what output to expect.  The output filenames
      checked here are hard-coded as it_fbk.*.bio, which is why the callers in
      this file pass prefix="it_fbk".
    """
    # Look these up before doing any work so a missing option fails right away.
    test_section = kwargs["test_section"]
    shuffle = kwargs["shuffle"]

    with tempfile.TemporaryDirectory() as indir:
        raw_filename = os.path.join(indir, "raw.tsv")
        # The sample contains non-ASCII characters and is read back as utf-8,
        # so write it as utf-8 instead of the platform default encoding.
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        with tempfile.TemporaryDirectory() as outdir:
            split_wikiner.split_wikiner(outdir, raw_filename, **kwargs)

            train_file = os.path.join(outdir, "it_fbk.train.bio")
            dev_file = os.path.join(outdir, "it_fbk.dev.bio")
            test_file = os.path.join(outdir, "it_fbk.test.bio")

            assert os.path.exists(train_file)
            assert os.path.exists(dev_file)
            # A test file must exist if and only if a test section was requested
            if test_section:
                assert os.path.exists(test_file)
            else:
                assert not os.path.exists(test_file)

            train_sent = split_wikiner.read_sentences(train_file, "utf-8")
            dev_sent = split_wikiner.read_sentences(dev_file, "utf-8")
            assert len(train_sent) == expected_train
            assert len(dev_sent) == expected_dev
            if test_section:
                test_sent = split_wikiner.read_sentences(test_file, "utf-8")
                assert len(test_sent) == expected_test
            else:
                test_sent = []

            orig_sents = split_wikiner.read_sentences(raw_filename, "utf-8")
            split_sents = train_sent + dev_sent + test_sent
            if shuffle:
                # Order is not preserved when shuffling, so compare the
                # sentences irrespective of order.
                orig_sents = sorted(orig_sents)
                split_sents = sorted(split_sents)
            assert orig_sents == split_sents
|
|
|
|
|
def test_no_shuffle_split():
    """
    Split without shuffling, using the default expected sizes (14 / 3 / 3)
    """
    options = {
        "prefix": "it_fbk",
        "shuffle": False,
        "test_section": True,
    }
    run_split_wikiner(**options)
|
|
|
|
|
def test_shuffle_split():
    """
    Split with shuffling, using the default expected sizes (14 / 3 / 3)
    """
    options = {
        "prefix": "it_fbk",
        "shuffle": True,
        "test_section": True,
    }
    run_split_wikiner(**options)
|
|
|
|
|
def test_resize():
    """
    Split with non-default fractions: 0.6 train and 0.1 dev, expecting 12 / 2 / 6 sentences
    """
    expected_sizes = {
        "expected_train": 12,
        "expected_dev": 2,
        "expected_test": 6,
    }
    options = {
        "train_fraction": 0.6,
        "dev_fraction": 0.1,
        "prefix": "it_fbk",
        "shuffle": True,
        "test_section": True,
    }
    run_split_wikiner(**expected_sizes, **options)
|
|
|
|
|
def test_no_test_split():
    """
    Split into train and dev only: 0.85 train, expecting 17 train sentences and no test file
    """
    options = {
        "train_fraction": 0.85,
        "prefix": "it_fbk",
        "shuffle": False,
        "test_section": False,
    }
    run_split_wikiner(expected_train=17, **options)
|
|
|
|
|
|