stanza-digphil / stanza /tests /ner /test_split_wikiner.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

4.43 kB

	"""
	Runs a few tests on the split_wikiner file
	"""

	import os
	import tempfile

	import pytest

	from stanza.utils.datasets.ner import split_wikiner

	from stanza.tests import *

	# module-level pytest marks: every test in this file is tagged "pipeline" and "travis"
	pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

	# A short excerpt from the Italian dataset, chopped into 20
	# blank-line-separated chunks so there are enough "sentences"
	# to exercise the train/dev/test splitting functionality.
	# Each non-blank line is one word followed by its NER tag.
	# NOTE(review): test_read_sentences rejoins the columns with a tab and
	# compares against this text, so word and tag are presumably meant to be
	# tab-separated - confirm the separators survived copy/paste.
	FBK_SAMPLE = """
	Il O
	Papa O
	si O
	aggrava O

	Le O
	condizioni O
	di O

	Papa O
	Giovanni PER
	Paolo PER
	II PER
	si O

	sono O
	aggravate O
	in O
	il O
	corso O

	di O
	la O
	giornata O
	di O
	giovedì O
	. O

	Il O
	portavoce O
	Navarro PER
	Valls PER

	ha O
	dichiarato O
	che O

	il O
	Santo O
	Padre O

	in O
	la O
	giornata O

	di O
	oggi O
	è O
	stato O

	colpito O
	da O
	una O
	affezione O

	altamente O
	febbrile O
	provocata O
	da O
	una O

	infezione O
	documentata O

	di O
	le O
	vie O
	urinarie O
	. O

	A O
	il O
	momento O

	non O
	è O
	previsto O
	il O
	ricovero O

	a O
	il O
	Policlinico LOC
	Gemelli LOC
	, O

	come O
	ha O
	precisato O
	il O

	responsabile O
	di O
	il O
	dipartimento O

	di O
	emergenza O
	professor O
	Rodolfo PER
	Proietti PER
	. O
	"""


	def test_read_sentences():
	    """
	    Checks that read_sentences returns the 20 pieces of the sample, unchanged

	    The sample is written to a temporary file, read back, and then
	    reassembled (tab between columns, newline between words, blank
	    line between sentences) to verify it matches the original text.
	    """
	    with tempfile.TemporaryDirectory() as tempdir:
	        raw_filename = os.path.join(tempdir, "raw.tsv")
	        # the sample contains non-ASCII text (e.g. "giovedì") and is read
	        # back as utf-8, so it must not be written with the platform
	        # default encoding
	        with open(raw_filename, "w", encoding="utf-8") as fout:
	            fout.write(FBK_SAMPLE)

	        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")
	        assert len(sentences) == 20

	        rebuilt = "\n\n".join(
	            "\n".join("\t".join(word) for word in sent)
	            for sent in sentences
	        )
	        assert FBK_SAMPLE.strip() == rebuilt

	def test_write_sentences():
	    """
	    Checks that sentences survive a write_sentences_to_file / read_sentences round trip

	    The sample is read, written out to a second file, and that second
	    file is read back and compared with what was originally read.
	    """
	    with tempfile.TemporaryDirectory() as tempdir:
	        raw_filename = os.path.join(tempdir, "raw.tsv")
	        # written as utf-8 to match the encoding used when reading it back
	        with open(raw_filename, "w", encoding="utf-8") as fout:
	            fout.write(FBK_SAMPLE)

	        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")
	        copy_filename = os.path.join(tempdir, "copy.tsv")
	        split_wikiner.write_sentences_to_file(sentences, copy_filename)

	        # read the *copy*: re-reading raw_filename would pass even if
	        # write_sentences_to_file wrote garbage
	        sent2 = split_wikiner.read_sentences(copy_filename, "utf-8")
	        assert sent2 == sentences

	def run_split_wikiner(expected_train=14, expected_dev=3, expected_test=3, **kwargs):
	    """
	    Runs split_wikiner on the sample and checks the size and content of each output section

	    expected_train, expected_dev, expected_test: number of sentences expected
	      in the respective output file.  expected_test is only checked when a
	      test section is requested.
	    kwargs: passed through unchanged to split_wikiner.split_wikiner.
	      "shuffle" and "test_section" are required, as this helper reads them
	      to decide what to check.  The output filenames checked here start
	      with "it_fbk", so callers are expected to pass prefix="it_fbk".
	    """
	    # look these up first so a missing option fails before any files are created
	    test_section = kwargs["test_section"]
	    shuffle = kwargs["shuffle"]

	    with tempfile.TemporaryDirectory() as indir, tempfile.TemporaryDirectory() as outdir:
	        raw_filename = os.path.join(indir, "raw.tsv")
	        # the sample contains non-ASCII text and is read back as utf-8,
	        # so it must not be written with the platform default encoding
	        with open(raw_filename, "w", encoding="utf-8") as fout:
	            fout.write(FBK_SAMPLE)

	        split_wikiner.split_wikiner(outdir, raw_filename, **kwargs)

	        train_file = os.path.join(outdir, "it_fbk.train.bio")
	        dev_file = os.path.join(outdir, "it_fbk.dev.bio")
	        test_file = os.path.join(outdir, "it_fbk.test.bio")

	        assert os.path.exists(train_file)
	        assert os.path.exists(dev_file)
	        # the test file must exist exactly when a test section was requested
	        if test_section:
	            assert os.path.exists(test_file)
	        else:
	            assert not os.path.exists(test_file)

	        train_sent = split_wikiner.read_sentences(train_file, "utf-8")
	        dev_sent = split_wikiner.read_sentences(dev_file, "utf-8")
	        assert len(train_sent) == expected_train
	        assert len(dev_sent) == expected_dev
	        if test_section:
	            test_sent = split_wikiner.read_sentences(test_file, "utf-8")
	            assert len(test_sent) == expected_test
	        else:
	            test_sent = []

	        # every input sentence must land in exactly one section
	        orig_sents = split_wikiner.read_sentences(raw_filename, "utf-8")
	        split_sents = train_sent + dev_sent + test_sent
	        if shuffle:
	            # order is not preserved when shuffling, so compare as sorted lists
	            orig_sents = sorted(orig_sents)
	            split_sents = sorted(split_sents)
	        assert orig_sents == split_sents

	def test_no_shuffle_split():
	    """
	    Default 14/3/3 split with a test section, comparing sentences in their original order
	    """
	    options = {"prefix": "it_fbk", "shuffle": False, "test_section": True}
	    run_split_wikiner(**options)

	def test_shuffle_split():
	    """
	    Default 14/3/3 split with a test section, with shuffling turned on
	    """
	    options = {"prefix": "it_fbk", "shuffle": True, "test_section": True}
	    run_split_wikiner(**options)

	def test_resize():
	    """
	    Non-default fractions: 0.6 train and 0.1 dev should give a 12/2/6 split of the 20 sentences
	    """
	    run_split_wikiner(
	        expected_train=12,
	        expected_dev=2,
	        expected_test=6,
	        train_fraction=0.6,
	        dev_fraction=0.1,
	        prefix="it_fbk",
	        shuffle=True,
	        test_section=True,
	    )

	def test_no_test_split():
	    """
	    No test section: 0.85 train should give 17 train sentences, the default 3 dev, and no test file
	    """
	    run_split_wikiner(
	        expected_train=17,
	        train_fraction=0.85,
	        prefix="it_fbk",
	        shuffle=False,
	        test_section=False,
	    )