stanza-digphil/stanza/tests/constituency/test_text_processing.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

4.35 kB

	"""
	Run through the various text processing methods for using the parser on text files / directories

	Uses a simple tree where the parser should always get it right, but things could potentially go wrong
	"""

	import glob
	import os
	import pytest

	from stanza import Pipeline

	from stanza.models.constituency import text_processing
	from stanza.models.constituency import tree_reader
	from stanza.tests import TEST_MODELS_DIR

	# Module-level pytest markers: every test in this file is tagged `pipeline` and `travis`
	pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

	@pytest.fixture(scope="module")
	def pipeline():
	    """
	    Build the English tokenize / pos / constituency pipeline shared by the tests in this module

	    The pipeline is created with tokenize_pretokenized=True and loads its models from TEST_MODELS_DIR
	    """
	    pipeline_kwargs = {
	        "dir": TEST_MODELS_DIR,
	        "lang": "en",
	        "processors": "tokenize, pos, constituency",
	        "tokenize_pretokenized": True,
	    }
	    return Pipeline(**pipeline_kwargs)

	def test_read_tokenized_file(tmp_path):
	    """
	    Write a two line tokenized file and check the sentences and ids which are read back
	    """
	    input_path = tmp_path / "test_input.txt"
	    # the underscore in "a_small" should come back as a space inside a single token
	    input_path.write_text("This is a_small test\nLine two\n")

	    sentences, sentence_ids = text_processing.read_tokenized_file(str(input_path))

	    expected_sentences = [['This', 'is', 'a small', 'test'], ['Line', 'two']]
	    assert sentences == expected_sentences
	    assert sentence_ids == [None, None]

	def test_parse_tokenized_sentences(pipeline):
	    """
	    Parse a single pretokenized sentence and check the one tree which comes back
	    """
	    parser = pipeline.processors["constituency"]._model
	    parser_args = parser.args

	    tokenized = [["This", "is", "a", "test"]]
	    results = text_processing.parse_tokenized_sentences(parser_args, parser, [pipeline], tokenized)

	    # one result for the one sentence ...
	    all_predictions = [result.predictions for result in results]
	    assert len(all_predictions) == 1

	    # ... and that result carries exactly one scored tree
	    candidates = all_predictions[0]
	    assert len(candidates) == 1

	    expected = "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))"
	    assert f"{candidates[0].tree}" == expected

	def test_parse_text(tmp_path, pipeline):
	    """
	    Parse a two sentence tokenized file with the already loaded model and check the trees written out
	    """
	    parser = pipeline.processors["constituency"]._model
	    parser_args = parser.args

	    input_path = tmp_path / "test_input.txt"
	    input_path.write_text("This is a test\nThis is another test\n")
	    output_path = str(tmp_path / "test_output.txt")

	    text_processing.parse_text(parser_args, parser, [pipeline],
	                               tokenized_file=str(input_path), predict_file=output_path)

	    expected_trees = [
	        "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
	        "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))",
	    ]
	    # read the predictions back from disk and compare them in their text form
	    parsed_trees = [f"{tree}" for tree in tree_reader.read_treebank(output_path)]
	    assert parsed_trees == expected_trees

	def test_parse_dir(tmp_path, pipeline):
	    """
	    Parse a directory holding two single sentence files and check the tree written for each

	    The output files are visited in sorted order, so the tree for f1 is expected before the tree for f2
	    """
	    con_processor = pipeline.processors["constituency"]
	    model = con_processor._model
	    args = model.args

	    raw_dir = str(tmp_path / "input")
	    os.makedirs(raw_dir)
	    raw_f1 = str(tmp_path / "input" / "f1.txt")
	    raw_f2 = str(tmp_path / "input" / "f2.txt")
	    output_dir = str(tmp_path / "output")

	    with open(raw_f1, "w") as fout:
	        fout.write("This is a test")
	    with open(raw_f2, "w") as fout:
	        fout.write("This is another test")

	    text_processing.parse_dir(args, model, [pipeline], raw_dir, output_dir)
	    output_files = sorted(glob.glob(os.path.join(output_dir, "*")))
	    expected_trees = ["(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
	                      "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))"]

	    # zip() stops at the shorter sequence, so without this check a missing
	    # output file would skip the assertions below and the test would still pass
	    assert len(output_files) == len(expected_trees)

	    for output_file, expected_tree in zip(output_files, expected_trees):
	        trees = tree_reader.read_treebank(output_file)
	        assert len(trees) == 1
	        assert "{}".format(trees[0]) == expected_tree

	def test_load_model_parse_text(tmp_path, pipeline):
	    """
	    Parse a two sentence tokenized file via load_model_parse_text and check the trees written out

	    The input and output paths are passed through the args under 'tokenized_file' and 'predict_file',
	    and the model is given by its path rather than as an already loaded object.
	    """
	    # Named distinctly from test_parse_text: two functions with the same name in
	    # one module would leave only the later one visible to pytest.
	    con_processor = pipeline.processors["constituency"]
	    model = con_processor._model
	    # work on a copy so the keys set below do not end up in the args of the
	    # model held by the module scoped pipeline fixture
	    args = dict(model.args)

	    model_path = con_processor._config['model_path']

	    raw_file = str(tmp_path / "test_input.txt")
	    with open(raw_file, "w") as fout:
	        fout.write("This is a test\nThis is another test\n")
	    output_file = str(tmp_path / "test_output.txt")

	    args['tokenized_file'] = raw_file
	    args['predict_file'] = output_file

	    text_processing.load_model_parse_text(args, model_path, [pipeline])
	    trees = tree_reader.read_treebank(output_file)
	    trees = ["{}".format(x) for x in trees]
	    expected_trees = ["(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
	                      "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))"]
	    assert trees == expected_trees