File size: 4,352 Bytes
19b8775 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
"""
Run through the various text processing methods for using the parser on text files / directories
Uses a simple tree where the parser should always get it right, but things could potentially go wrong
"""
import glob
import os
import pytest
from stanza import Pipeline
from stanza.models.constituency import text_processing
from stanza.models.constituency import tree_reader
from stanza.tests import TEST_MODELS_DIR
# Module-level pytest marks: applied to every test collected from this file
pytestmark = [
    pytest.mark.pipeline,
    pytest.mark.travis,
]
@pytest.fixture(scope="module")
def pipeline():
    """
    Build one English Pipeline (tokenize, pos, constituency) for the whole module

    The fixture is module scoped, so the Pipeline is constructed once and reused.
    tokenize_pretokenized=True is passed because the tests supply text that is
    already split into tokens.
    """
    pipeline_kwargs = {
        "dir": TEST_MODELS_DIR,
        "lang": "en",
        "processors": "tokenize, pos, constituency",
        "tokenize_pretokenized": True,
    }
    return Pipeline(**pipeline_kwargs)
def test_read_tokenized_file(tmp_path):
    """
    Check that read_tokenized_file splits lines into tokens and returns one id per line

    The input file has no sentence ids, so each returned id is expected to be None.
    """
    input_path = tmp_path / "test_input.txt"
    # test that the underscore token comes back with spaces
    input_path.write_text("This is a_small test\nLine two\n")

    expected_text = [['This', 'is', 'a small', 'test'], ['Line', 'two']]
    expected_ids = [None, None]

    text, ids = text_processing.read_tokenized_file(str(input_path))
    assert text == expected_text
    assert ids == expected_ids
def test_parse_tokenized_sentences(pipeline):
    """
    Parse a single pretokenized sentence in memory and check the predicted tree

    The constituency model and its args are taken from the shared pipeline, and
    the pipeline itself is passed as the only entry in the list handed to
    parse_tokenized_sentences.
    """
    model = pipeline.processors["constituency"]._model

    tokenized = [["This", "is", "a", "test"]]
    parsed = text_processing.parse_tokenized_sentences(model.args, model, [pipeline], tokenized)

    # one result per input sentence, and exactly one scored tree for that sentence
    all_predictions = [item.predictions for item in parsed]
    assert len(all_predictions) == 1
    sentence_predictions = all_predictions[0]
    assert len(sentence_predictions) == 1

    expected = "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))"
    result = f"{sentence_predictions[0].tree}"
    assert result == expected
def test_parse_text(tmp_path, pipeline):
    """
    Run parse_text on a two-line tokenized file and check the trees written to the output file

    Uses the model already loaded in the shared pipeline.
    """
    model = pipeline.processors["constituency"]._model

    input_path = tmp_path / "test_input.txt"
    predict_path = tmp_path / "test_output.txt"
    input_path.write_text("This is a test\nThis is another test\n")

    text_processing.parse_text(model.args,
                               model,
                               [pipeline],
                               tokenized_file=str(input_path),
                               predict_file=str(predict_path))

    # read the predictions back in and compare their one-line text form
    written = [f"{tree}" for tree in tree_reader.read_treebank(str(predict_path))]
    assert written == [
        "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
        "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))",
    ]
def test_parse_dir(tmp_path, pipeline):
    """
    Run parse_dir on a directory of two single-sentence files and check each output file

    The output files are sorted by path before being compared, so the
    expected trees are listed in the order of the input names f1, f2.
    """
    con_processor = pipeline.processors["constituency"]
    model = con_processor._model
    args = model.args

    raw_dir = str(tmp_path / "input")
    os.makedirs(raw_dir)
    raw_f1 = str(tmp_path / "input" / "f1.txt")
    raw_f2 = str(tmp_path / "input" / "f2.txt")
    output_dir = str(tmp_path / "output")

    with open(raw_f1, "w") as fout:
        fout.write("This is a test")
    with open(raw_f2, "w") as fout:
        fout.write("This is another test")

    text_processing.parse_dir(args, model, [pipeline], raw_dir, output_dir)
    output_files = sorted(glob.glob(os.path.join(output_dir, "*")))

    expected_trees = ["(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
                      "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))"]

    # zip() stops at the shorter sequence, so without this check the loop
    # below would pass vacuously if parse_dir wrote no (or too few) files
    assert len(output_files) == len(expected_trees)

    for output_file, expected_tree in zip(output_files, expected_trees):
        trees = tree_reader.read_treebank(output_file)
        assert len(trees) == 1
        assert "{}".format(trees[0]) == expected_tree
def test_load_model_parse_text(tmp_path, pipeline):
    """
    Run load_model_parse_text on a two-line tokenized file and check the trees written out

    Unlike the parse_text test, the input and output paths are passed through
    the args dict and the model is identified by its path rather than passed
    as an object.

    This test used to be named test_parse_text, the same as an earlier test in
    this module.  The second definition replaced the first at import time, so
    pytest only ever collected one of the two.  The name is now unique.
    """
    con_processor = pipeline.processors["constituency"]
    model = con_processor._model
    # copy the args so that adding the file paths does not alter the
    # args of the model shared with the other tests in this module
    args = dict(model.args)
    model_path = con_processor._config['model_path']

    raw_file = str(tmp_path / "test_input.txt")
    with open(raw_file, "w") as fout:
        fout.write("This is a test\nThis is another test\n")
    output_file = str(tmp_path / "test_output.txt")

    args['tokenized_file'] = raw_file
    args['predict_file'] = output_file

    text_processing.load_model_parse_text(args, model_path, [pipeline])

    trees = tree_reader.read_treebank(output_file)
    trees = ["{}".format(x) for x in trees]
    expected_trees = ["(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
                      "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))"]
    assert trees == expected_trees
|