stanza-digphil / stanza /tests /server /test_java_protobuf_requests.py
Albin Thörn Cleland
Clean initial commit with LFS
19b8775
import tempfile
import pytest
from stanza.models.common.utils import misc_to_space_after, space_after_to_misc
from stanza.models.constituency import tree_reader
from stanza.server import java_protobuf_requests
from stanza.tests import *
from stanza.utils.conll import CoNLL
from stanza.protobuf import DependencyGraph
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
def check_tree(proto_tree, py_tree, py_score):
tree, tree_score = java_protobuf_requests.from_tree(proto_tree)
assert tree_score == py_score
assert tree == py_tree
def test_build_tree():
text="((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))\n( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
trees = tree_reader.read_trees(text)
assert len(trees) == 2
for tree in trees:
proto_tree = java_protobuf_requests.build_tree(trees[0], 1.0)
check_tree(proto_tree, trees[0], 1.0)
ESTONIAN_EMPTY_DEPS = """
# sent_id = ewtb2_000035_15
# text = Ja paari aasta pärast rôômalt maasikatele ...
1 Ja ja CCONJ J _ 3 cc 5.1:cc _
2 paari paar NUM N Case=Gen|Number=Sing|NumForm=Word|NumType=Card 3 nummod 3:nummod _
3 aasta aasta NOUN S Case=Gen|Number=Sing 0 root 5.1:obl _
4 pärast pärast ADP K AdpType=Post 3 case 3:case _
5 rôômalt rõõmsalt ADV D Typo=Yes 3 advmod 5.1:advmod Orphan=Yes|CorrectForm=rõõmsalt
5.1 panna panema VERB V VerbForm=Inf _ _ 0:root Empty=5.1
6 maasikatele maasikas NOUN S Case=All|Number=Plur 3 obl 5.1:obl Orphan=Yes
7 ... ... PUNCT Z _ 3 punct 5.1:punct _
""".strip()
def test_convert_networkx_graph():
doc = CoNLL.conll2doc(input_str=ESTONIAN_EMPTY_DEPS, ignore_gapping=False)
deps = doc.sentences[0]._enhanced_dependencies
graph = DependencyGraph()
java_protobuf_requests.convert_networkx_graph(graph, doc.sentences[0], 0)
assert len(graph.rootNode) == 1
assert graph.rootNode[0] == 0
nodes = sorted([(x.index, x.emptyIndex) for x in graph.node])
expected_nodes = [(1,0), (2,0), (3,0), (4,0), (5,0), (5,1), (6,0), (7,0)]
assert nodes == expected_nodes
edges = [(x.target, x.dep) for x in graph.edge if x.source == 5 and x.sourceEmpty == 1]
edges = sorted(edges)
expected_edges = [(1, 'cc'), (3, 'obl'), (5, 'advmod'), (6, 'obl'), (7, 'punct')]
assert edges == expected_edges
ENGLISH_NBSP_SAMPLE="""
# sent_id = newsgroup-groups.google.com_n3td3v_e874a1e5eb995654_ENG_20060120_052200-0011
# text = Please note that neither the e-mail address nor name of the sender have been verified.
1 Please please INTJ UH _ 2 discourse _ _
2 note note VERB VB Mood=Imp|VerbForm=Fin 0 root _ _
3 that that SCONJ IN _ 15 mark _ _
4 neither neither CCONJ CC _ 7 cc:preconj _ _
5 the the DET DT Definite=Def|PronType=Art 7 det _ _
6 e-mail e-mail NOUN NN Number=Sing 7 compound _ _
7 address address NOUN NN Number=Sing 15 nsubj:pass _ _
8 nor nor CCONJ CC _ 9 cc _ _
9 name name NOUN NN Number=Sing 7 conj _ _
10 of of ADP IN _ 12 case _ _
11 the the DET DT Definite=Def|PronType=Art 12 det _ _
12 sender sender NOUN NN Number=Sing 7 nmod _ _
13 have have AUX VBP Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 15 aux _ SpacesAfter=\\u00A0
14 been be AUX VBN Tense=Past|VerbForm=Part 15 aux:pass _ _
15 verified verify VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 2 ccomp _ SpaceAfter=No
16 . . PUNCT . _ 2 punct _ _
""".strip()
def test_nbsp_doc():
"""
Test that the space conversion methods will convert to and from NBSP
"""
doc = CoNLL.conll2doc(input_str=ENGLISH_NBSP_SAMPLE)
assert doc.sentences[0].text == "Please note that neither the e-mail address nor name of the sender have been verified."
assert doc.sentences[0].tokens[12].spaces_after == " "
assert misc_to_space_after("SpacesAfter=\\u00A0") == ' '
assert space_after_to_misc(' ') == "SpacesAfter=\\u00A0"
conllu = "{:C}".format(doc)
assert conllu == ENGLISH_NBSP_SAMPLE