| import tempfile | |
| import pytest | |
| from stanza.models.common.utils import misc_to_space_after, space_after_to_misc | |
| from stanza.models.constituency import tree_reader | |
| from stanza.server import java_protobuf_requests | |
| from stanza.tests import * | |
| from stanza.utils.conll import CoNLL | |
| from stanza.protobuf import DependencyGraph | |
# Module-level pytest marks: every test in this file is tagged `travis` and `pipeline`
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
def check_tree(proto_tree, py_tree, py_score):
    """
    Decode proto_tree with from_tree and assert the result matches the python side

    The score is compared first, then the tree itself.
    """
    decoded_tree, decoded_score = java_protobuf_requests.from_tree(proto_tree)
    assert decoded_score == py_score
    assert decoded_tree == py_tree
def test_build_tree():
    """
    Round trip each parsed tree through build_tree and from_tree

    Two bracketed trees are read from text; each one is converted to
    its protobuf form with a score of 1.0 and then checked against
    the tree it was built from.
    """
    text="((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))\n( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    trees = tree_reader.read_trees(text)
    assert len(trees) == 2

    for tree in trees:
        # convert the tree currently being iterated, not always the
        # first one, so that every parsed tree is actually exercised
        proto_tree = java_protobuf_requests.build_tree(tree, 1.0)
        check_tree(proto_tree, tree, 1.0)
# CoNLL-U sentence with an empty node (5.1) and enhanced dependencies.
# Token rows have 10 columns which must be TAB separated; the tabs are
# written as explicit \t escapes so that editors or whitespace
# normalization cannot silently turn them into spaces.
ESTONIAN_EMPTY_DEPS = """
# sent_id = ewtb2_000035_15
# text = Ja paari aasta pärast rôômalt maasikatele ...
1\tJa\tja\tCCONJ\tJ\t_\t3\tcc\t5.1:cc\t_
2\tpaari\tpaar\tNUM\tN\tCase=Gen|Number=Sing|NumForm=Word|NumType=Card\t3\tnummod\t3:nummod\t_
3\taasta\taasta\tNOUN\tS\tCase=Gen|Number=Sing\t0\troot\t5.1:obl\t_
4\tpärast\tpärast\tADP\tK\tAdpType=Post\t3\tcase\t3:case\t_
5\trôômalt\trõõmsalt\tADV\tD\tTypo=Yes\t3\tadvmod\t5.1:advmod\tOrphan=Yes|CorrectForm=rõõmsalt
5.1\tpanna\tpanema\tVERB\tV\tVerbForm=Inf\t_\t_\t0:root\tEmpty=5.1
6\tmaasikatele\tmaasikas\tNOUN\tS\tCase=All|Number=Plur\t3\tobl\t5.1:obl\tOrphan=Yes
7\t...\t...\tPUNCT\tZ\t_\t3\tpunct\t5.1:punct\t_
""".strip()
def test_convert_networkx_graph():
    """
    Convert a sentence containing an empty node to a DependencyGraph

    The Estonian sample has an empty node 5.1.  After conversion, the
    test checks the root list, the (index, emptyIndex) pair of every
    node, and the edges whose source is the empty node.
    """
    doc = CoNLL.conll2doc(input_str=ESTONIAN_EMPTY_DEPS, ignore_gapping=False)
    sentence = doc.sentences[0]

    graph = DependencyGraph()
    java_protobuf_requests.convert_networkx_graph(graph, sentence, 0)

    assert len(graph.rootNode) == 1
    assert graph.rootNode[0] == 0

    # regular words have emptyIndex 0; the empty node 5.1 shows up as (5, 1)
    nodes = sorted((x.index, x.emptyIndex) for x in graph.node)
    expected_nodes = [(1,0), (2,0), (3,0), (4,0), (5,0), (5,1), (6,0), (7,0)]
    assert nodes == expected_nodes

    # keep only the edges which start at the empty node 5.1
    edges = sorted((x.target, x.dep) for x in graph.edge if x.source == 5 and x.sourceEmpty == 1)
    expected_edges = [(1, 'cc'), (3, 'obl'), (5, 'advmod'), (6, 'obl'), (7, 'punct')]
    assert edges == expected_edges
# CoNLL-U sentence in which token 13 ("have") is followed by a
# non-breaking space, recorded in MISC as SpacesAfter=\u00A0.
# Token rows have 10 columns which must be TAB separated; the tabs and
# the NBSP in the "# text" line are written as explicit escapes so that
# editors or whitespace normalization cannot silently replace them.
ENGLISH_NBSP_SAMPLE="""
# sent_id = newsgroup-groups.google.com_n3td3v_e874a1e5eb995654_ENG_20060120_052200-0011
# text = Please note that neither the e-mail address nor name of the sender have\u00A0been verified.
1\tPlease\tplease\tINTJ\tUH\t_\t2\tdiscourse\t_\t_
2\tnote\tnote\tVERB\tVB\tMood=Imp|VerbForm=Fin\t0\troot\t_\t_
3\tthat\tthat\tSCONJ\tIN\t_\t15\tmark\t_\t_
4\tneither\tneither\tCCONJ\tCC\t_\t7\tcc:preconj\t_\t_
5\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t7\tdet\t_\t_
6\te-mail\te-mail\tNOUN\tNN\tNumber=Sing\t7\tcompound\t_\t_
7\taddress\taddress\tNOUN\tNN\tNumber=Sing\t15\tnsubj:pass\t_\t_
8\tnor\tnor\tCCONJ\tCC\t_\t9\tcc\t_\t_
9\tname\tname\tNOUN\tNN\tNumber=Sing\t7\tconj\t_\t_
10\tof\tof\tADP\tIN\t_\t12\tcase\t_\t_
11\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t12\tdet\t_\t_
12\tsender\tsender\tNOUN\tNN\tNumber=Sing\t7\tnmod\t_\t_
13\thave\thave\tAUX\tVBP\tMood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin\t15\taux\t_\tSpacesAfter=\\u00A0
14\tbeen\tbe\tAUX\tVBN\tTense=Past|VerbForm=Part\t15\taux:pass\t_\t_
15\tverified\tverify\tVERB\tVBN\tTense=Past|VerbForm=Part|Voice=Pass\t2\tccomp\t_\tSpaceAfter=No
16\t.\t.\tPUNCT\t.\t_\t2\tpunct\t_\t_
""".strip()
def test_nbsp_doc():
    """
    Test that the space conversion methods will convert to and from NBSP

    The non-breaking space is always spelled as the escape \\u00A0 in
    this test.  A literal NBSP is indistinguishable from a regular
    space when reading the code and is easily lost when the file is
    reformatted, which would make the assertions check the wrong thing.
    """
    nbsp = "\u00A0"

    doc = CoNLL.conll2doc(input_str=ENGLISH_NBSP_SAMPLE)

    # token 13 ("have", index 12) carries SpacesAfter=\u00A0 in the sample,
    # so the NBSP is expected between "have" and "been"
    assert doc.sentences[0].text == "Please note that neither the e-mail address nor name of the sender have\u00A0been verified."
    assert doc.sentences[0].tokens[12].spaces_after == nbsp

    # the MISC column representation and the actual character convert both ways
    assert misc_to_space_after("SpacesAfter=\\u00A0") == nbsp
    assert space_after_to_misc(nbsp) == "SpacesAfter=\\u00A0"

    # writing the document back out as CoNLL-U reproduces the input exactly
    conllu = "{:C}".format(doc)
    assert conllu == ENGLISH_NBSP_SAMPLE