"""
Basic tests of the data conversion
"""

import io
import pytest
import tempfile
from zipfile import ZipFile

import stanza
from stanza.utils.conll import CoNLL
from stanza.models.common.doc import Document
from stanza.tests import *

pytestmark = pytest.mark.pipeline

# data for testing
CONLL = [[['1', 'Nous', 'il', 'PRON', '_', 'Number=Plur|Person=1|PronType=Prs', '3', 'nsubj', '_', 'start_char=0|end_char=4'],
          ['2', 'avons', 'avoir', 'AUX', '_', 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', '3', 'aux:tense', '_', 'start_char=5|end_char=10'],
          ['3', 'atteint', 'atteindre', 'VERB', '_', 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', '0', 'root', '_', 'start_char=11|end_char=18'],
          ['4', 'la', 'le', 'DET', '_', 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', '5', 'det', '_', 'start_char=19|end_char=21'],
          ['5', 'fin', 'fin', 'NOUN', '_', 'Gender=Fem|Number=Sing', '3', 'obj', '_', 'start_char=22|end_char=25'],
          ['6-7', 'du', '_', '_', '_', '_', '_', '_', '_', 'start_char=26|end_char=28'],
          ['6', 'de', 'de', 'ADP', '_', '_', '8', 'case', '_', '_'],
          ['7', 'le', 'le', 'DET', '_', 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', '8', 'det', '_', '_'],
          ['8', 'sentier', 'sentier', 'NOUN', '_', 'Gender=Masc|Number=Sing', '5', 'nmod', '_', 'start_char=29|end_char=36'],
          ['9', '.', '.', 'PUNCT', '_', '_', '3', 'punct', '_', 'start_char=36|end_char=37']]]


DICT = [[{'id': (1,), 'text': 'Nous', 'lemma': 'il', 'upos': 'PRON', 'feats': 'Number=Plur|Person=1|PronType=Prs', 'head': 3, 'deprel': 'nsubj', 'misc': 'start_char=0|end_char=4'},
         {'id': (2,), 'text': 'avons', 'lemma': 'avoir', 'upos': 'AUX', 'feats': 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', 'head': 3, 'deprel': 'aux:tense', 'misc': 'start_char=5|end_char=10'},
         {'id': (3,), 'text': 'atteint', 'lemma': 'atteindre', 'upos': 'VERB', 'feats': 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', 'head': 0, 'deprel': 'root', 'misc': 'start_char=11|end_char=18'},
         {'id': (4,), 'text': 'la', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', 'head': 5, 'deprel': 'det', 'misc': 'start_char=19|end_char=21'},
         {'id': (5,), 'text': 'fin', 'lemma': 'fin', 'upos': 'NOUN', 'feats': 'Gender=Fem|Number=Sing', 'head': 3, 'deprel': 'obj', 'misc': 'start_char=22|end_char=25'},
         {'id': (6, 7), 'text': 'du', 'misc': 'start_char=26|end_char=28'},
         {'id': (6,), 'text': 'de', 'lemma': 'de', 'upos': 'ADP', 'head': 8, 'deprel': 'case'},
         {'id': (7,), 'text': 'le', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', 'head': 8, 'deprel': 'det'},
         {'id': (8,), 'text': 'sentier', 'lemma': 'sentier', 'upos': 'NOUN', 'feats': 'Gender=Masc|Number=Sing', 'head': 5, 'deprel': 'nmod', 'misc': 'start_char=29|end_char=36'},
         {'id': (9,), 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'head': 3, 'deprel': 'punct', 'misc': 'start_char=36|end_char=37'}]]

def test_conll_to_dict():
    dicts, empty = CoNLL.convert_conll(CONLL)
    assert dicts == DICT
    assert len(dicts) == len(empty)
    assert all(len(x) == 0 for x in empty)

def test_dict_to_conll():
    document = Document(DICT)
    # :c = no comments
    conll = [[sentence.split("\t") for sentence in doc.split("\n")] for doc in "{:c}".format(document).split("\n\n")]
    assert conll == CONLL

def test_dict_to_doc_and_doc_to_dict():
    """
    Test the conversion from raw dict to Document and back

    This code path will first turn start_char|end_char into start_char & end_char fields in the Document
    That version to a dict will have separate fields for each of those
    Finally, the conversion from that dict to a list of conll entries should convert that back to misc
    """
    document = Document(DICT)
    dicts = document.to_dict()
    document = Document(dicts)
    conll = [[sentence.split("\t") for sentence in doc.split("\n")] for doc in "{:c}".format(document).split("\n\n")]
    assert conll == CONLL

# sample is two sentences long so that the tests check multiple sentences
RUSSIAN_SAMPLE="""
# sent_id = yandex.reviews-f-8xh5zqnmwak3t6p68y4rhwd4e0-1969-9253
# genre = review
# text = Как- то слишком мало цветов получают актёры после спектакля.
1	Как	как-то	ADV	_	Degree=Pos|PronType=Ind	7	advmod	_	SpaceAfter=No
2	-	-	PUNCT	_	_	3	punct	_	_
3	то	то	PART	_	_	1	list	_	deprel=list:goeswith
4	слишком	слишком	ADV	_	Degree=Pos	5	advmod	_	_
5	мало	мало	ADV	_	Degree=Pos	6	advmod	_	_
6	цветов	цветок	NOUN	_	Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur	7	obj	_	_
7	получают	получать	VERB	_	Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act	0	root	_	_
8	актёры	актер	NOUN	_	Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur	7	nsubj	_	_
9	после	после	ADP	_	_	10	case	_	_
10	спектакля	спектакль	NOUN	_	Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing	7	obl	_	SpaceAfter=No
11	.	.	PUNCT	_	_	7	punct	_	_

# sent_id = 4
# genre = social
# text = В женщине важна верность, а не красота.
1	В	в	ADP	_	_	2	case	_	_
2	женщине	женщина	NOUN	_	Animacy=Anim|Case=Loc|Gender=Fem|Number=Sing	3	obl	_	_
3	важна	важный	ADJ	_	Degree=Pos|Gender=Fem|Number=Sing|Variant=Short	0	root	_	_
4	верность	верность	NOUN	_	Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing	3	nsubj	_	SpaceAfter=No
5	,	,	PUNCT	_	_	8	punct	_	_
6	а	а	CCONJ	_	_	8	cc	_	_
7	не	не	PART	_	Polarity=Neg	8	advmod	_	_
8	красота	красота	NOUN	_	Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing	4	conj	_	SpaceAfter=No
9	.	.	PUNCT	_	_	3	punct	_	_
""".strip()

RUSSIAN_TEXT = ["Как- то слишком мало цветов получают актёры после спектакля.", "В женщине важна верность, а не красота."]
RUSSIAN_IDS = ["yandex.reviews-f-8xh5zqnmwak3t6p68y4rhwd4e0-1969-9253", "4"]

def check_russian_doc(doc):
    """
    Refactored the test for the Russian doc so we can use it to test various file methods
    """
    lines = RUSSIAN_SAMPLE.split("\n")
    assert len(doc.sentences) == 2
    assert lines[0] == doc.sentences[0].comments[0]
    assert lines[1] == doc.sentences[0].comments[1]
    assert lines[2] == doc.sentences[0].comments[2]
    for sent_idx, (expected_text, expected_id, sentence) in enumerate(zip(RUSSIAN_TEXT, RUSSIAN_IDS, doc.sentences)):
        assert expected_text == sentence.text
        assert expected_id == sentence.sent_id
        assert sent_idx == sentence.index
        assert len(sentence.comments) == 3
        assert not sentence.has_enhanced_dependencies()

    sentences = "{:C}".format(doc)
    sentences = sentences.split("\n\n")
    assert len(sentences) == 2

    sentence = sentences[0].split("\n")
    assert len(sentence) == 14
    assert lines[0] == sentence[0]
    assert lines[1] == sentence[1]
    assert lines[2] == sentence[2]

    # assert that the weird deprel=list:goeswith was properly handled
    assert doc.sentences[0].words[2].head == 1
    assert doc.sentences[0].words[2].deprel == "list:goeswith"

def test_write_russian_doc(tmp_path):
    """
    Specifically test the write_doc2conll method
    """
    filename = tmp_path / "russian.conll"
    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    check_russian_doc(doc)
    CoNLL.write_doc2conll(doc, filename)

    with open(filename, encoding="utf-8") as fin:
        text = fin.read()

    # the conll docs have to end with \n\n
    assert text.endswith("\n\n")

    # but to compare against the original, strip off the whitespace
    text = text.strip()

    # we skip the first sentence because the "deprel=list:goeswith" is weird
    # note that the deprel itself is checked in check_russian_doc
    text = text[text.find("# sent_id = 4"):]
    sample = RUSSIAN_SAMPLE[RUSSIAN_SAMPLE.find("# sent_id = 4"):]
    assert text == sample

    doc2 = CoNLL.conll2doc(filename)
    check_russian_doc(doc2)

# random sentence from EN_Pronouns
ENGLISH_SAMPLE = """
# newdoc
# sent_id = 1
# text = It is hers.
# previous = Which person owns this?
# comment = copular subject
1	It	it	PRON	PRP	Number=Sing|Person=3|PronType=Prs	3	nsubj	_	_
2	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	cop	_	_
3	hers	hers	PRON	PRP	Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs	0	root	_	SpaceAfter=No
4	.	.	PUNCT	.	_	3	punct	_	_
""".strip()

def test_write_to_io():
    doc = CoNLL.conll2doc(input_str=ENGLISH_SAMPLE)
    output = io.StringIO()
    CoNLL.write_doc2conll(doc, output)
    output_value = output.getvalue()
    assert output_value.endswith("\n\n")
    assert output_value.strip() == ENGLISH_SAMPLE

def test_write_doc2conll_append(tmp_path):
    doc = CoNLL.conll2doc(input_str=ENGLISH_SAMPLE)
    filename = tmp_path / "english.conll"
    CoNLL.write_doc2conll(doc, filename)
    CoNLL.write_doc2conll(doc, filename, mode="a")

    with open(filename) as fin:
        text = fin.read()
    expected = ENGLISH_SAMPLE + "\n\n" + ENGLISH_SAMPLE + "\n\n"
    assert text == expected

def test_doc_with_comments():
    """
    Test that a doc with comments gets converted back with comments
    """
    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    check_russian_doc(doc)

def test_unusual_misc():
    """
    The above RUSSIAN_SAMPLE resulted in a blank misc field in one particular implementation of the conll code
    (the below test would fail)
    """
    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    sentences = "{:C}".format(doc).split("\n\n")
    assert len(sentences) == 2
    sentence = sentences[0].split("\n")
    assert len(sentence) == 14

    for word in sentence:
        pieces = word.split("\t")
        assert len(pieces) == 1 or len(pieces) == 10
        if len(pieces) == 10:
            assert all(piece for piece in pieces)

def test_file():
    """
    Test loading a doc from a file
    """
    with tempfile.TemporaryDirectory() as tempdir:
        filename = os.path.join(tempdir, "russian.conll")
        with open(filename, "w", encoding="utf-8") as fout:
            fout.write(RUSSIAN_SAMPLE)
        doc = CoNLL.conll2doc(input_file=filename)
        check_russian_doc(doc)

def test_zip_file():
    """
    Test loading a doc from a zip file
    """
    with tempfile.TemporaryDirectory() as tempdir:
        zip_file = os.path.join(tempdir, "russian.zip")
        filename = "russian.conll"
        with ZipFile(zip_file, "w") as zout:
            with zout.open(filename, "w") as fout:
                fout.write(RUSSIAN_SAMPLE.encode())

        doc = CoNLL.conll2doc(input_file=filename, zip_file=zip_file)
        check_russian_doc(doc)

SIMPLE_NER = """
# text = Teferi's best friend is Karn
# sent_id = 0
1	Teferi	_	_	_	_	0	_	_	start_char=0|end_char=6|ner=S-PERSON
2	's	_	_	_	_	1	_	_	start_char=6|end_char=8|ner=O
3	best	_	_	_	_	2	_	_	start_char=9|end_char=13|ner=O
4	friend	_	_	_	_	3	_	_	start_char=14|end_char=20|ner=O
5	is	_	_	_	_	4	_	_	start_char=21|end_char=23|ner=O
6	Karn	_	_	_	_	5	_	_	start_char=24|end_char=28|ner=S-PERSON
""".strip()

def test_simple_ner_conversion():
    """
    Test that tokens get properly created with NER tags
    """
    doc = CoNLL.conll2doc(input_str=SIMPLE_NER)
    assert len(doc.sentences) == 1
    sentence = doc.sentences[0]
    assert len(sentence.tokens) == 6
    EXPECTED_NER = ["S-PERSON", "O", "O", "O", "O", "S-PERSON"]
    for token, ner in zip(sentence.tokens, EXPECTED_NER):
        assert token.ner == ner
        # check that the ner, start_char, end_char fields were not put on the token's misc
        # those should all be set as specific fields on the token
        assert not token.misc
        assert len(token.words) == 1
        # they should also not reach the word's misc field
        assert not token.words[0].misc

    conll = "{:C}".format(doc)
    assert conll == SIMPLE_NER

MWT_NER = """
# text = This makes John's headache worse
# sent_id = 0
1	This	_	_	_	_	0	_	_	start_char=0|end_char=4|ner=O
2	makes	_	_	_	_	1	_	_	start_char=5|end_char=10|ner=O
3-4	John's	_	_	_	_	_	_	_	start_char=11|end_char=17|ner=S-PERSON
3	John	_	_	_	_	2	_	_	_
4	's	_	_	_	_	3	_	_	_
5	headache	_	_	_	_	4	_	_	start_char=18|end_char=26|ner=O
6	worse	_	_	_	_	5	_	_	start_char=27|end_char=32|ner=O
""".strip()

def test_mwt_ner_conversion():
    """
    Test that tokens including MWT get properly created with NER tags

    Note that this kind of thing happens with the EWT tokenizer for English, for example
    """
    doc = CoNLL.conll2doc(input_str=MWT_NER)
    assert len(doc.sentences) == 1
    sentence = doc.sentences[0]
    assert len(sentence.tokens) == 5
    assert not sentence.has_enhanced_dependencies()
    EXPECTED_NER = ["O", "O", "S-PERSON", "O", "O"]
    EXPECTED_WORDS = [1, 1, 2, 1, 1]
    for token, ner, expected_words in zip(sentence.tokens, EXPECTED_NER, EXPECTED_WORDS):
        assert token.ner == ner
        # check that the ner, start_char, end_char fields were not put on the token's misc
        # those should all be set as specific fields on the token
        assert not token.misc
        assert len(token.words) == expected_words
        # they should also not reach the word's misc field
        assert not token.words[0].misc

    conll = "{:C}".format(doc)
    assert conll == MWT_NER

ALL_OFFSETS_CONLLU = """
# text = This makes John's headache worse
# sent_id = 0
1	This	_	_	_	_	0	_	_	start_char=0|end_char=4
2	makes	_	_	_	_	1	_	_	start_char=5|end_char=10
3-4	John's	_	_	_	_	_	_	_	start_char=11|end_char=17
3	John	_	_	_	_	2	_	_	start_char=11|end_char=15
4	's	_	_	_	_	3	_	_	start_char=15|end_char=17
5	headache	_	_	_	_	4	_	_	start_char=18|end_char=26
6	worse	_	_	_	_	5	_	_	SpaceAfter=No|start_char=27|end_char=32
""".strip()

NO_OFFSETS_CONLLU = """
# text = This makes John's headache worse
# sent_id = 0
1	This	_	_	_	_	0	_	_	_
2	makes	_	_	_	_	1	_	_	_
3-4	John's	_	_	_	_	_	_	_	_
3	John	_	_	_	_	2	_	_	_
4	's	_	_	_	_	3	_	_	_
5	headache	_	_	_	_	4	_	_	_
6	worse	_	_	_	_	5	_	_	SpaceAfter=No
""".strip()

NO_COMMENTS_NO_OFFSETS_CONLLU = """
1	This	_	_	_	_	0	_	_	_
2	makes	_	_	_	_	1	_	_	_
3-4	John's	_	_	_	_	_	_	_	_
3	John	_	_	_	_	2	_	_	_
4	's	_	_	_	_	3	_	_	_
5	headache	_	_	_	_	4	_	_	_
6	worse	_	_	_	_	5	_	_	SpaceAfter=No
""".strip()


def test_no_offsets_output():
    doc = CoNLL.conll2doc(input_str=ALL_OFFSETS_CONLLU)
    assert len(doc.sentences) == 1
    sentence = doc.sentences[0]
    assert len(sentence.tokens) == 5

    conll = "{:C}".format(doc)
    assert conll == ALL_OFFSETS_CONLLU

    conll = "{:C-o}".format(doc)
    assert conll == NO_OFFSETS_CONLLU

    conll = "{:c-o}".format(doc)
    assert conll == NO_COMMENTS_NO_OFFSETS_CONLLU

# A random sentence from et_ewt-ud-train.conllu
# which we use to test the deps conversion for multiple deps
ESTONIAN_DEPS = """
# newpar
# sent_id = aia_foorum_37
# text = Sestpeale ei mõistagi neid, kes koduaias sortidega tegelevad.
1	Sestpeale	sest_peale	ADV	D	_	3	advmod	3:advmod	_
2	ei	ei	AUX	V	Polarity=Neg	3	aux	3:aux	_
3	mõistagi	mõistma	VERB	V	Connegative=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act	0	root	0:root	_
4	neid	tema	PRON	P	Case=Par|Number=Plur|Person=3|PronType=Prs	3	obj	3:obj|9:nsubj	SpaceAfter=No
5	,	,	PUNCT	Z	_	9	punct	9:punct	_
6	kes	kes	PRON	P	Case=Nom|Number=Plur|PronType=Int,Rel	9	nsubj	4:ref	_
7	koduaias	kodu_aed	NOUN	S	Case=Ine|Number=Sing	9	obl	9:obl	_
8	sortidega	sort	NOUN	S	Case=Com|Number=Plur	9	obl	9:obl	_
9	tegelevad	tegelema	VERB	V	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act	4	acl:relcl	4:acl	SpaceAfter=No
10	.	.	PUNCT	Z	_	3	punct	3:punct	_
""".strip()

def test_deps_conversion():
    doc = CoNLL.conll2doc(input_str=ESTONIAN_DEPS)
    assert len(doc.sentences) == 1
    sentence = doc.sentences[0]
    assert len(sentence.tokens) == 10
    assert sentence.has_enhanced_dependencies()

    word = doc.sentences[0].words[3]
    assert word.deps == "3:obj|9:nsubj"

    conll = "{:C}".format(doc)
    assert conll == ESTONIAN_DEPS

ESTONIAN_EMPTY_DEPS = """
# sent_id = ewtb2_000035_15
# text = Ja paari aasta pärast rôômalt maasikatele ...
1	Ja	ja	CCONJ	J	_	3	cc	5.1:cc	_
2	paari	paar	NUM	N	Case=Gen|Number=Sing|NumForm=Word|NumType=Card	3	nummod	3:nummod	_
3	aasta	aasta	NOUN	S	Case=Gen|Number=Sing	0	root	5.1:obl	_
4	pärast	pärast	ADP	K	AdpType=Post	3	case	3:case	_
5	rôômalt	rõõmsalt	ADV	D	Typo=Yes	3	advmod	5.1:advmod	Orphan=Yes|CorrectForm=rõõmsalt
5.1	panna	panema	VERB	V	VerbForm=Inf	_	_	0:root	Empty=5.1
6	maasikatele	maasikas	NOUN	S	Case=All|Number=Plur	3	obl	5.1:obl	Orphan=Yes
7	...	...	PUNCT	Z	_	3	punct	5.1:punct	_
""".strip()

ESTONIAN_EMPTY_END_DEPS = """
# sent_id = ewtb2_000035_15
# text = Ja paari aasta pärast rôômalt maasikatele ...
1	Ja	ja	CCONJ	J	_	3	cc	5.1:cc	_
2	paari	paar	NUM	N	Case=Gen|Number=Sing|NumForm=Word|NumType=Card	3	nummod	3:nummod	_
3	aasta	aasta	NOUN	S	Case=Gen|Number=Sing	0	root	5.1:obl	_
4	pärast	pärast	ADP	K	AdpType=Post	3	case	3:case	_
5	rôômalt	rõõmsalt	ADV	D	Typo=Yes	3	advmod	5.1:advmod	Orphan=Yes|CorrectForm=rõõmsalt
5.1	panna	panema	VERB	V	VerbForm=Inf	_	_	0:root	Empty=5.1
""".strip()

def test_empty_deps_conversion():
    """
    Check that we can read and then output a sentence with empty dependencies
    """
    check_empty_deps_conversion(ESTONIAN_EMPTY_DEPS, 7)

def test_empty_deps_at_end_conversion():
    """
    The empty deps conversion should also work if the empty dep is at the end
    """
    check_empty_deps_conversion(ESTONIAN_EMPTY_END_DEPS, 5)

def check_empty_deps_conversion(input_str, expected_words):
    doc = CoNLL.conll2doc(input_str=input_str, ignore_gapping=False)
    assert len(doc.sentences) == 1
    assert len(doc.sentences[0].tokens) == expected_words
    assert len(doc.sentences[0].words) == expected_words
    assert len(doc.sentences[0].empty_words) == 1

    sentence = doc.sentences[0]
    conll = "{:C}".format(doc)
    assert conll == input_str

    sentence_dict = doc.sentences[0].to_dict()
    assert len(sentence_dict) == expected_words + 1
    # currently this is true for both of the examples we run
    assert sentence_dict[5]['id'] == (5, 1)

    # redo the above checks to make sure
    # there are no weird bugs in the accessors
    assert len(doc.sentences) == 1
    assert len(doc.sentences[0].tokens) == expected_words
    assert len(doc.sentences[0].words) == expected_words
    assert len(doc.sentences[0].empty_words) == 1


ESTONIAN_DOC_ID = """
# doc_id = this_is_a_doc
# sent_id = ewtb2_000035_15
# text = Ja paari aasta pärast rôômalt maasikatele ...
1	Ja	ja	CCONJ	J	_	3	cc	5.1:cc	_
2	paari	paar	NUM	N	Case=Gen|Number=Sing|NumForm=Word|NumType=Card	3	nummod	3:nummod	_
3	aasta	aasta	NOUN	S	Case=Gen|Number=Sing	0	root	5.1:obl	_
4	pärast	pärast	ADP	K	AdpType=Post	3	case	3:case	_
5	rôômalt	rõõmsalt	ADV	D	Typo=Yes	3	advmod	5.1:advmod	Orphan=Yes|CorrectForm=rõõmsalt
5.1	panna	panema	VERB	V	VerbForm=Inf	_	_	0:root	Empty=5.1
6	maasikatele	maasikas	NOUN	S	Case=All|Number=Plur	3	obl	5.1:obl	Orphan=Yes
7	...	...	PUNCT	Z	_	3	punct	5.1:punct	_
""".strip()

def test_read_doc_id():
    doc = CoNLL.conll2doc(input_str=ESTONIAN_DOC_ID, ignore_gapping=False)
    assert "{:C}".format(doc) == ESTONIAN_DOC_ID
    assert doc.sentences[0].doc_id == 'this_is_a_doc'

SIMPLE_DEPENDENCY_INDEX_ERROR = """
# text = Teferi's best friend is Karn
# sent_id = 0
# notes = this sentence has a dependency index outside the sentence.  it should throw an IndexError
1	Teferi	_	_	_	_	0	root	_	start_char=0|end_char=6|ner=S-PERSON
2	's	_	_	_	_	1	dep	_	start_char=6|end_char=8|ner=O
3	best	_	_	_	_	2	dep	_	start_char=9|end_char=13|ner=O
4	friend	_	_	_	_	3	dep	_	start_char=14|end_char=20|ner=O
5	is	_	_	_	_	4	dep	_	start_char=21|end_char=23|ner=O
6	Karn	_	_	_	_	8	dep	_	start_char=24|end_char=28|ner=S-PERSON
""".strip()

def test_read_dependency_errors():
    with pytest.raises(IndexError):
        doc = CoNLL.conll2doc(input_str=SIMPLE_DEPENDENCY_INDEX_ERROR)

MULTIPLE_DOC_IDS = """
# doc_id = doc_1
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0020
# text = His mother was also killed in the attack.
1	His	his	PRON	PRP$	Case=Gen|Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs	2	nmod:poss	2:nmod:poss	_
2	mother	mother	NOUN	NN	Number=Sing	5	nsubj:pass	5:nsubj:pass	_
3	was	be	AUX	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	5	aux:pass	5:aux:pass	_
4	also	also	ADV	RB	_	5	advmod	5:advmod	_
5	killed	kill	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
6	in	in	ADP	IN	_	8	case	8:case	_
7	the	the	DET	DT	Definite=Def|PronType=Art	8	det	8:det	_
8	attack	attack	NOUN	NN	Number=Sing	5	obl	5:obl:in	SpaceAfter=No
9	.	.	PUNCT	.	_	5	punct	5:punct	_

# doc_id = doc_1
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0028
# text = This item is a small one and easily missed.
1	This	this	DET	DT	Number=Sing|PronType=Dem	2	det	2:det	_
2	item	item	NOUN	NN	Number=Sing	6	nsubj	6:nsubj|9:nsubj:pass	_
3	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	6	cop	6:cop	_
4	a	a	DET	DT	Definite=Ind|PronType=Art	6	det	6:det	_
5	small	small	ADJ	JJ	Degree=Pos	6	amod	6:amod	_
6	one	one	NOUN	NN	Number=Sing	0	root	0:root	_
7	and	and	CCONJ	CC	_	9	cc	9:cc	_
8	easily	easily	ADV	RB	_	9	advmod	9:advmod	_
9	missed	miss	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	6	conj	6:conj:and	SpaceAfter=No
10	.	.	PUNCT	.	_	6	punct	6:punct	_

# doc_id = doc_2
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0029
# text = But in my view it is highly significant.
1	But	but	CCONJ	CC	_	8	cc	8:cc	_
2	in	in	ADP	IN	_	4	case	4:case	_
3	my	my	PRON	PRP$	Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs	4	nmod:poss	4:nmod:poss	_
4	view	view	NOUN	NN	Number=Sing	8	obl	8:obl:in	_
5	it	it	PRON	PRP	Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs	8	nsubj	8:nsubj	_
6	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	8	cop	8:cop	_
7	highly	highly	ADV	RB	_	8	advmod	8:advmod	_
8	significant	significant	ADJ	JJ	Degree=Pos	0	root	0:root	SpaceAfter=No
9	.	.	PUNCT	.	_	8	punct	8:punct	_

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0040
# text = The trial begins again Nov.28.
1	The	the	DET	DT	Definite=Def|PronType=Art	2	det	2:det	_
2	trial	trial	NOUN	NN	Number=Sing	3	nsubj	3:nsubj	_
3	begins	begin	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	0:root	_
4	again	again	ADV	RB	_	3	advmod	3:advmod	_
5	Nov.	November	PROPN	NNP	Abbr=Yes|Number=Sing	3	obl:tmod	3:obl:tmod	SpaceAfter=No
6	28	28	NUM	CD	NumForm=Digit|NumType=Card	5	nummod	5:nummod	SpaceAfter=No
7	.	.	PUNCT	.	_	3	punct	3:punct	_

""".lstrip()

def test_read_multiple_doc_ids():
    docs = CoNLL.conll2multi_docs(input_str=MULTIPLE_DOC_IDS)
    assert len(docs) == 2
    assert len(docs[0].sentences) == 2
    assert len(docs[1].sentences) == 2

    # remove the first doc_id comment
    text = "\n".join(MULTIPLE_DOC_IDS.split("\n")[1:])
    docs = CoNLL.conll2multi_docs(input_str=text)
    assert len(docs) == 3
    assert len(docs[0].sentences) == 1
    assert len(docs[1].sentences) == 1
    assert len(docs[2].sentences) == 2

ENGLISH_TEST_SENTENCE = """
# text = This is a test
# sent_id = 0
1	This	this	PRON	DT	Number=Sing|PronType=Dem	4	nsubj	_	start_char=0|end_char=4
2	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	4	cop	_	start_char=5|end_char=7
3	a	a	DET	DT	Definite=Ind|PronType=Art	4	det	_	start_char=8|end_char=9
4	test	test	NOUN	NN	Number=Sing	0	root	_	SpaceAfter=No|start_char=10|end_char=14
""".lstrip()

def test_convert_dict():
    doc = CoNLL.conll2doc(input_str=ENGLISH_TEST_SENTENCE)
    converted = CoNLL.convert_dict(doc.to_dict())

    expected = [[['1', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '4', 'nsubj', '_', 'start_char=0|end_char=4'],
                 ['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', 'start_char=5|end_char=7'],
                 ['3', 'a', 'a', 'DET', 'DT', 'Definite=Ind|PronType=Art', '4', 'det', '_', 'start_char=8|end_char=9'],
                 ['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14']]]

    assert converted == expected

def test_line_numbers():
    doc = CoNLL.conll2doc(input_str=ENGLISH_TEST_SENTENCE, keep_line_numbers=True)
    # currently the line numbers are not output in the conllu format
    doc_conllu = "{:C}\n".format(doc)
    assert doc_conllu == ENGLISH_TEST_SENTENCE

    # currently the line numbers are not output in the dict format
    converted = CoNLL.convert_dict(doc.to_dict())
    expected = [[['1', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '4', 'nsubj', '_', 'start_char=0|end_char=4'],
                 ['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', 'start_char=5|end_char=7'],
                 ['3', 'a', 'a', 'DET', 'DT', 'Definite=Ind|PronType=Art', '4', 'det', '_', 'start_char=8|end_char=9'],
                 ['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14']]]
    assert converted == expected

    for word_idx, word in enumerate(doc.sentences[0].words):
        # the test sentence has two comments in it
        assert word.line_number == word_idx + 2