"""
Runs a few tests on the split_wikiner file
"""

import os
import tempfile

import pytest

from stanza.utils.datasets.ner import split_wikiner

from stanza.tests import *

# Module-level pytest markers: every test in this file is tagged with
# both the "pipeline" and "travis" marks
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

# A short passage from the Italian dataset, deliberately broken into
# 20 small chunks separated by blank lines so that there are enough
# "sentences" to test the splitting functionality.
# Each non-blank line is a word and its NER tag separated by a tab.
FBK_SAMPLE = """
Il	O
Papa	O
si	O
aggrava	O

Le	O
condizioni	O
di	O

Papa	O
Giovanni	PER
Paolo	PER
II	PER
si	O

sono	O
aggravate	O
in	O
il	O
corso	O

di	O
la	O
giornata	O
di	O
giovedì	O
.	O

Il	O
portavoce	O
Navarro	PER
Valls	PER

ha	O
dichiarato	O
che	O

il	O
Santo	O
Padre	O

in	O
la	O
giornata	O

di	O
oggi	O
è	O
stato	O

colpito	O
da	O
una	O
affezione	O

altamente	O
febbrile	O
provocata	O
da	O
una	O

infezione	O
documentata	O

di	O
le	O
vie	O
urinarie	O
.	O

A	O
il	O
momento	O

non	O
è	O
previsto	O
il	O
ricovero	O

a	O
il	O
Policlinico	LOC
Gemelli	LOC
,	O

come	O
ha	O
precisato	O
il	O

responsabile	O
di	O
il	O
dipartimento	O

di	O
emergenza	O
professor	O
Rodolfo	PER
Proietti	PER
.	O
"""


def test_read_sentences():
    """
    Checks that read_sentences recovers all 20 chunks of the sample

    The sample is written to a temporary file, read back, and then
    reassembled into tab / newline separated text which must be
    identical to the (stripped) original sample.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        raw_filename = os.path.join(tempdir, "raw.tsv")
        # The sample contains non-ASCII characters and is read back as
        # utf-8 below, so it must not be written with the platform
        # default encoding
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")
        assert len(sentences) == 20

        # words are rejoined with tabs, lines with newlines, and
        # sentences with a blank line, mirroring the input format
        text = "\n\n".join(
            "\n".join("\t".join(word) for word in sent)
            for sent in sentences
        )
        assert FBK_SAMPLE.strip() == text

def test_write_sentences():
    """
    Checks that sentences written by write_sentences_to_file can be read back unchanged

    The sample is read from one temporary file, written to a second
    file, and the second file is then read and compared with the
    sentences which were written.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        raw_filename = os.path.join(tempdir, "raw.tsv")
        # written as utf-8 to match the encoding used when reading it back
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        sentences = split_wikiner.read_sentences(raw_filename, "utf-8")
        copy_filename = os.path.join(tempdir, "copy.tsv")
        split_wikiner.write_sentences_to_file(sentences, copy_filename)

        # read the copy, not the raw file: re-reading the raw file
        # would pass even if nothing useful had been written
        sent2 = split_wikiner.read_sentences(copy_filename, "utf-8")
        assert sent2 == sentences

def run_split_wikiner(expected_train=14, expected_dev=3, expected_test=3, **kwargs):
    """
    Runs a test using various parameters to check the results of the splitting process

    expected_train, expected_dev, expected_test: the number of
      sentences expected in the train, dev, and test files.
      expected_test is only checked when a test section is requested.
    kwargs: passed directly to split_wikiner.split_wikiner.
      "test_section" and "shuffle" must always be supplied, as this
      function uses them to decide what to check; a KeyError is
      raised otherwise.  The filenames checked here are those for
      prefix="it_fbk".
    """
    # Looked up once, before any files are created.  These are
    # required, so direct indexing (not .get) is intentional.
    test_section = kwargs["test_section"]
    shuffle = kwargs["shuffle"]

    with tempfile.TemporaryDirectory() as indir, tempfile.TemporaryDirectory() as outdir:
        raw_filename = os.path.join(indir, "raw.tsv")
        # The sample contains non-ASCII characters and every file is
        # read back as utf-8, so write it explicitly as utf-8
        with open(raw_filename, "w", encoding="utf-8") as fout:
            fout.write(FBK_SAMPLE)

        split_wikiner.split_wikiner(outdir, raw_filename, **kwargs)

        train_file = os.path.join(outdir, "it_fbk.train.bio")
        dev_file = os.path.join(outdir, "it_fbk.dev.bio")
        test_file = os.path.join(outdir, "it_fbk.test.bio")

        assert os.path.exists(train_file)
        assert os.path.exists(dev_file)
        # a test file must exist if and only if a test section was requested
        assert os.path.exists(test_file) == bool(test_section)

        train_sent = split_wikiner.read_sentences(train_file, "utf-8")
        dev_sent = split_wikiner.read_sentences(dev_file, "utf-8")
        assert len(train_sent) == expected_train
        assert len(dev_sent) == expected_dev
        if test_section:
            test_sent = split_wikiner.read_sentences(test_file, "utf-8")
            assert len(test_sent) == expected_test
        else:
            test_sent = []

        # No sentence may be lost or duplicated by the split.  Without
        # shuffling the order must be preserved as well; with
        # shuffling only the contents are compared.
        orig_sents = split_wikiner.read_sentences(raw_filename, "utf-8")
        split_sents = train_sent + dev_sent + test_sent
        if shuffle:
            orig_sents = sorted(orig_sents)
            split_sents = sorted(split_sents)
        assert orig_sents == split_sents

def test_no_shuffle_split():
    """
    Splits without shuffling, keeping a test section, using the default expected sizes
    """
    options = {"prefix": "it_fbk", "shuffle": False, "test_section": True}
    run_split_wikiner(**options)

def test_shuffle_split():
    """
    Splits with shuffling, keeping a test section, using the default expected sizes
    """
    options = {"prefix": "it_fbk", "shuffle": True, "test_section": True}
    run_split_wikiner(**options)

def test_resize():
    """
    Splits with non-default train and dev fractions, expecting 12 / 2 / 6 sentences
    """
    expected_sizes = {
        "expected_train": 12,
        "expected_dev": 2,
        "expected_test": 6,
    }
    options = {
        "train_fraction": 0.6,
        "dev_fraction": 0.1,
        "prefix": "it_fbk",
        "shuffle": True,
        "test_section": True,
    }
    run_split_wikiner(**expected_sizes, **options)

def test_no_test_split():
    """
    Splits into train and dev only, expecting 17 train sentences and no test file
    """
    options = {
        "train_fraction": 0.85,
        "prefix": "it_fbk",
        "shuffle": False,
        "test_section": False,
    }
    run_split_wikiner(expected_train=17, **options)