"""Fine-tune an extractive question-answering model (XLM-RoBERTa) on a
SQuAD-style JSON dataset using a custom QuestionAnsweringTrainer."""

import collections
import json
import logging
import os
import string
import sys
from dataclasses import dataclass, field
from typing import Optional

import evaluate
import numpy as np
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DefaultDataCollator,
    EvalPrediction,
    TrainingArguments,
)

from trainer_qa import QuestionAnsweringTrainer
from utils_qa import load_dataset, postprocess_qa_predictions

dataset_path = 'data/train.json'
model_checkpoint = 'xlm-roberta-base'

if __name__ == '__main__':
    # Load the raw dataset which contains context, question and answers.
    raw_dataset = load_dataset(dataset_path)

    # Load the pretrained tokenizer and model from huggingface.co.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

    def preprocess_function(examples):
        """Tokenize a batch of QA examples and attach start/end token labels.

        Args:
            examples: a batch dict with 'question', 'context' and 'answers'
                columns; each 'answers' entry is a list of dicts carrying
                'text' and 'answer_start' (character offset into the context).

        Returns:
            The tokenized batch, extended with 'start_positions' and
            'end_positions' (token indices of the answer span, or the CLS
            index when no answer falls inside the truncated context).
        """
        # Questions sometimes carry leading whitespace from extraction.
        examples['question'] = [q.lstrip() for q in examples['question']]
        tokenized_examples = tokenizer(
            examples['question'],
            examples['context'],
            truncation="only_second",           # never truncate the question
            max_length=tokenizer.model_max_length,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # The offset mapping gives, per token, its (start, end) character
        # span in the original text; we use it to turn character-level
        # answer positions into token-level start/end labels.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        assert len(offset_mapping) == len(tokenized_examples['input_ids'])

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offset in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            # sequence_ids: None for special tokens, 0 for the question,
            # 1 for the context.
            sequence_ids = tokenized_examples.sequence_ids(i)
            answers = examples['answers'][i]

            # Default label is the CLS index (the "no answer" target);
            # it is overwritten below when the answer lies in the context.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            if len(answers) == 0:
                continue

            # Token range [context_start, context_end] covering the context.
            context_start = sequence_ids.index(1)
            context_end = sequence_ids[context_start:].index(None) + context_start - 1

            start_char = answers[0]["answer_start"]
            end_char = start_char + len(answers[0]["text"])

            # If the answer is not FULLY inside the (possibly truncated)
            # context, keep the CLS label.
            # BUGFIX: the original compared start_char/end_char crosswise
            # (`> end_char or ... < start_char`), which only rejected spans
            # with zero overlap and mislabeled partially-truncated answers.
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                continue

            # Walk forward to the first context token starting after
            # start_char, then step back one: that token contains start_char.
            # BUGFIX: bound the scans to the context window. The original
            # used `< len(offset)` / `>= 0`; padding and special tokens have
            # offset (0, 0), so `0 <= start_char` is always true and the
            # start scan could run past the context to the end of the
            # padded sequence, yielding a bogus label.
            token_start_index = context_start
            while token_start_index <= context_end and offset[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"][-1] = token_start_index - 1

            # Symmetrically, walk backward to the last token ending before
            # end_char, then step forward one.
            token_end_index = context_end
            while token_end_index >= context_start and offset[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"][-1] = token_end_index + 1

        return tokenized_examples

    # Create train features from the raw dataset. 'answers' (and ids) are
    # kept so evaluation post-processing can build references.
    tokenized_dataset = raw_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=['title', 'context', 'question'],
    )

    def post_processing_function(features, tokenizer, predictions, stage="eval"):
        """Match start/end logits back to answer strings in the original
        context and format both predictions and references for the
        squad_v2 metric.

        Returns an EvalPrediction whose `predictions` carry
        'prediction_text' plus a zero 'no_answer_probability' (required by
        squad_v2) and whose `label_ids` carry the gold answers.
        """
        predictions = postprocess_qa_predictions(
            features=features,
            tokenizer=tokenizer,
            predictions=predictions,
        )
        formatted_predictions = [
            {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
            for k, v in predictions.items()
        ]
        references = [{"id": ft["id"], "answers": ft["answers"]} for ft in features]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = evaluate.load("squad_v2")

    def compute_metrics(p: EvalPrediction):
        """Compute SQuAD v2 metrics (exact match / F1) from the
        post-processed predictions and references."""
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    data_collator = DefaultDataCollator()

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy='steps',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_total_limit=1,            # keep only the most recent checkpoint
        save_steps=1000,
        eval_steps=1000,
        num_train_epochs=10,
        weight_decay=0.01,
    )

    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )
    trainer.train()