import dataclasses
import logging
import math
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import stanza.models.classifiers.data as data
from stanza.models.classifiers.base_classifier import BaseClassifier
from stanza.models.classifiers.config import CNNConfig
from stanza.models.classifiers.data import SentimentDatum
from stanza.models.classifiers.utils import ExtraVectors, ModelType, build_output_layers
from stanza.models.common.bert_embedding import extract_bert_embeddings
from stanza.models.common.data import get_long_tensor, sort_all
from stanza.models.common.utils import attach_bert_model
from stanza.models.common.vocab import PAD_ID, UNK_ID

"""
The CNN classifier is based on Yoon Kim's work:

https://arxiv.org/abs/1408.5882

Also included are maxpool 2d, conv 2d, and a bilstm, as in

Text Classification Improved by Integrating Bidirectional LSTM
with Two-dimensional Max Pooling
https://aclanthology.org/C16-1329.pdf

The architecture is simple:

- Embedding at the bottom layer
  - separate learnable entry for UNK, since many of the embeddings we have use 0 for UNK
- maybe a bilstm layer, as per a command line flag
- Some number of conv2d layers over the embedding
- Maxpool layers over small windows, window size being a parameter
- FC layer to the classification layer

One experiment which was run and found to be a bit of a negative was
putting a layer on top of the pretrain.  You would think that might
help, but dev performance went down for each variation of
  - trans(emb)
  - relu(trans(emb))
  - dropout(trans(emb))
  - dropout(relu(trans(emb)))
"""

# Package-wide logger; in this file it carries general debug output and the parameter norm report
logger = logging.getLogger('stanza')
# Trainer logger; in this file it carries model construction details and the configuration summary
tlogger = logging.getLogger('stanza.classifiers.trainer')

class CNNClassifier(BaseClassifier):
    def __init__(self, pretrain, extra_vocab, labels,
                 charmodel_forward, charmodel_backward, elmo_model, bert_model, bert_tokenizer, force_bert_saved, peft_name,
                 args):
        """
        Build the embeddings, the optional bilstm, the conv layers, and the FC layers.

        pretrain is a pretrained word embedding.  should have .emb and .vocab

        extra_vocab is a collection of words in the training data to
        be used for the delta word embedding, if used.  can be set to
        None if delta word embedding is not used.

        labels is the list of labels we expect in the training data.
        Used to derive the number of classes.  Saving it in the model
        will let us check that test data has the same labels

        charmodel_forward and charmodel_backward are optional character
        language models (None to skip either).  They need to provide
        hidden_dim(), is_forward_lm, and build_char_representation()

        elmo_model must be given when args.use_elmo is set.  It needs
        to provide sents2elmo()

        bert_model and bert_tokenizer are an optional transformer and
        its tokenizer.  A tokenizer is required whenever bert_model is given

        force_bert_saved is or'd with args.bert_finetune and then both
        stored in the config and passed along to attach_bert_model

        peft_name is the adapter name.  It is kept on the model itself
        rather than in the config

        args is either the complete arguments when training, or the
        subset of arguments stored in the model save file

        Raises ValueError if:
          - a charlm is passed in the wrong direction
          - an extra wordvec method is requested without extra_vocab
          - ExtraVectors.SUM is used with a mismatched extra_wordvec_dim
          - elmo is requested but elmo_model is None
          - bert_model is given without bert_tokenizer
          - a filter size is neither an int nor a tuple of length 2
        """
        super(CNNClassifier, self).__init__()
        self.labels = labels
        bert_finetune = args.bert_finetune
        use_peft = args.use_peft
        force_bert_saved = force_bert_saved or bert_finetune
        logger.debug("bert_finetune %s / force_bert_saved %s", bert_finetune, force_bert_saved)

        # this may change when loaded in a new Pipeline, so it's not part of the config
        self.peft_name = peft_name

        # we build a separate config out of the args so that we can easily save it in torch
        self.config = CNNConfig(filter_channels = args.filter_channels,
                                filter_sizes = args.filter_sizes,
                                fc_shapes = args.fc_shapes,
                                dropout = args.dropout,
                                num_classes = len(labels),
                                wordvec_type = args.wordvec_type,
                                extra_wordvec_method = args.extra_wordvec_method,
                                extra_wordvec_dim = args.extra_wordvec_dim,
                                extra_wordvec_max_norm = args.extra_wordvec_max_norm,
                                char_lowercase = args.char_lowercase,
                                charlm_projection = args.charlm_projection,
                                has_charlm_forward = charmodel_forward is not None,
                                has_charlm_backward = charmodel_backward is not None,
                                use_elmo = args.use_elmo,
                                elmo_projection = args.elmo_projection,
                                bert_model = args.bert_model,
                                bert_finetune = bert_finetune,
                                bert_hidden_layers = args.bert_hidden_layers,
                                force_bert_saved = force_bert_saved,

                                use_peft = use_peft,
                                lora_rank = args.lora_rank,
                                lora_alpha = args.lora_alpha,
                                lora_dropout = args.lora_dropout,
                                lora_modules_to_save = args.lora_modules_to_save,
                                lora_target_modules = args.lora_target_modules,

                                bilstm = args.bilstm,
                                bilstm_hidden_dim = args.bilstm_hidden_dim,
                                maxpool_width = args.maxpool_width,
                                model_type = ModelType.CNN)

        self.char_lowercase = args.char_lowercase

        # names of the submodules which get_params leaves out of the saved state
        self.unsaved_modules = []

        emb_matrix = pretrain.emb
        self.add_unsaved_module('embedding', nn.Embedding.from_pretrained(emb_matrix, freeze=True))
        self.add_unsaved_module('elmo_model', elmo_model)
        self.vocab_size = emb_matrix.shape[0]
        self.embedding_dim = emb_matrix.shape[1]

        self.add_unsaved_module('forward_charlm', charmodel_forward)
        if charmodel_forward is not None:
            tlogger.debug("Got forward char model of dimension {}".format(charmodel_forward.hidden_dim()))
            if not charmodel_forward.is_forward_lm:
                raise ValueError("Got a backward charlm as a forward charlm!")
        self.add_unsaved_module('backward_charlm', charmodel_backward)
        if charmodel_backward is not None:
            tlogger.debug("Got backward char model of dimension {}".format(charmodel_backward.hidden_dim()))
            if charmodel_backward.is_forward_lm:
                raise ValueError("Got a forward charlm as a backward charlm!")

        attach_bert_model(self, bert_model, bert_tokenizer, self.config.use_peft, force_bert_saved)

        # The Pretrain has PAD and UNK already (indices 0 and 1), but we
        # possibly want to train UNK while freezing the rest of the embedding
        # note that the /10.0 operation has to be inside nn.Parameter unless
        # you want to spend a long time debugging this
        self.unk = nn.Parameter(torch.randn(self.embedding_dim) / np.sqrt(self.embedding_dim) / 10.0)

        # replacing NBSP picks up a whole bunch of words for VI
        self.vocab_map = { word.replace('\xa0', ' '): i for i, word in enumerate(pretrain.vocab) }

        if self.config.extra_wordvec_method is not ExtraVectors.NONE:
            if not extra_vocab:
                raise ValueError("Should have had extra_vocab set for extra_wordvec_method {}".format(self.config.extra_wordvec_method))
            if not args.extra_wordvec_dim:
                # default the delta embedding to the same size as the pretrain
                self.config.extra_wordvec_dim = self.embedding_dim
            if self.config.extra_wordvec_method is ExtraVectors.SUM:
                if self.config.extra_wordvec_dim != self.embedding_dim:
                    raise ValueError("extra_wordvec_dim must equal embedding_dim for {}".format(self.config.extra_wordvec_method))

            self.extra_vocab = list(extra_vocab)
            self.extra_vocab_map = { word: i for i, word in enumerate(self.extra_vocab) }
            # TODO: possibly add regularization specifically on the extra embedding?
            # note: it looks like a bug that this doesn't add UNK or PAD, but actually
            # those are expected to already be the first two entries
            # the size comes from the materialized list so that any
            # iterable of words is usable as extra_vocab
            self.extra_embedding = nn.Embedding(num_embeddings = len(self.extra_vocab),
                                                embedding_dim = self.config.extra_wordvec_dim,
                                                max_norm = self.config.extra_wordvec_max_norm,
                                                padding_idx = 0)
            tlogger.debug("Extra embedding size: {}".format(self.extra_embedding.weight.shape))
        else:
            self.extra_vocab = None
            self.extra_vocab_map = None
            self.config.extra_wordvec_dim = 0
            self.extra_embedding = None

        # total_embedding_dim accumulates the width of each input
        # feature which forward() concatenates for each word
        if self.config.extra_wordvec_method is ExtraVectors.NONE:
            total_embedding_dim = self.embedding_dim
        elif self.config.extra_wordvec_method is ExtraVectors.SUM:
            total_embedding_dim = self.embedding_dim
        elif self.config.extra_wordvec_method is ExtraVectors.CONCAT:
            total_embedding_dim = self.embedding_dim + self.config.extra_wordvec_dim
        else:
            raise ValueError("unable to handle {}".format(self.config.extra_wordvec_method))

        if charmodel_forward is not None:
            if args.charlm_projection:
                self.charmodel_forward_projection = nn.Linear(charmodel_forward.hidden_dim(), args.charlm_projection)
                total_embedding_dim += args.charlm_projection
            else:
                self.charmodel_forward_projection = None
                total_embedding_dim += charmodel_forward.hidden_dim()

        if charmodel_backward is not None:
            if args.charlm_projection:
                self.charmodel_backward_projection = nn.Linear(charmodel_backward.hidden_dim(), args.charlm_projection)
                total_embedding_dim += args.charlm_projection
            else:
                self.charmodel_backward_projection = None
                total_embedding_dim += charmodel_backward.hidden_dim()

        if self.config.use_elmo:
            if elmo_model is None:
                raise ValueError("Model requires elmo, but elmo_model not passed in")
            # run a single word through elmo to find out how wide its output is
            elmo_dim = elmo_model.sents2elmo([["Test"]])[0].shape[1]

            # this mapping will combine 3 layers of elmo to 1 layer of features
            self.elmo_combine_layers = nn.Linear(in_features=3, out_features=1, bias=False)
            if self.config.elmo_projection:
                self.elmo_projection = nn.Linear(in_features=elmo_dim, out_features=self.config.elmo_projection)
                total_embedding_dim = total_embedding_dim + self.config.elmo_projection
            else:
                total_embedding_dim = total_embedding_dim + elmo_dim

        if bert_model is not None:
            if self.config.bert_hidden_layers:
                # The average will be offset by 1/N so that the default zeros
                # represents an average of the N layers
                if self.config.bert_hidden_layers > bert_model.config.num_hidden_layers:
                    # limit ourselves to the number of layers actually available
                    # note that we can +1 because of the initial embedding layer
                    self.config.bert_hidden_layers = bert_model.config.num_hidden_layers + 1
                self.bert_layer_mix = nn.Linear(self.config.bert_hidden_layers, 1, bias=False)
                nn.init.zeros_(self.bert_layer_mix.weight)
            else:
                # an average of layers 2, 3, 4 will be used
                # (for historic reasons)
                self.bert_layer_mix = None

            if bert_tokenizer is None:
                raise ValueError("Cannot have a bert model without a tokenizer")
            self.bert_dim = self.bert_model.config.hidden_size
            total_embedding_dim += self.bert_dim

        if self.config.bilstm:
            # bidirectional, so the conv layers see both directions concatenated
            conv_input_dim = self.config.bilstm_hidden_dim * 2
            self.bilstm = nn.LSTM(batch_first=True,
                                  input_size=total_embedding_dim,
                                  hidden_size=self.config.bilstm_hidden_dim,
                                  num_layers=2,
                                  bidirectional=True,
                                  dropout=0.2)
        else:
            conv_input_dim = total_embedding_dim
            self.bilstm = None

        self.fc_input_size = 0
        # Pytorch is "aware" of the existence of the nn.Modules inside
        # an nn.ModuleList in terms of parameters() etc
        self.conv_layers = nn.ModuleList()
        # forward() pads every phrase to at least max_window words
        self.max_window = 0
        for filter_idx, filter_size in enumerate(self.config.filter_sizes):
            if isinstance(filter_size, int):
                # a filter covering filter_size words and the entire feature width
                self.max_window = max(self.max_window, filter_size)
                if isinstance(self.config.filter_channels, int):
                    filter_channels = self.config.filter_channels
                else:
                    filter_channels = self.config.filter_channels[filter_idx]
                # forward() maxpools over groups of maxpool_width channels
                fc_delta = filter_channels // self.config.maxpool_width
                tlogger.debug("Adding full width filter %d.  Output channels: %d -> %d", filter_size, filter_channels, fc_delta)
                self.fc_input_size += fc_delta
                self.conv_layers.append(nn.Conv2d(in_channels=1,
                                                  out_channels=filter_channels,
                                                  kernel_size=(filter_size, conv_input_dim)))
            elif isinstance(filter_size, tuple) and len(filter_size) == 2:
                filter_height, filter_width = filter_size
                # the height is the kernel dimension which slides over
                # the words, so phrases need to be padded to at least
                # filter_height or the conv fails on short phrases.
                # filter_width stays in the max so that the padding is
                # never smaller than it was when only the width was used
                self.max_window = max(self.max_window, filter_height, filter_width)
                if isinstance(self.config.filter_channels, int):
                    # spread the channel budget over the strided positions across the features
                    filter_channels = max(1, self.config.filter_channels // (conv_input_dim // filter_width))
                else:
                    filter_channels = self.config.filter_channels[filter_idx]
                fc_delta = filter_channels * (conv_input_dim // filter_width) // self.config.maxpool_width
                tlogger.debug("Adding filter %s.  Output channels: %d -> %d", filter_size, filter_channels, fc_delta)
                self.fc_input_size += fc_delta
                self.conv_layers.append(nn.Conv2d(in_channels=1,
                                                  out_channels=filter_channels,
                                                  stride=(1, filter_width),
                                                  kernel_size=(filter_height, filter_width)))
            else:
                raise ValueError("Expected int or 2d tuple for conv size")

        tlogger.debug("Input dim to FC layers: %d", self.fc_input_size)
        self.fc_layers = build_output_layers(self.fc_input_size, self.config.fc_shapes, self.config.num_classes)

        self.dropout = nn.Dropout(self.config.dropout)

    def add_unsaved_module(self, name, module):
        self.unsaved_modules += [name]
        setattr(self, name, module)

        if module is not None and (name in ('forward_charlm', 'backward_charlm') or
                                   (name == 'bert_model' and not self.config.use_peft)):
            # if we are using peft, we should not save the transformer directly
            # instead, the peft parameters only will be saved later
            for _, parameter in module.named_parameters():
                parameter.requires_grad = False

    def is_unsaved_module(self, name):
        return name.split('.')[0] in self.unsaved_modules

    def log_configuration(self):
        """
        Log some essential information about the model configuration to the training logger

        The values are passed as logger arguments rather than being
        formatted up front, so no string is built when INFO is disabled.
        """
        tlogger.info("Filter sizes: %s", self.config.filter_sizes)
        tlogger.info("Filter channels: %s", self.config.filter_channels)
        tlogger.info("Intermediate layers: %s", self.config.fc_shapes)

    def log_norms(self):
        """
        Log the norm of each trainable parameter as a single multi-line message

        Parameters which do not require grad are skipped, as is
        anything under the forward or backward charlm.
        """
        charlm_names = ('forward_charlm', 'backward_charlm')
        report = ["NORMS FOR MODEL PARAMTERS"]
        report.extend("%s %.6g" % (name, torch.norm(param).item())
                      for name, param in self.named_parameters()
                      if param.requires_grad and name.split(".")[0] not in charlm_names)
        logger.info("\n".join(report))

    def build_char_reps(self, inputs, max_phrase_len, charlm, projection, begin_paddings, device):
        char_reps = charlm.build_char_representation(inputs)
        if projection is not None:
            char_reps = [projection(x) for x in char_reps]
        char_inputs = torch.zeros((len(inputs), max_phrase_len, char_reps[0].shape[-1]), device=device)
        for idx, rep in enumerate(char_reps):
            start = begin_paddings[idx]
            end = start + rep.shape[0]
            char_inputs[idx, start:end, :] = rep
        return char_inputs

    def extract_bert_embeddings(self, inputs, max_phrase_len, begin_paddings, device):
        """
        Build a zero padded tensor of transformer features for a batch of phrases

        The per-phrase features come from the module level
        extract_bert_embeddings.  If there is a bert_layer_mix, the
        layers are combined using the learned mix plus a plain average
        of the layers.  Each phrase is then copied into its row of the
        result starting at the offset in begin_paddings.

        Returns a tensor of len(inputs) x max_phrase_len x feature dim
        """
        layer_mix = self.bert_layer_mix
        num_layers = None if layer_mix is None else layer_mix.in_features
        features = extract_bert_embeddings(self.config.bert_model, self.bert_tokenizer, self.bert_model, inputs, device,
                                           keep_endpoints=False,
                                           num_layers=num_layers,
                                           detach=not self.config.bert_finetune,
                                           peft_name=self.peft_name)
        if layer_mix is not None:
            # add the average so that the default behavior is to
            # take an average of the N layers, and anything else
            # other than that needs to be learned
            features = [layer_mix(feature).squeeze(2) + feature.sum(axis=2) / layer_mix.in_features
                        for feature in features]

        padded = torch.zeros((len(inputs), max_phrase_len, features[0].shape[-1]), device=device)
        for row, rep in enumerate(features):
            offset = begin_paddings[row]
            padded[row, offset:offset + rep.shape[0], :] = rep
        return padded

    def forward(self, inputs):
        """
        Compute the classification logits for a batch of phrases

        inputs is a list of phrases.  Each item is either a
        SentimentDatum, in which case its .text is used, or the
        sequence of words itself.

        Each phrase is padded to the longest phrase in the batch or to
        the largest conv window, whichever is longer.  When training,
        the padding is split randomly between the beginning and end of
        the phrase.  Otherwise all of the padding goes at the end.

        Returns the raw logits from the last FC layer.  No softmax is applied.
        """
        # assume all pieces are on the same device
        device = next(self.parameters()).device

        vocab_map = self.vocab_map
        def map_word(word):
            # lookup order: exact match, trailing ' removed, lowercased, UNK
            idx = vocab_map.get(word, None)
            if idx is not None:
                return idx
            # endswith rather than word[-1] so that an empty token
            # falls through to UNK instead of raising an IndexError
            if word.endswith("'"):
                idx = vocab_map.get(word[:-1], None)
                if idx is not None:
                    return idx
            return vocab_map.get(word.lower(), UNK_ID)

        inputs = [x.text if isinstance(x, SentimentDatum) else x for x in inputs]
        # we will pad each phrase so either it matches the longest
        # conv or the longest phrase in the input, whichever is longer
        max_phrase_len = max(len(x) for x in inputs)
        if self.max_window > max_phrase_len:
            max_phrase_len = self.max_window

        batch_indices = []
        batch_unknowns = []
        extra_batch_indices = []
        begin_paddings = []

        elmo_batch_words = []

        for phrase in inputs:
            # we use random at training time to try to learn different
            # positions of padding.  at test time, though, we want to
            # have consistent results, so we set that to 0 begin_pad
            if self.training:
                begin_pad_width = random.randint(0, max_phrase_len - len(phrase))
            else:
                begin_pad_width = 0
            end_pad_width = max_phrase_len - begin_pad_width - len(phrase)

            # the charlm and bert features use these offsets to line
            # up with the word embeddings
            begin_paddings.append(begin_pad_width)

            # the initial lists are the length of the begin padding
            sentence_indices = [PAD_ID] * begin_pad_width
            sentence_indices.extend([map_word(x) for x in phrase])
            sentence_indices.extend([PAD_ID] * end_pad_width)

            # the "unknowns" will be the locations of the unknown words.
            # these locations will get the specially trained unknown vector
            # TODO: split UNK based on part of speech?  might be an interesting experiment
            sentence_unknowns = [idx for idx, word in enumerate(sentence_indices) if word == UNK_ID]

            batch_indices.append(sentence_indices)
            batch_unknowns.append(sentence_unknowns)

            if self.extra_vocab:
                extra_sentence_indices = [PAD_ID] * begin_pad_width
                for word in phrase:
                    if word in self.extra_vocab_map:
                        # the extra vocab is initialized from the
                        # words in the training set, which means there
                        # would be no unknown words.  to occasionally
                        # train the extra vocab's unknown words, we
                        # replace 1% of the words with UNK
                        # we don't do that for the original embedding
                        # on the assumption that there may be some
                        # unknown words in the training set anyway
                        # TODO: maybe train unk for the original embedding?
                        if self.training and random.random() < 0.01:
                            extra_sentence_indices.append(UNK_ID)
                        else:
                            extra_sentence_indices.append(self.extra_vocab_map[word])
                    else:
                        extra_sentence_indices.append(UNK_ID)
                extra_sentence_indices.extend([PAD_ID] * end_pad_width)
                extra_batch_indices.append(extra_sentence_indices)

            if self.config.use_elmo:
                # elmo gets empty strings at the padded positions
                elmo_phrase_words = [""] * begin_pad_width
                elmo_phrase_words.extend(phrase)
                elmo_phrase_words.extend([""] * end_pad_width)
                elmo_batch_words.append(elmo_phrase_words)

        # creating a single large list with all the indices lets us
        # create a single tensor, which is much faster than creating
        # many tiny tensors
        # we can convert this to the input to the CNN
        # it is padded at one or both ends so that it is now num_phrases x max_len x emb_size
        # there are two ways in which this padding is suboptimal
        # the first is that for short sentences, smaller windows will
        #   be padded to the point that some windows are entirely pad
        # the second is that a sentence S will have more or less padding
        #   depending on what other sentences are in its batch
        # we assume these effects are pretty minimal
        batch_indices = torch.tensor(batch_indices, requires_grad=False, device=device)
        input_vectors = self.embedding(batch_indices)
        # we use the random unk so that we are not necessarily
        # learning to match 0s for unk
        for phrase_num, sentence_unknowns in enumerate(batch_unknowns):
            input_vectors[phrase_num][sentence_unknowns] = self.unk

        if self.extra_vocab:
            extra_batch_indices = torch.tensor(extra_batch_indices, requires_grad=False, device=device)
            extra_input_vectors = self.extra_embedding(extra_batch_indices)
            if self.config.extra_wordvec_method is ExtraVectors.CONCAT:
                all_inputs = [input_vectors, extra_input_vectors]
            elif self.config.extra_wordvec_method is ExtraVectors.SUM:
                all_inputs = [input_vectors + extra_input_vectors]
            else:
                raise ValueError("unable to handle {}".format(self.config.extra_wordvec_method))
        else:
            all_inputs = [input_vectors]

        if self.forward_charlm is not None:
            char_reps_forward = self.build_char_reps(inputs, max_phrase_len, self.forward_charlm, self.charmodel_forward_projection, begin_paddings, device)
            all_inputs.append(char_reps_forward)

        if self.backward_charlm is not None:
            char_reps_backward = self.build_char_reps(inputs, max_phrase_len, self.backward_charlm, self.charmodel_backward_projection, begin_paddings, device)
            all_inputs.append(char_reps_backward)

        if self.config.use_elmo:
            # this will be N arrays of 3xMx1024 where M is the number of words
            # and N is the number of sentences (and 1024 is actually the number of weights)
            elmo_arrays = self.elmo_model.sents2elmo(elmo_batch_words, output_layer=-2)
            elmo_tensors = [torch.tensor(x).to(device=device) for x in elmo_arrays]
            # elmo_tensor will now be Nx3xMx1024
            elmo_tensor = torch.stack(elmo_tensors)
            # Nx1024xMx3
            elmo_tensor = torch.transpose(elmo_tensor, 1, 3)
            # NxMx1024x3
            elmo_tensor = torch.transpose(elmo_tensor, 1, 2)
            # NxMx1024x1
            elmo_tensor = self.elmo_combine_layers(elmo_tensor)
            # NxMx1024
            elmo_tensor = elmo_tensor.squeeze(3)
            if self.config.elmo_projection:
                elmo_tensor = self.elmo_projection(elmo_tensor)
            all_inputs.append(elmo_tensor)

        if self.bert_model is not None:
            bert_embeddings = self.extract_bert_embeddings(inputs, max_phrase_len, begin_paddings, device)
            all_inputs.append(bert_embeddings)

        # still works even if there's just one item
        input_vectors = torch.cat(all_inputs, dim=2)

        if self.config.bilstm:
            input_vectors, _ = self.bilstm(self.dropout(input_vectors))

        # reshape to fit the input tensors
        # the new dim 1 is the single input channel of the conv layers
        x = input_vectors.unsqueeze(1)

        conv_outs = []
        for conv, filter_size in zip(self.conv_layers, self.config.filter_sizes):
            if isinstance(filter_size, int):
                # the filter covers the whole feature width, so the
                # last dim of the conv output is 1 and can be dropped
                conv_out = self.dropout(F.relu(conv(x).squeeze(3)))
            else:
                # fold the positions across the features into the channel dim
                conv_out = conv(x).transpose(2, 3).flatten(1, 2)
                conv_out = self.dropout(F.relu(conv_out))
            conv_outs.append(conv_out)
        # pool over the whole length of the phrase and over groups of maxpool_width channels
        pool_outs = [F.max_pool2d(out, (self.config.maxpool_width, out.shape[2])).squeeze(2) for out in conv_outs]
        pooled = torch.cat(pool_outs, dim=1)

        previous_layer = pooled
        for fc in self.fc_layers[:-1]:
            previous_layer = self.dropout(F.relu(fc(previous_layer)))
        out = self.fc_layers[-1](previous_layer)
        # note that we return the raw logits rather than use a softmax
        # https://discuss.pytorch.org/t/multi-class-cross-entropy-loss-and-softmax-in-pytorch/24920/4
        return out

    def get_params(self, skip_modules=True):
        model_state = self.state_dict()
        # skip saving modules like pretrained embeddings, because they are large and will be saved in a separate file
        if skip_modules:
            skipped = [k for k in model_state.keys() if self.is_unsaved_module(k)]
            for k in skipped:
                del model_state[k]

        config = dataclasses.asdict(self.config)
        config['wordvec_type'] = config['wordvec_type'].name
        config['extra_wordvec_method'] = config['extra_wordvec_method'].name
        config['model_type'] = config['model_type'].name

        params = {
            'model':        model_state,
            'config':       config,
            'labels':       self.labels,
            'extra_vocab':  self.extra_vocab,
        }
        if self.config.use_peft:
            # Hide import so that peft dependency is optional
            from peft import get_peft_model_state_dict
            params["bert_lora"] = get_peft_model_state_dict(self.bert_model, adapter_name=self.peft_name)
        return params

    def preprocess_data(self, sentences):
        """
        Run data.update_text over each sentence using this model's wordvec_type

        Returns the results as a new list.
        """
        wordvec_type = self.config.wordvec_type
        return [data.update_text(sentence, wordvec_type) for sentence in sentences]

    def extract_sentences(self, doc):
        # TODO: tokens or words better here?
        return [[token.text for token in sentence.tokens] for sentence in doc.sentences]