from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from distutils.dir_util import copy_tree
from underthesea import word_tokenize
from utils.data_preprocessing import *
from vncorenlp import VnCoreNLP
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import pandas as pd
import numpy as np
from optimum.bettertransformer import BetterTransformer
from stqdm import stqdm

MODEL_PATH = "D:\\Thesis Topic modelling\\Phobert-base-v2-shopee"
TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"


def get_prediction(predictions, threshold=0.5):
    """
    Turn raw classification logits into binary multi-label predictions.

    Applies a sigmoid to the logits and thresholds the resulting
    probabilities, producing a 0/1 indicator per label.

    Parameters
    ----------
    predictions : torch.Tensor
        Raw logits from the model's last linear layer,
        shape (batch_size, num_labels).
    threshold : float, optional
        Probability cut-off above which a label is considered
        present (default 0.5).

    Returns
    -------
    numpy.ndarray
        Array of shape (batch_size, num_labels) containing 0/1
        label indicators (NOT probabilities).
    """
    # Sigmoid maps each logit independently into (0, 1) — appropriate
    # for multi-label (non-mutually-exclusive) classification.
    probs = torch.sigmoid(torch.Tensor(predictions))
    # Binarize: 1 where the probability clears the threshold, else 0.
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    return y_pred


class InferencePhobert:
    """Inference wrapper around a fine-tuned multi-label PhoBERT model."""

    def __init__(self, tokenize_model="underthesea", classification_model=MODEL_PATH):
        """
        Load the classification model and tokenizer.

        Parameters
        ----------
        tokenize_model : string
            Which word segmenter to use: the literal string
            "underthesea", or a filesystem path to a VnCoreNLP jar
            (rdrsegmenter).
        classification_model : string
            Path (or hub id) of the fine-tuned model weights.
        """
        labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
        id2label = {idx: label for idx, label in enumerate(labels)}
        label2id = {label: idx for idx, label in enumerate(labels)}
        model = AutoModelForSequenceClassification.from_pretrained(
            classification_model,
            problem_type="multi_label_classification",
            num_labels=len(labels),
            id2label=id2label,
            label2id=label2id,
        )
        model.eval()
        # BetterTransformer swaps in fused attention kernels for faster CPU/GPU inference.
        self.model = BetterTransformer.transform(model, keep_original_model=True)
        self.tokenizer = AutoTokenizer.from_pretrained(classification_model)
        self.segmenter_path = tokenize_model
        # VnCoreNLP spawns a JVM; create it lazily (and only once) on first use.
        self.rdrsegmenter = None

    def _ensure_rdrsegmenter(self):
        """Create and cache the VnCoreNLP word segmenter on first use.

        The original code rebuilt the segmenter (a separate JVM
        process) on every call to ``preprocess``/``predict_sentence``;
        caching avoids that repeated start-up cost and process leak.
        """
        if self.rdrsegmenter is None:
            self.rdrsegmenter = VnCoreNLP(
                self.segmenter_path, annotators="wseg", max_heap_size='-Xmx500m'
            )
        return self.rdrsegmenter

    def rdrsegment(self, text):
        """
        Tokenize text using rdrsegmenter.

        Parameters
        ----------
        text : string
            Input text.

        Returns
        -------
        string
            Word-segmented text, multi-word tokens joined by
            underscores (e.g. "san pham tot" -> "san_pham tot").
        """
        self._ensure_rdrsegmenter()
        sentences = self.rdrsegmenter.tokenize(text)
        # tokenize() returns a list of sentences, each a list of tokens;
        # flatten back into one space-separated string.
        return ' '.join(' '.join(sentence) for sentence in sentences)

    def preprocess(self, data):
        """
        Reformat raw text to fit the PhoBERT model.

        This includes word segmentation, byte-pair encoding and
        padding/truncation to a fixed length.

        Parameters
        ----------
        data : list
            Input text data (one string per review).

        Returns
        -------
        dictionary
            Encoding with input ids and attention masks.
        """
        if self.segmenter_path == "underthesea":
            text_list = [word_tokenize(text, format="text") for text in data]
        else:
            text_list = [self.rdrsegment(text) for text in data]
        # PhoBERT expects pre-segmented text; pad/truncate to 125 subword tokens.
        return self.tokenizer(
            text_list, padding="max_length", truncation=True, max_length=125
        )

    def generate_dataset(self, processed_data, batch_size=10):
        """
        Generate a torch DataLoader from preprocessed data.

        Parameters
        ----------
        processed_data : dictionary
            Output from the ``preprocess`` function.
        batch_size : int
            How many reviews to include in each iteration.

        Returns
        -------
        torch.utils.data.DataLoader
            Loader yielding (input_ids, attention_mask) batches in
            the original order (SequentialSampler — no shuffling).
        """
        inputs = torch.tensor(processed_data["input_ids"])
        masks = torch.tensor(processed_data["attention_mask"])
        dataset = TensorDataset(inputs, masks)
        data_loader = DataLoader(
            dataset, sampler=SequentialSampler(dataset), batch_size=batch_size
        )
        return data_loader

    def predict(self, dataset):
        """
        Get binary multi-label predictions from the PhoBERT model.

        Parameters
        ----------
        dataset : torch.utils.data.DataLoader
            Output from the ``generate_dataset`` function.

        Returns
        -------
        numpy.ndarray
            Array of shape (num_reviews, num_labels) of 0/1 label
            indicators (thresholded at 0.5).
        """
        predictions = []
        # model.eval() was already set in __init__; no_grad disables autograd
        # bookkeeping for the whole loop.
        with torch.no_grad():
            for b_input_ids, b_input_mask in stqdm(dataset, total=len(dataset)):
                # Batches from the DataLoader are already tensors — no re-wrapping needed.
                outputs = self.model(
                    b_input_ids, token_type_ids=None, attention_mask=b_input_mask
                )
                predictions.append(get_prediction(outputs[0], threshold=0.5))
        if not predictions:
            # np.concatenate([]) raises; return an empty result for an empty loader.
            return np.empty((0, self.model.config.num_labels))
        return np.concatenate(predictions)

    def predict_sentence(self, text):
        """
        Get label probabilities from the PhoBERT model for a single review.

        Parameters
        ----------
        text : string
            The raw review text.

        Returns
        -------
        torch.Tensor
            Tensor of shape (1, num_labels) with per-label sigmoid
            probabilities (NOT thresholded).
        """
        if self.segmenter_path == "underthesea":
            text = word_tokenize(text, format="text")
        else:
            text = self.rdrsegment(text)
        encoding = self.tokenizer(
            [text], padding="max_length", truncation=True, max_length=125
        )
        inputs = torch.tensor(encoding["input_ids"])
        masks = torch.tensor(encoding["attention_mask"])
        with torch.no_grad():
            output = self.model(inputs, token_type_ids=None, attention_mask=masks)
        return torch.sigmoid(output[0])