import warnings

import numpy as np
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn.functional import embedding

from ..init import assign_tensor


class Embedding(nn.Module):
    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.

    This module is often used to store word embeddings and retrieve them using indices.
    The input to the module is a list of indices, and the output is the corresponding
    word embeddings.

    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        init_embedding (Tensor): if given, the embedding will be initialized with the given tensor
        freeze (bool, optional): if ``True``, the tensor does not get updated during training
        padding_idx (int, optional): if given, pads the output with zeros whenever it
            encounters the index
        max_norm (float, optional): if given, renormalizes the embeddings to always have
            a norm less than this
        norm_type (float, optional): the p of the p-norm to compute for the max_norm option
        scale_grad_by_freq (bool, optional): if given, scales gradients by the frequency
            of the words in the mini-batch
        sparse (bool, optional): if ``True``, the gradient w.r.t. the weight matrix will be
            a sparse tensor. See Notes for more details regarding sparse gradients.

    Attributes:
        weight (Tensor): the learnable weights of the module of shape
            (num_embeddings, embedding_dim)

    Shape:
        - Input: LongTensor `(N1, N2, ..., Nm, W)`, N = mini-batch, W = number of indices
          to extract per mini-batch
        - Output: `(N1, N2, ..., Nm, W, embedding_dim)`

    Notes:
        Keep in mind that only a limited number of optimizers support sparse gradients:
        currently it's `optim.SGD` (`cuda` and `cpu`) and `optim.Adagrad` (`cpu`).
    """

    def __init__(self, num_embeddings, embedding_dim, init_embedding=None, freeze=False,
                 padding_idx=None, max_norm=None, norm_type=2, scale_grad_by_freq=False,
                 sparse=False):
        super(Embedding, self).__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim))
        self.frozen = freeze
        self.sparse = sparse
        self.reset_parameters(init_embedding)

    def reset_parameters(self, init_embedding):
        if init_embedding is None:
            # Uniform initialization scaled by the embedding dimension.
            scale = np.sqrt(3.0 / self.embedding_dim)
            self.weight.data.uniform_(-scale, scale)
        else:
            assign_tensor(self.weight, init_embedding)
        if self.padding_idx is not None:
            self.weight.data[self.padding_idx].fill_(0)
        if self.frozen:
            if init_embedding is None:
                warnings.warn('Freezing embeddings that are randomly initialized.')
            self.weight.requires_grad = False

    def freeze(self):
        self.weight.requires_grad = False
        self.frozen = True

    def forward(self, input):
        input_size = input.size()
        if input.dim() > 2:
            # Flatten all leading dimensions so the lookup sees a 2D index tensor.
            num_inputs = int(np.prod(input_size[:-1]))
            input = input.view(num_inputs, input_size[-1])

        output_size = input_size + (self.embedding_dim,)
        # torch.nn.functional.embedding accepts padding_idx=None directly;
        # passing -1 in that case would wrongly treat the last row as padding.
        return embedding(input, self.weight, self.padding_idx, self.max_norm,
                         self.norm_type, self.scale_grad_by_freq,
                         self.sparse).view(output_size)

    def __repr__(self):
        s = '{name}({num_embeddings}, {embedding_dim}'
        if self.padding_idx is not None:
            s += ', padding_idx={padding_idx}'
        if self.max_norm is not None:
            s += ', max_norm={max_norm}'
        if self.norm_type != 2:
            s += ', norm_type={norm_type}'
        if self.scale_grad_by_freq:
            s += ', scale_grad_by_freq={scale_grad_by_freq}'
        if self.sparse:
            s += ', sparse=True'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)
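

# Illustrative usage sketch, not part of the module's API. It assumes this file is
# imported as part of its package (so the relative import of `assign_tensor` resolves,
# e.g. via `python -m <package>.embedding`); the vocabulary size, dimensions, and the
# pretrained tensor below are made up for demonstration only.
if __name__ == '__main__':
    # Random initialization: weights are drawn uniformly from [-sqrt(3/dim), sqrt(3/dim)],
    # and the row at padding_idx is zeroed.
    embed = Embedding(num_embeddings=100, embedding_dim=8, padding_idx=0)
    indices = torch.LongTensor([[1, 2, 0], [4, 5, 0]])  # shape (N=2, W=3)
    out = embed(indices)
    print(out.size())  # torch.Size([2, 3, 8])

    # Initialization from a pretrained tensor that is then frozen (no gradient updates).
    pretrained = torch.randn(100, 8)
    frozen_embed = Embedding(100, 8, init_embedding=pretrained, freeze=True)
    print(frozen_embed.weight.requires_grad)  # False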