import torch
import torch.nn as nn
from torch.nn import functional as F
import gradio as gr

# Model hyper-parameters. The classes below read these as module-level globals.
n_emb = 64  # embedding width shared by every layer
block_size = 32  # side of the causal mask and size of the positional-embedding table
# head_size is not a constant here: BlockSeq derives it as int(n_emb / num_heads).
n_x = 4  # number of BlockSeq layers stacked inside TextGenerator
num_heads = 4  # attention heads per BlockSeq
# The next three values are not referenced anywhere in the visible part of this
# file; presumably left over from the training script - TODO confirm.
eval_iteration = 250
max_iters = 5000
batch_size = 32

# Device string: 'cuda' when PyTorch reports a usable GPU, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


class Head(nn.Module):
    """A single causal self-attention head.

    The input is projected to query/key/value vectors of width ``head_size``.
    Each position may only attend to itself and to earlier positions; the
    result is the attention-weighted sum of the value vectors.
    """

    def __init__(self, head_size):
        super().__init__()
        # Sub-modules are created in the order key, query, value so that
        # parameter initialisation draws from the RNG in a fixed sequence.
        self.key = nn.Linear(n_emb, head_size)
        self.query = nn.Linear(n_emb, head_size)
        self.value = nn.Linear(n_emb, head_size)
        self.dropout = nn.Dropout(0.0)

        # Lower-triangular matrix of ones; zeros mark the "future" positions
        # that must be hidden. Registered as a buffer so it follows the module
        # across devices without being a trainable parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        """Apply masked attention to ``x`` of shape (batch, seq, features).

        Returns a tensor of shape (batch, seq, head_size).
        """
        _, seq_len, in_width = x.shape

        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        # NOTE(review): the scale is derived from the *input* width
        # (x.shape[-1]), not from head_size. Left untouched on purpose:
        # changing it would change the outputs of already-trained weights.
        scale = in_width ** -0.5
        scores = torch.matmul(q, k.transpose(-2, -1)) * scale  # (batch, seq, seq)

        # -inf before softmax gives the hidden positions exactly zero weight.
        hidden_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden_future, float('-inf'))

        attn = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(attn, v)

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, followed by a projection.

    The head outputs are concatenated along the last dimension and passed
    through ``Linear(n_emb, n_emb)``, so ``head_size * num_heads`` has to equal
    ``n_emb`` for the shapes to line up.
    """

    def __init__(self, head_size, num_heads):
        super().__init__()
        self.heads = nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(Head(head_size))
        self.layer = nn.Linear(n_emb, n_emb)
        self.dropout = nn.Dropout(0.0)

    def forward(self, x):
        """Return the projected concatenation of every head's output for ``x``."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.layer(merged)
        return self.dropout(projected)


class FeedForward(nn.Module):
    """Two-layer perceptron applied to the last dimension of its input.

    Expands ``n_emb`` features to ``4 * n_emb``, applies ReLU, projects back
    to ``n_emb`` and finishes with a dropout layer (probability 0.0).
    """

    def __init__(self, n_emb):
        super().__init__()
        hidden = 4 * n_emb
        layers = [
            nn.Linear(n_emb, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_emb),
            nn.Dropout(0.0),
        ]
        self.dff = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the expand -> ReLU -> project -> dropout stack."""
        transformed = self.dff(x)
        return transformed

class BlockSeq(nn.Module):
    """One transformer layer: attention then feed-forward, each with a residual.

    LayerNorm is applied to the input of each sub-layer, and the sub-layer's
    output is added back onto the un-normalised stream.
    """

    def __init__(self, n_emb, num_heads):
        super().__init__()
        # Width of one head; the quotient is truncated to an int.
        per_head_width = int(n_emb / num_heads)
        self.mh_att = MultiHeadAttention(per_head_width, num_heads)
        self.ff_lay = FeedForward(n_emb)
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.mh_att(self.ln1(x))
        after_attention = x + attended

        transformed = self.ff_lay(self.ln2(after_attention))
        return after_attention + transformed


class TextGenerator(nn.Module):
    """Token-level language model built from a stack of ``BlockSeq`` layers.

    Pipeline: token embedding + learned position embedding -> ``n_x``
    ``BlockSeq`` layers -> final LayerNorm -> linear head that yields one
    logit per vocabulary entry at every position.

    Attribute names (including their spelling) are kept unchanged; a model
    object unpickled elsewhere presumably relies on them - TODO confirm
    before renaming.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): `vocab_size` is not defined in the visible part of
        # this file. Constructing TextGenerator() needs it as a module global;
        # unpickling an existing model object does not run __init__.
        self.lookup_token_emd_table = nn.Embedding(vocab_size, n_emb)
        self.postional_encoding = nn.Embedding(block_size, n_emb)
        self.blocks = nn.Sequential(*[BlockSeq(n_emb, num_heads) for _ in range(n_x)])
        self.layer_norm = nn.LayerNorm(n_emb)
        self.model_head = nn.Linear(n_emb, vocab_size)

    def forward(self, x, y=None):
        """Compute logits for ``x`` and, when ``y`` is given, the loss.

        Args:
            x: integer tensor of token ids, shape (batch, seq). ``seq`` must
                not exceed the size of the positional-embedding table.
            y: optional integer tensor of target ids with the same number of
                elements as ``x``.

        Returns:
            ``(logits, loss)``. ``logits`` has shape (batch, seq, vocab).
            ``loss`` is ``None`` when ``y`` is ``None``; otherwise it is the
            cross-entropy over all batch * seq positions.
        """
        _, seq_len = x.shape
        token_emb = self.lookup_token_emd_table(x)
        # Build position indices on the same device as the input. Using a
        # module-level device string here breaks as soon as the model/input
        # live somewhere else (e.g. a CPU tensor on a CUDA-capable host).
        positions = torch.arange(seq_len, device=x.device)
        pos_emb = self.postional_encoding(positions)  # broadcast over batch

        out = self.blocks(token_emb + pos_emb)
        out = self.layer_norm(out)
        out = self.model_head(out)

        if y is None:
            return out, None

        n_batch, n_steps, n_vocab = out.shape
        # cross_entropy wants (N, classes) logits against (N,) targets.
        # reshape (rather than view) also accepts non-contiguous targets.
        loss = F.cross_entropy(
            out.reshape(n_batch * n_steps, n_vocab),
            y.reshape(n_batch * n_steps),
        )
        return out, loss

    def generate(self, x, max_tokens=200):
        """Extend ``x`` by sampling ``max_tokens`` new token ids.

        Args:
            x: integer tensor of token ids, shape (batch, seq) with seq >= 1.
            max_tokens: number of ids to append.

        Returns:
            Tensor of shape (batch, seq + max_tokens): the original ids
            followed by the sampled ones.
        """
        # The model cannot see more positions than its positional table
        # holds, so crop using that size rather than a module-level constant
        # that may not match the loaded weights.
        context = self.postional_encoding.num_embeddings
        for _ in range(max_tokens):
            logits = self(x[:, -context:])[0]
            last_logits = logits[:, -1, :]  # only the final position predicts the next id
            probabilities = F.softmax(last_logits, dim=-1)
            next_x = torch.multinomial(probabilities, num_samples=1)
            x = torch.cat((x, next_x), dim=1)
        return x
      
      
import pickle

# SECURITY NOTE: torch.load(..., weights_only=False) and pickle.load both
# unpickle arbitrary Python objects and can execute code while doing so.
# Only point them at files you created or otherwise trust.
#
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only host.
# weights_only=False is spelled out because the file holds a whole pickled
# module, which newer PyTorch releases refuse to load by default.
model = torch.load('entire_model.pth', map_location='cpu', weights_only=False)
model.eval()  # inference only: put dropout-style layers in evaluation mode

with open('meta.pkl', 'rb') as f:
    meta = pickle.load(f)
# stoi: character -> token id, itos: token id -> character.
stoi, itos = meta['stoi'], meta['itos']


def encode(s):
    """Return the list of token ids for the characters of ``s``.

    Raises KeyError for a character that is missing from ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the list of characters for the token ids in ``l``."""
    return [itos[i] for i in l]


def reply(message, history=None):
    """Generate three sampled continuations of ``message``.

    Args:
        message: prompt text typed by the user.
        history: unused. Optional so the function works both with
            ``gr.Interface`` (which passes only the input value) and with
            chat-style callers that also pass a history.

    Returns:
        Three generated texts joined by newlines; each one starts with the
        (filtered) prompt followed by 200 sampled characters. An empty string
        is returned when the prompt has no usable characters.
    """
    if not message:
        return ''

    # Characters the vocabulary does not know are dropped instead of letting
    # the lookup raise KeyError on arbitrary user input.
    known_chars = [ch for ch in message if ch in stoi]
    prompt_ids = encode(known_chars)
    if not prompt_ids:
        return ''

    # Put the prompt on whatever device the model's weights live on.
    model_device = next(model.parameters()).device
    x = torch.tensor(prompt_ids, dtype=torch.long, device=model_device)[None, ...]

    samples = []
    with torch.no_grad():
        for _ in range(3):
            y = model.generate(x, 200)
            samples.append(''.join(decode(y[0].tolist())))
    return '\n'.join(samples)


# Wrap reply() in a one-textbox-in / one-textbox-out UI and start serving it.
demo = gr.Interface(fn=reply, inputs="text", outputs="text", title="Poet Demo")
demo.launch()