# AUTOGENERATED! DO NOT EDIT! File to edit: ../Attention Classifier Pytorch Student.ipynb.

# %% auto 0
__all__ = ['tokenizer', 'vocab_size', 'embedding_size', 'num_heads', 'embeddings_fname', 'model', 's', 'pred', 'tokens', 'attn',
           'intf', 'DocumentAttentionClassifier', 'get_label_and_weights', 'visualize_attention',
           'predict_and_visualize']

# %% ../Attention Classifier Pytorch Student.ipynb 2
import gradio as gr
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import json

np.random.seed(42)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from collections import Counter
import random
from torch import optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer

# Attention plotting
import matplotlib.pyplot as plt

# %% ../Attention Classifier Pytorch Student.ipynb 4
# Load the word-to-index mapping we used for word2vec and use the same type
# of tokenizer. We'll need to use this to tokenize in the same way and keep 
# the same word-to-id mapping

# Split text into runs of word characters; punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')

# Read the vocabulary mappings as UTF-8 explicitly so that decoding does not
# depend on the platform's default locale encoding.
with open('word_to_index.json', 'r', encoding='utf-8') as f:
    word_to_index = json.load(f)
# NOTE: JSON object keys are always strings, so the keys of this mapping are
# strings after loading, not ints.
with open('index_to_word.json', 'r', encoding='utf-8') as f:
    index_to_word = json.load(f)

# %% ../Attention Classifier Pytorch Student.ipynb 6
class DocumentAttentionClassifier(nn.Module):
    '''
    Binary document classifier that applies several learned attention heads
    to a sequence of word embeddings.

    Each attention head is one row of the `attention_heads` matrix (a vector of
    length `embedding_size`). A head scores every token by the dot product of
    the head with the token's embedding, the scores are softmax-normalised over
    the sequence, and the resulting weights are used to average the token
    embeddings. The per-head averages are laid side by side and passed through
    a single linear layer followed by a sigmoid.
    '''

    def __init__(self, vocab_size, embedding_size, num_heads, embeddings_fname):
        '''
        Creates the new classifier model.

        vocab_size: number of rows of the word embedding table.
        embedding_size: length of each word embedding (and of each head).
        num_heads: number of attention heads.
        embeddings_fname: file saved with torch.save that holds the pretrained
            word embeddings, either as the bare weight tensor or as the state
            dict of an nn.Embedding (a mapping with a 'weight' entry).
        '''
        super().__init__()

        # Save the input arguments to the state
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_heads = num_heads

        # Word embeddings start from the pretrained values rather than from
        # scratch. map_location='cpu' lets a file that was saved from a GPU be
        # read on a CPU-only machine; load_state_dict copies the values into
        # this module's own parameter either way.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        pretrained_embeddings = torch.load(embeddings_fname, map_location='cpu')
        if isinstance(pretrained_embeddings, torch.Tensor):
            # A bare weight tensor: wrap it so it looks like a state dict.
            pretrained_embeddings = {'weight': pretrained_embeddings}
        self.embeddings.load_state_dict(pretrained_embeddings)

        # Attention heads are rows of one matrix so that forward() can score
        # all heads with a single matrix multiplication. Wrapping the tensor in
        # nn.Parameter registers it for gradient descent.
        self.attention_heads = nn.Parameter(torch.randn(num_heads, embedding_size))

        # Maps the concatenated per-head vectors to a single output activation.
        self.output_layer = nn.Linear(num_heads * embedding_size, 1)

    def forward(self, word_ids):
        '''
        Classifies a batch of documents.

        word_ids: integer tensor of shape [batch_size, seq_len]. The softmax
            below is taken over dimension 1, so the batch dimension is required.

        Returns a tuple (prediction, attention_weights) where prediction has
        shape [batch_size] and holds sigmoid outputs, and attention_weights has
        shape [batch_size, seq_len, num_heads]; for every document and head the
        weights sum to 1 across the sequence.

        NOTE(review): every position takes part in the softmax, so padding ids
        (if a caller pads batches) receive attention weight too.
        '''
        # [batch_size, seq_len, embedding_size]
        word_embeds = self.embeddings(word_ids)

        # 'r' vectors: dot product of every head with every word embedding.
        # [batch_size, seq_len, embedding_size] @ [embedding_size, num_heads]
        # -> [batch_size, seq_len, num_heads]
        attention_scores = torch.matmul(word_embeds, self.attention_heads.t())

        # 'a' vectors: one probability distribution over the tokens per head,
        # hence the softmax over the seq_len dimension.
        attention_weights = F.softmax(attention_scores, dim=1)

        # Attention-weighted sum of the word embeddings for each head.
        # [batch_size, num_heads, seq_len] @ [batch_size, seq_len, embedding_size]
        # -> [batch_size, num_heads, embedding_size]
        attended_embeddings = torch.matmul(attention_weights.transpose(1, 2), word_embeds)

        # Lay the heads' vectors side by side as one long vector per document.
        # [batch_size, num_heads * embedding_size]
        concatenated = attended_embeddings.reshape(attended_embeddings.size(0), -1)

        # [batch_size, 1] -> sigmoid -> [batch_size]
        output = self.output_layer(concatenated)
        prediction = torch.sigmoid(output).squeeze(1)

        # The attention weights are returned as well for later visualization.
        return prediction, attention_weights

# %% ../Attention Classifier Pytorch Student.ipynb 32
# Hyperparameters handed to the classifier constructor. The checkpoint loaded
# further down is applied with load_state_dict, which requires the parameter
# shapes built from these values to match the saved ones.
vocab_size = len(word_to_index)  # one embedding row per entry of word_to_index
embedding_size = 50  # the size used in word2vec model
num_heads = 5  # number of attention heads
embeddings_fname = 'model_weights_target.pt'
model = DocumentAttentionClassifier(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    num_heads=num_heads,
    embeddings_fname=embeddings_fname,
)


# %% ../Attention Classifier Pytorch Student.ipynb 33
# Overwrite the freshly initialised parameters with the saved classifier
# checkpoint, mapping all tensors onto the CPU.
model.load_state_dict(
    torch.load('attention_clf_model.pt', map_location='cpu')
)


# %% ../Attention Classifier Pytorch Student.ipynb 34
def get_label_and_weights(text, model):
    '''
    Classifies the text (requires tokenizing, etc.) and returns (1) the classification label,
    (2) the tokenized words in the model's vocabulary,
    and (3) the attention weights over the in-vocab tokens as a numpy array. Note that the
    attention weights will be a matrix, depending on how many heads were used in training.

    The model itself still sees every token (out-of-vocabulary words are fed in
    as '<UNK>'), so the prediction is based on the full text. Only the returned
    tokens and weights are restricted to in-vocabulary positions, which keeps
    row i of the returned matrix aligned with token i. The returned matrix has
    shape (num_in_vocab_tokens, num_heads).
    '''
    unk_id = word_to_index['<UNK>']

    # Lower-case and tokenize, then map every token to its id (or to <UNK>).
    words = tokenizer.tokenize(text.lower())
    word_ids = [word_to_index.get(word, unk_id) for word in words]

    device = 'cpu'
    model = model.to(device)
    # Explicit long dtype: without it an empty id list would be inferred as a
    # float tensor, which cannot be used to index an embedding table.
    word_ids_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

    # Forward pass through the model, without dropout-style training behaviour
    # and without tracking gradients.
    model.eval()
    with torch.no_grad():
        output, attention_weights = model(word_ids_tensor)

    # Convert the single output value to the user-facing label.
    if output.item() > 0.5:
        predicted_label = "That's positive and so are you!"
    else:
        predicted_label = "Why so negative?"

    # Positions holding a real vocabulary word (i.e. not <UNK>).
    in_vocab = np.array([word_id != unk_id for word_id in word_ids], dtype=bool)

    # Convert the kept ids back to words; index_to_word is keyed by strings.
    tokens = [index_to_word.get(str(word_id))
              for word_id, keep in zip(word_ids, in_vocab) if keep]

    # Drop the batch dimension, then drop the rows of the <UNK> positions so
    # the weights line up with `tokens`.
    attention_weights_numpy = attention_weights.cpu().numpy().squeeze(0)
    return predicted_label, tokens, attention_weights_numpy[in_vocab]

# %% ../Attention Classifier Pytorch Student.ipynb 36
def visualize_attention(words, attention_weights):
    '''
    Makes a heatmap figure that visualizes the attention weights for an item.
    Attention weights should be a numpy array that has the shape (num_words, num_heads)

    The heatmap has one column per word and one row per attention head. The
    figure is written to a temporary PNG file, shown with plt.show(), and then
    closed. Returns the path of the PNG file as a string; the file is not
    deleted by this function.
    '''
    # Imported locally because the module does not import these at the top.
    import os
    import tempfile

    num_heads = attention_weights.shape[1]

    fig, ax = plt.subplots()
    # Rescale image size based on the input length; at least one inch wide so
    # an empty word list does not produce a zero-width figure.
    fig.set_size_inches((max(len(words), 1), 4))
    im = ax.imshow(attention_weights.T)

    ax.set_xticks(np.arange(len(words)))
    ax.set_yticks(np.arange(num_heads))

    # Add the words and axis labels; word labels are rotated and right-aligned
    # about their anchor.
    ax.set_xticklabels(labels=words, fontsize=16, rotation=45, ha="right",
                       rotation_mode="anchor")
    ax.set_yticklabels(labels=range(num_heads), fontsize=16)
    ax.set_ylabel('Attention Head', fontsize=16)

    # Add a color bar to show probability scaling
    cb = fig.colorbar(im, ax=ax, pad=0.01)
    cb.ax.tick_params(labelsize=16)
    cb.set_label(label='Probability', size=16)
    fig.tight_layout()

    # mkstemp returns an open descriptor; close it and let matplotlib write to
    # the path. Save before show(), since show() may discard the figure.
    fd, image_path = tempfile.mkstemp(suffix='.png')
    os.close(fd)
    fig.savefig(image_path)

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    return image_path

# %% ../Attention Classifier Pytorch Student.ipynb 38
# s = 'Just as I remembered it, one of my favorites from childhood! Great condition, very happy to have this to share with my daughter. Packaging was so nice and was received quickly.'
# pred, tokens, attn = get_label_and_weights(s, model)
# visualize_attention(tokens, attn)
# print(pred)

# %% ../Attention Classifier Pytorch Student.ipynb 39
def predict_and_visualize(s):
    '''
    Callback for the Gradio interface: classifies the text `s` with the
    module-level `model` and draws its attention heatmap.

    Returns a pair (label, image) where `label` is the label string from
    get_label_and_weights and `image` is whatever visualize_attention returns
    for the tokens and attention weights of `s`.
    '''
    label, words, weights = get_label_and_weights(s, model)
    return label, visualize_attention(words, weights)


# %% ../Attention Classifier Pytorch Student.ipynb 40

# Gradio front end: one text box in; a text label and an image out, both
# produced by predict_and_visualize.
intf = gr.Interface(
    fn=predict_and_visualize,
    inputs="text",
    outputs=["text", "image"],
    title="Text Review Classifier with Attention Visualization",
    description="Enter a review to see the prediction and attention visualization.",
    examples=[
        "The book was amazing!",
        "Today's Weather is pretty bad!",
        "How are you feeling",
    ],
)

# Start the app when this module is executed; share=True requests a public link.
intf.launch(share=True)