# AUTOGENERATED! DO NOT EDIT! File to edit: ../Attention Classifier Pytorch Student.ipynb.

# %% auto 0
# Only names that are actually defined in this module are listed: the names
# `s`, `pred`, `tokens` and `attn` are assigned only inside a commented-out
# cell below, and listing undefined names makes `from <module> import *` raise
# AttributeError.
__all__ = ['tokenizer', 'vocab_size', 'embedding_size', 'num_heads', 'embeddings_fname', 'model', 'intf',
           'DocumentAttentionClassifier', 'get_label_and_weights', 'visualize_attention',
           'predict_and_visualize']

# %% ../Attention Classifier Pytorch Student.ipynb 2
import gradio as gr
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import json
import tempfile

np.random.seed(42)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

from collections import Counter
import random
from torch import optim

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer

# Attention plotting
import matplotlib.pyplot as plt

# %% ../Attention Classifier Pytorch Student.ipynb 4
# Load the word-to-index mapping we used for word2vec and use the same type
# of tokenizer. We need both so that text is tokenized the same way and the
# same word-to-id mapping is used at inference time.
tokenizer = RegexpTokenizer(r'\w+')

with open('word_to_index.json', 'r', encoding='utf-8') as f:
    word_to_index = json.load(f)

with open('index_to_word.json', 'r', encoding='utf-8') as f:
    index_to_word = json.load(f)


# %% ../Attention Classifier Pytorch Student.ipynb 6
class DocumentAttentionClassifier(nn.Module):
    '''
    Binary document classifier built from multi-head attention over word embeddings.

    Each attention head is one row of a trainable matrix. A head scores every
    token by its dot product with the token's embedding, the scores are
    softmax-normalized over the sequence, and the resulting weights are used
    to average the token embeddings. The per-head averages are laid side by
    side and passed through a single linear layer followed by a sigmoid.
    '''

    def __init__(self, vocab_size, embedding_size, num_heads, embeddings_fname):
        '''
        Creates the new classifier model.

        vocab_size and embedding_size give the shape of the word Embedding and
        must match the saved parameters. num_heads is the number of attention
        heads. embeddings_fname is a string containing the filename with the
        saved pytorch parameters for the Embedding object that should be used
        to initialize this class's word Embedding parameters. The file may
        hold either the raw weight tensor or a state dict with a 'weight' key.
        '''
        super(DocumentAttentionClassifier, self).__init__()

        # Save the input arguments to the state
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_heads = num_heads

        # Create the Embedding object that holds the word embeddings learned
        # in word2vec, so that we don't start from scratch.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)

        # map_location='cpu' keeps this loadable on a CPU-only machine even if
        # the parameters were saved from a GPU (the full-model checkpoint
        # below is loaded the same way).
        pretrained_embeddings = torch.load(embeddings_fname, map_location='cpu')
        if isinstance(pretrained_embeddings, dict):
            embedding_state = pretrained_embeddings
        else:
            embedding_state = {'weight': pretrained_embeddings}
        self.embeddings.load_state_dict(embedding_state)

        # The attention heads are rows of one matrix (just like word2vec
        # vectors), represented as a Tensor directly rather than a layer.
        # Wrapping it in nn.Parameter makes pytorch run gradient descent on
        # it, and lets forward() compute every head's scores with a single
        # matrix multiplication.
        self.attention_heads = nn.Parameter(torch.randn(num_heads, embedding_size))

        # Layer that goes from the concatenated attention heads' outputs to
        # the single output value, which is pushed through the sigmoid to get
        # the prediction.
        self.output_layer = nn.Linear(num_heads * embedding_size, 1)

    def forward(self, word_ids):
        '''
        Classifies a batch of documents.

        word_ids is an integer tensor of word ids with shape
        [batch_size, seq_len]. Returns a tuple of (1) the sigmoid predictions
        with shape [batch_size] and (2) the attention weights with shape
        [batch_size, seq_len, num_heads], which are needed later for
        visualization.
        '''
        # Get the word embeddings for the ids
        # [batch_size, seq_len, embedding_size]
        embeddings = self.embeddings(word_ids)

        # Calculate the 'r' vectors: the dot product of each attention head
        # with each word embedding, i.e. how relevant the head thinks each
        # token is for the task.
        # [batch_size, seq_len, num_heads]
        attention_scores = torch.matmul(embeddings, self.attention_heads.t())

        # Calculate the softmax of 'r', which we call 'a'. It is taken over
        # the seq_len dimension so that, for each head, the weights over all
        # the tokens sum to 1.
        # [batch_size, seq_len, num_heads]
        attention_weights = F.softmax(attention_scores, dim=1)

        # Re-weight the word embeddings by each head's attention weights and
        # sum over the sequence, giving one vector per head.
        # [batch_size, num_heads, embedding_size]
        attended_embeddings = torch.matmul(attention_weights.transpose(1, 2), embeddings)

        # Reshape the heads' vectors as a *view* so that they look like one
        # long vector per document; no new tensor is created.
        # [batch_size, num_heads * embedding_size]
        concatenated = attended_embeddings.view(attended_embeddings.size(0), -1)

        # Pass the side-by-side attention-weighted vectors through the linear
        # layer to get the output activation.
        # [batch_size, 1]
        output = self.output_layer(concatenated)

        # Sigmoid for binary classification; squeeze removes the extra dimension.
        # [batch_size]
        prediction = torch.sigmoid(output).squeeze(1)

        return prediction, attention_weights


# %% ../Attention Classifier Pytorch Student.ipynb 32
# Parameters for model initialization
vocab_size = len(word_to_index)  # Assuming word_to_index is defined
embedding_size = 50  # the size used in word2vec model
num_heads = 5  # number of attention heads
embeddings_fname = 'model_weights_target.pt'

model = DocumentAttentionClassifier(vocab_size, embedding_size, num_heads, embeddings_fname)

# %% ../Attention Classifier Pytorch Student.ipynb 33
model.load_state_dict(torch.load('attention_clf_model.pt', map_location='cpu'))


# %% ../Attention Classifier Pytorch Student.ipynb 34
def get_label_and_weights(text, model):
    '''
    Classifies the text (requires tokenizing, etc.) and returns (1) the
    classification label, (2) the tokenized words in the model's vocabulary,
    and (3) the attention weights over the in-vocab tokens as a numpy array.

    The attention weights are a matrix of shape (num_in_vocab_tokens, num_heads).
    The model still sees every token (out-of-vocabulary tokens are mapped to
    the unknown-word id); the rows for those positions are removed afterwards
    so that row i of the weights always belongs to tokens[i]. Because of that,
    a head's returned weights need not sum to 1.
    '''
    # NOTE(review): the key '' looks like it may originally have been an
    # angle-bracketed unknown-word token that was stripped from this file --
    # confirm against word_to_index.json.
    unk_id = word_to_index['']

    # Tokenize the text and map every token to its id
    raw_tokens = tokenizer.tokenize(text.lower())
    word_ids = [word_to_index.get(token, unk_id) for token in raw_tokens]

    device = 'cpu'
    model = model.to(device)
    # An explicit integer dtype keeps the tensor usable as Embedding indices
    # even when the text produced no tokens at all.
    token_ids_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

    # Forward pass through the model
    model.eval()
    with torch.no_grad():
        output, attention_weights = model(token_ids_tensor)

    # Convert output to label
    if output.item() > 0.5:
        predicted_label = "That's positive and so are you!"
    else:
        predicted_label = "Why so negative?"

    # Positions of the tokens that are in the vocabulary
    kept_positions = [pos for pos, word_id in enumerate(word_ids) if word_id != unk_id]

    # Convert token IDs back to tokens
    tokens = [index_to_word.get(str(word_ids[pos])) for pos in kept_positions]

    # (seq_len, num_heads) -> keep only the rows of the in-vocab tokens
    attention_weights_numpy = attention_weights.cpu().numpy().squeeze(0)
    attention_weights_numpy = attention_weights_numpy[np.array(kept_positions, dtype=int)]

    return predicted_label, tokens, attention_weights_numpy


# %% ../Attention Classifier Pytorch Student.ipynb 36
def visualize_attention(words, attention_weights, show=True):
    '''
    Makes a heatmap figure that visualizes the attention weights for an item
    and saves it as a PNG image.

    Attention weights should be a numpy array that has the shape
    (num_words, num_heads). If show is true the figure is also displayed with
    plt.show().

    Returns the path of the saved PNG file, or None when there are no words
    to plot.
    '''
    num_words = len(words)
    if num_words == 0:
        # Nothing to draw; a zero-width figure cannot be rendered.
        return None

    num_heads = attention_weights.shape[1]

    fig, ax = plt.subplots()

    # Rescale image size based on the input length
    fig.set_size_inches((num_words, 4))

    im = ax.imshow(attention_weights.T)

    ax.set_xticks(np.arange(num_words))
    ax.set_yticks(np.arange(num_heads))

    # Add the words (rotated, with their alignment set) and the axis labels
    ax.set_xticklabels(labels=words, fontsize=16, rotation=45, ha="right",
                       rotation_mode="anchor")
    ax.set_yticklabels(labels=range(num_heads), fontsize=16)
    ax.set_ylabel('Attention Head', fontsize=16)

    # Add a color bar to show probability scaling
    cb = fig.colorbar(im, ax=ax, label='Probability', pad=0.01)
    cb.ax.tick_params(labelsize=16)
    cb.set_label(label='Probability', size=16)

    fig.tight_layout()

    # Reserve a file name, then let matplotlib write to it once the handle
    # is closed. delete=False because the caller reads the file afterwards.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file:
        image_path = tmp_file.name
    fig.savefig(image_path)

    if show:
        plt.show()

    # Release the figure so that repeated calls do not accumulate open figures
    plt.close(fig)

    return image_path


# %% ../Attention Classifier Pytorch Student.ipynb 38
# s = 'Just as I remembered it, one of my favorites from childhood! Great condition, very happy to have this to share with my daughter. Packaging was so nice and was received quickly.'
# pred, tokens, attn = get_label_and_weights(s, model)
# visualize_attention(tokens, attn)
# print(pred)


# %% ../Attention Classifier Pytorch Student.ipynb 39
def predict_and_visualize(s):
    '''
    Classifies the string s with the module-level model.

    Returns the label text and the path of the attention heatmap image. The
    path is None when s contains no in-vocabulary tokens.
    '''
    pred, tokens, attn = get_label_and_weights(s, model)

    # show=False: only the saved image is needed here
    image_path = visualize_attention(tokens, attn, show=False)

    return pred, image_path


# %% ../Attention Classifier Pytorch Student.ipynb 40
intf = gr.Interface(fn=predict_and_visualize,
                    inputs="text",
                    outputs=["text", "image"],
                    examples=["The book was amazing!", "Today's Weather is pretty bad!", "How are you feeling"],
                    title="Text Review Classifier with Attention Visualization",
                    description="Enter a review to see the prediction and attention visualization.")
intf.launch(share=True)