Spaces:
Sleeping
Sleeping
| # AUTOGENERATED! DO NOT EDIT! File to edit: ../Attention Classifier Pytorch Student.ipynb. | |
| # %% auto 0 | |
# Public names exported by `from <module> import *`.
# 's', 'pred', 'tokens' and 'attn' are intentionally not listed: the demo cell
# that would assign them is commented out further down, and listing a name that
# the module never defines makes a star-import raise AttributeError.
# NOTE(review): this file is generated by nbdev; mirror the change in the notebook.
__all__ = ['tokenizer', 'vocab_size', 'embedding_size', 'num_heads', 'embeddings_fname', 'model',
           'intf', 'DocumentAttentionClassifier', 'get_label_and_weights', 'visualize_attention',
           'predict_and_visualize']
| # %% ../Attention Classifier Pytorch Student.ipynb 2 | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from torch.utils.data import Dataset, DataLoader | |
| import json | |
| np.random.seed(42) | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.nn import init | |
| from collections import Counter | |
| import random | |
| from torch import optim | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| # Sort of smart tokenization | |
| from nltk.tokenize import RegexpTokenizer | |
| # Attention plotting | |
| import matplotlib.pyplot as plt | |
| # %% ../Attention Classifier Pytorch Student.ipynb 4 | |
# Load the word-to-index mapping we used for word2vec and use the same type
# of tokenizer. We need both so that text is tokenized the same way and every
# word maps to the same id as during training.
tokenizer = RegexpTokenizer(r'\w+')

# JSON is UTF-8 by specification; pin the encoding so that loading the
# vocabulary does not depend on the host's locale default.
with open('word_to_index.json', 'r', encoding='utf-8') as f:
    word_to_index = json.load(f)

# JSON object keys are always strings, so this mapping is looked up with
# str(word_id) rather than with the integer id.
with open('index_to_word.json', 'r', encoding='utf-8') as f:
    index_to_word = json.load(f)
| # %% ../Attention Classifier Pytorch Student.ipynb 6 | |
class DocumentAttentionClassifier(nn.Module):
    """Binary document classifier built from multi-head attention over word embeddings.

    Each attention head is a single learned vector (one row of
    ``attention_heads``). A head scores every token by its dot product with
    the token's embedding, the scores are softmax-normalised over the
    sequence, and the resulting weights are used to average the word
    embeddings. The per-head averages are laid side by side and passed
    through one linear layer followed by a sigmoid.
    """

    def __init__(self, vocab_size, embedding_size, num_heads, embeddings_fname):
        """Create the classifier and initialise its word embeddings from disk.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_size: length of each word embedding (must match the
                saved embeddings).
            num_heads: number of attention heads.
            embeddings_fname: path to a file written with ``torch.save`` that
                holds either the embedding weight tensor itself or an
                ``nn.Embedding`` state dict (a mapping with a ``'weight'``
                entry).
        """
        super().__init__()

        # Save the input arguments to the state
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_heads = num_heads

        # The embedding table has the same size as the word2vec one, and we
        # start from the pre-trained values instead of from scratch.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        # map_location='cpu' lets a checkpoint saved on a GPU be read on a
        # CPU-only machine; load_state_dict copies the values into the
        # existing parameter, so the module's own device is unaffected.
        pretrained_embeddings = torch.load(embeddings_fname, map_location='cpu')
        if not isinstance(pretrained_embeddings, dict):
            # A bare weight tensor was saved: wrap it into a state dict.
            pretrained_embeddings = {'weight': pretrained_embeddings}
        self.embeddings.load_state_dict(pretrained_embeddings)

        # Attention heads are rows of one trainable matrix so that all heads
        # can be evaluated with matrix operations instead of one at a time.
        self.attention_heads = nn.Parameter(torch.randn(num_heads, embedding_size))

        # Maps the concatenated per-head vectors to the single output value
        # that is pushed through the sigmoid in forward().
        self.output_layer = nn.Linear(num_heads * embedding_size, 1)

    def forward(self, word_ids):
        """Classify a batch of documents.

        Args:
            word_ids: integer tensor of word ids, expected as
                ``[batch_size, seq_len]``.

        Returns:
            A tuple ``(prediction, attention_weights)`` where ``prediction``
            is the sigmoid output with shape ``[batch_size]`` and
            ``attention_weights`` has shape
            ``[batch_size, seq_len, num_heads]``; for every head the weights
            sum to 1 across the tokens of a document.
        """
        # Get the word embeddings for the ids
        embeddings = self.embeddings(word_ids)  # [batch_size, seq_len, embedding_size]

        # 'r' vectors: dot product of each attention head with each word
        # embedding, i.e. how relevant the head thinks each token is.
        attention_scores = torch.matmul(
            embeddings, self.attention_heads.unsqueeze(0).transpose(1, 2)
        )  # [batch_size, seq_len, num_heads]

        # 'a' vectors: softmax over the sequence dimension gives, per head, a
        # probability distribution over the tokens.
        attention_weights = F.softmax(attention_scores, dim=1)  # [batch_size, seq_len, num_heads]

        # Re-weight the word embeddings with each head's attention and sum
        # over the sequence: one embedding_size vector per head.
        attended_embeddings = torch.matmul(
            attention_weights.transpose(1, 2), embeddings
        )  # [batch_size, num_heads, embedding_size]

        # A view (no copy) that lays the heads' vectors out as one long vector
        concatenated = attended_embeddings.view(attended_embeddings.size(0), -1)
        # [batch_size, num_heads * embedding_size]

        output = self.output_layer(concatenated)  # [batch_size, 1]

        # Sigmoid for binary classification; squeeze drops the trailing dim
        prediction = torch.sigmoid(output).squeeze(1)  # [batch_size]

        # The attention weights are returned as well for later visualization
        return prediction, attention_weights
| # %% ../Attention Classifier Pytorch Student.ipynb 32 | |
# Hyper-parameters; they have to match the ones the saved weights were trained with.
vocab_size = len(word_to_index)  # one embedding row per vocabulary entry
embedding_size = 50  # the size used in word2vec model
num_heads = 5  # number of attention heads
embeddings_fname = 'model_weights_target.pt'

# Build the network (this also reads the pre-trained word embeddings).
model = DocumentAttentionClassifier(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    num_heads=num_heads,
    embeddings_fname=embeddings_fname,
)

# %% ../Attention Classifier Pytorch Student.ipynb 33
# Overwrite every parameter with the trained classifier checkpoint, mapped to CPU.
model.load_state_dict(
    torch.load('attention_clf_model.pt', map_location='cpu')
)
| # %% ../Attention Classifier Pytorch Student.ipynb 34 | |
def get_label_and_weights(text, model):
    '''
    Classifies the text (requires tokenizing, etc.) and returns (1) the classification label,
    (2) the tokenized words in the model's vocabulary,
    and (3) the attention weights over the in-vocab tokens as a numpy array. Note that the
    attention weights will be a matrix, depending on how many heads were used in training.

    The model still sees every token (out-of-vocabulary ones as '<UNK>'), so the
    prediction is computed on the full text. Only the *returned* tokens and
    attention rows are restricted to in-vocabulary positions, which keeps the two
    aligned: row i of the matrix belongs to tokens[i]. When '<UNK>' positions are
    removed, a head's remaining weights no longer sum to exactly 1.

    Returns:
        (label, tokens, weights) where label is a user-facing string, tokens is a
        list of words and weights is a numpy array of shape (len(tokens), num_heads).
    '''
    unk_id = word_to_index['<UNK>']

    # Tokenize the text and map every token to its id (or to <UNK>)
    raw_tokens = tokenizer.tokenize(text.lower())
    word_ids = [word_to_index.get(token, unk_id) for token in raw_tokens]

    device = 'cpu'
    model = model.to(device)
    # An explicit integer dtype keeps empty input a valid (1, 0) index tensor;
    # without it an empty list becomes a float tensor that cannot index embeddings.
    token_ids_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

    # Forward pass through the model
    model.eval()
    with torch.no_grad():
        output, attention_weights = model(token_ids_tensor)

    # Convert output to label
    if output.item() > 0.5:
        predicted_label = "That's positive and so are you!"
    else:
        predicted_label = "Why so negative?"

    # Positions of in-vocabulary tokens; used for both the words and the rows
    in_vocab = [pos for pos, wid in enumerate(word_ids) if wid != unk_id]

    # Convert token IDs back to tokens (the JSON mapping is keyed by strings)
    tokens = [index_to_word.get(str(word_ids[pos])) for pos in in_vocab]

    # (seq_len, num_heads) after dropping the batch dimension
    attention_weights_numpy = attention_weights.cpu().numpy().squeeze(0)
    keep_rows = np.asarray(in_vocab, dtype=np.intp)
    return predicted_label, tokens, attention_weights_numpy[keep_rows]
| # %% ../Attention Classifier Pytorch Student.ipynb 36 | |
def visualize_attention(words, attention_weights):
    '''
    Makes a heatmap figure that visualizes the attention weights for an item.
    Attention weights should be a numpy array that has the shape (num_words, num_heads)

    The figure is written to a temporary PNG file, shown with plt.show() (for
    notebook use) and then closed so that repeated calls do not accumulate open
    figures.

    Returns:
        The path of the saved PNG file as a string. The file is not deleted here;
        cleaning it up is left to the caller.
    '''
    # Local imports: only this function needs them.
    import os
    import tempfile

    num_heads = attention_weights.shape[1]

    fig, ax = plt.subplots()
    # Rescale image size based on the input length; a width of 0 is not a
    # valid figure size, so fall back to 1 inch for an empty word list.
    fig.set_size_inches((max(len(words), 1), 4))

    # Transposed so that heads run along the y axis and words along the x axis
    im = ax.imshow(attention_weights.T)

    # Add the words and axis labels
    ax.set_xticks(np.arange(len(words)))
    ax.set_yticks(np.arange(num_heads))
    ax.set_xticklabels(labels=words, fontsize=16)
    ax.set_yticklabels(labels=range(num_heads), fontsize=16)
    ax.set_ylabel('Attention Head', fontsize=16)

    # Rotate the word labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Add a color bar to show probability scaling
    cb = fig.colorbar(im, ax=ax, label='Probability', pad=0.01)
    cb.ax.tick_params(labelsize=16)
    cb.set_label(label='Probability', size=16)

    fig.tight_layout()

    # mkstemp only reserves a unique name; savefig reopens the path itself,
    # so the returned descriptor is closed right away.
    fd, image_path = tempfile.mkstemp(suffix='.png')
    os.close(fd)
    fig.savefig(image_path)

    plt.show()
    plt.close(fig)
    return image_path
| # %% ../Attention Classifier Pytorch Student.ipynb 38 | |
| # s = 'Just as I remembered it, one of my favorites from childhood! Great condition, very happy to have this to share with my daughter. Packaging was so nice and was received quickly.' | |
| # pred, tokens, attn = get_label_and_weights(s, model) | |
| # visualize_attention(tokens, attn) | |
| # print(pred) | |
| # %% ../Attention Classifier Pytorch Student.ipynb 39 | |
def predict_and_visualize(s):
    '''
    Gradio callback: classifies the string s with the module-level model and
    draws its attention heatmap.

    Returns a pair (label, image) where label comes from get_label_and_weights
    and image is whatever visualize_attention returns.
    NOTE(review): the "image" output only renders if visualize_attention returns
    an image or a file path -- confirm.
    '''
    label, words, weights = get_label_and_weights(s, model)
    return label, visualize_attention(words, weights)
| # %% ../Attention Classifier Pytorch Student.ipynb 40 | |
# Web UI: one text box in; the predicted label and the attention heatmap out.
intf = gr.Interface(
    fn=predict_and_visualize,
    inputs="text",
    outputs=["text", "image"],
    examples=[
        "The book was amazing!",
        "Today's Weather is pretty bad!",
        "How are you feeling",
    ],
    title="Text Review Classifier with Attention Visualization",
    description="Enter a review to see the prediction and attention visualization.",
)

# Starts the server as soon as this module is executed.
intf.launch(share=True)