Divyam Sharma
public Link generation
0d2ca7a
# AUTOGENERATED! DO NOT EDIT! File to edit: ../Attention Classifier Pytorch Student.ipynb.
# %% auto 0
# Public names exported by this module. Every entry must be bound at module
# level, otherwise `from <module> import *` raises AttributeError. The entries
# 's', 'pred', 'tokens' and 'attn' were dropped because the example cell that
# would define them is commented out further down in this file.
# NOTE: this file is generated from the notebook; mirror the change there too.
__all__ = ['tokenizer', 'vocab_size', 'embedding_size', 'num_heads', 'embeddings_fname', 'model',
           'intf', 'DocumentAttentionClassifier', 'get_label_and_weights', 'visualize_attention',
           'predict_and_visualize']
# %% ../Attention Classifier Pytorch Student.ipynb 2
import gradio as gr
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import json
np.random.seed(42)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from collections import Counter
import random
from torch import optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer
# Attention plotting
import matplotlib.pyplot as plt
# %% ../Attention Classifier Pytorch Student.ipynb 4
# Load the word-to-index mapping we used for word2vec and use the same type
# of tokenizer. We'll need to use this to tokenize in the same way and keep
# the same word-to-id mapping
# Tokens are maximal runs of word characters; punctuation and whitespace are
# discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Vocabulary lookups read from JSON files in the working directory.
# word_to_index is queried by token string and index_to_word by the string
# form of an id elsewhere in this file. JSON text is UTF-8, so the encoding is
# pinned here instead of depending on the platform's default.
with open('word_to_index.json', 'r', encoding='utf-8') as f:
    word_to_index = json.load(f)
with open('index_to_word.json', 'r', encoding='utf-8') as f:
    index_to_word = json.load(f)
# %% ../Attention Classifier Pytorch Student.ipynb 6
class DocumentAttentionClassifier(nn.Module):
    '''
    Binary document classifier that pools word embeddings with several
    learned attention heads.

    Every head is a single vector of length embedding_size. Its dot product
    with each word embedding scores that word; a softmax over the words of a
    document turns the scores into weights, and the weighted sum of the word
    embeddings is the head's summary of the document. The summaries of all
    heads are placed side by side and mapped by one linear layer to a single
    activation, which is squashed by a sigmoid.
    '''

    def __init__(self, vocab_size, embedding_size, num_heads, embeddings_fname):
        '''
        Creates the new classifier model.

        vocab_size: number of rows of the word embedding table.
        embedding_size: length of each word embedding (and of each head).
        num_heads: number of attention heads.
        embeddings_fname: file readable by torch.load that holds the
            pre-trained (word2vec-based) embedding weights, either as a bare
            weight tensor or as an Embedding state dict with a 'weight' entry.
        '''
        super().__init__()
        # Save the input arguments to the state
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_heads = num_heads

        # Word embeddings start from the pre-trained values instead of from
        # scratch. map_location='cpu' lets a file written on a GPU machine be
        # read on a CPU-only host; load_state_dict copies the values into the
        # freshly created parameter either way.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        pretrained_embeddings = torch.load(embeddings_fname, map_location='cpu')
        if isinstance(pretrained_embeddings, dict) and 'weight' in pretrained_embeddings:
            # The file stored a state dict rather than the bare tensor
            pretrained_embeddings = pretrained_embeddings['weight']
        self.embeddings.load_state_dict({'weight': pretrained_embeddings})

        # Attention heads are the rows of one matrix, registered as a
        # Parameter so that gradient descent updates them. Keeping them in a
        # single tensor lets forward() score all heads with one matmul.
        self.attention_heads = nn.Parameter(torch.randn(num_heads, embedding_size))

        # Layer that goes from the concatenated heads' outputs to the single
        # output value that is pushed through the sigmoid
        self.output_layer = nn.Linear(num_heads * embedding_size, 1)

    def forward(self, word_ids):
        '''
        Scores a batch of documents.

        word_ids: integer tensor of word ids, [batch_size, seq_len].

        Returns (prediction, attention_weights): prediction holds the sigmoid
        outputs, [batch_size]; attention_weights holds each head's
        distribution over the positions, [batch_size, seq_len, num_heads].
        No mask is applied, so every position in word_ids takes part in the
        softmax.
        '''
        # Get the word embeddings for the ids
        embeddings = self.embeddings(word_ids)  # [batch_size, seq_len, embedding_size]

        # 'r' vectors: dot product of each attention head with each word
        # embedding, i.e. how relevant the head finds each word
        attention_scores = torch.matmul(
            embeddings, self.attention_heads.unsqueeze(0).transpose(1, 2)
        )  # [batch_size, seq_len, num_heads]

        # 'a' vectors: softmax over the sequence dimension, so each head's
        # weights sum to 1 across the tokens of a document
        attention_weights = F.softmax(attention_scores, dim=1)  # [batch_size, seq_len, num_heads]

        # Re-weight the word embeddings per head and sum over the sequence
        attended_embeddings = torch.matmul(
            attention_weights.transpose(1, 2), embeddings
        )  # [batch_size, num_heads, embedding_size]

        # Lay the heads' vectors side by side as one long vector per document.
        # reshape returns a view when the memory layout allows it and only
        # copies otherwise.
        concatenated = attended_embeddings.reshape(
            attended_embeddings.size(0), -1
        )  # [batch_size, num_heads * embedding_size]

        # Output activation followed by the sigmoid for binary classification
        output = self.output_layer(concatenated)  # [batch_size, 1]
        prediction = torch.sigmoid(output).squeeze(1)  # [batch_size]

        # The attention weights are returned as well for later visualization
        return prediction, attention_weights
# %% ../Attention Classifier Pytorch Student.ipynb 32
# Parameters for model initialization
# Hyperparameters handed to the classifier constructor
vocab_size = len(word_to_index)  # one embedding row per entry of the word-to-index map
embedding_size = 50  # the size used in word2vec model
num_heads = 5  # number of attention heads
embeddings_fname = 'model_weights_target.pt'
model = DocumentAttentionClassifier(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    num_heads=num_heads,
    embeddings_fname=embeddings_fname,
)
# %% ../Attention Classifier Pytorch Student.ipynb 33
# Load the saved classifier parameters into the model, mapping them onto the CPU
model.load_state_dict(torch.load('attention_clf_model.pt', map_location='cpu'))
# %% ../Attention Classifier Pytorch Student.ipynb 34
def get_label_and_weights(text, model):
    '''
    Classifies the text (requires tokenizing, etc.) and returns (1) the classification label
    as a display string, (2) the tokenized words in the model's vocabulary,
    and (3) the attention weights over the in-vocab tokens as a numpy array of shape
    (num_in_vocab_tokens, num_heads).

    The model still receives every token, with out-of-vocabulary tokens mapped to <UNK>.
    The <UNK> positions are removed from both the token list and the attention rows, so
    row i of the returned weights always belongs to token i. When positions were removed,
    a head's returned weights sum to less than 1.
    '''
    unk_id = word_to_index['<UNK>']

    # Lower-case, tokenize and map every token to its id (<UNK> when unknown)
    word_ids = [word_to_index.get(token, unk_id) for token in tokenizer.tokenize(text.lower())]

    device = 'cpu'
    model = model.to(device)
    # The dtype is given explicitly: without it an empty id list turns into a
    # float tensor, which the embedding lookup rejects.
    word_ids_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

    # Forward pass through the model
    model.eval()
    with torch.no_grad():
        output, attention_weights = model(word_ids_tensor)

    # Convert output to label
    if output.item() > 0.5:
        predicted_label = "That's positive and so are you!"
    else:
        predicted_label = "Why so negative?"

    # Drop the batch dimension: (num_tokens, num_heads)
    attention = attention_weights.cpu().numpy().squeeze(0)

    # Keep only the in-vocabulary positions, using the same mask for the
    # words and for the attention rows so the two stay aligned
    in_vocab = np.array([wid != unk_id for wid in word_ids], dtype=bool)
    tokens = [index_to_word.get(str(wid)) for wid in word_ids if wid != unk_id]
    return predicted_label, tokens, attention[in_vocab]
# %% ../Attention Classifier Pytorch Student.ipynb 36
def visualize_attention(words, attention_weights):
    '''
    Makes a heatmap figure that visualizes the attention weights for an item.
    Attention weights should be a numpy array that has the shape (num_words, num_heads)

    The figure is written to a temporary PNG file, shown with plt.show() and
    then closed. Returns the path of the PNG file; the file is not deleted
    here, so the caller decides when to remove it.
    '''
    import tempfile  # local import: only needed for the saved image

    num_heads = attention_weights.shape[1]

    fig, ax = plt.subplots()
    # Rescale image size based on the input length; at least one inch wide so
    # an empty word list does not ask for a zero-width figure
    fig.set_size_inches((max(len(words), 1), 4))

    # Transposed so that heads are rows and words are columns
    im = ax.imshow(attention_weights.T)

    ax.set_xticks(np.arange(len(words)))
    ax.set_yticks(np.arange(num_heads))

    # Add the words (rotated and anchored at their end) and the axis labels
    ax.set_xticklabels(labels=words, fontsize=16, rotation=45, ha="right",
                       rotation_mode="anchor")
    ax.set_yticklabels(labels=range(num_heads), fontsize=16)
    ax.set_ylabel('Attention Head', fontsize=16)

    # Add a color bar to show probability scaling
    cb = fig.colorbar(im, ax=ax, pad=0.01)
    cb.ax.tick_params(labelsize=16)
    cb.set_label(label='Probability', size=16)

    fig.tight_layout()

    # Reserve a file name, then let matplotlib write the image to it. Saving
    # happens before plt.show() because some backends discard the figure once
    # it has been shown.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
        image_path = tmp.name
    fig.savefig(image_path)

    plt.show()
    # Close the figure explicitly so repeated calls do not pile up open figures
    plt.close(fig)
    return image_path
# %% ../Attention Classifier Pytorch Student.ipynb 38
# s = 'Just as I remembered it, one of my favorites from childhood! Great condition, very happy to have this to share with my daughter. Packaging was so nice and was received quickly.'
# pred, tokens, attn = get_label_and_weights(s, model)
# visualize_attention(tokens, attn)
# print(pred)
# %% ../Attention Classifier Pytorch Student.ipynb 39
def predict_and_visualize(s):
    '''
    Classifies the string s with the module-level model and builds the
    attention visualization for it.

    Returns a pair: the label from get_label_and_weights, and whatever
    visualize_attention returns for the tokens and attention weights (the
    interface below expects an image there).
    '''
    label, words, weights = get_label_and_weights(s, model)
    return label, visualize_attention(words, weights)
# %% ../Attention Classifier Pytorch Student.ipynb 40
# Gradio UI: a single text box in, the predicted label and the attention image out
intf = gr.Interface(
    fn=predict_and_visualize,
    title="Text Review Classifier with Attention Visualization",
    description="Enter a review to see the prediction and attention visualization.",
    inputs="text",
    outputs=["text", "image"],
    examples=["The book was amazing!", "Today's Weather is pretty bad!", "How are you feeling"],
)
# Start the app when this module is executed; share=True requests a public link
intf.launch(share=True)