Spaces:
Sleeping
Sleeping
File size: 13,354 Bytes
# AUTOGENERATED! DO NOT EDIT! File to edit: ../Attention Classifier Pytorch Student.ipynb.
# %% auto 0
# Names exported by a star-import of this module. Every entry must be bound at
# module level, otherwise the star-import raises AttributeError. The notebook
# scratch names 's', 'pred', 'tokens' and 'attn' are only assigned in
# commented-out code in this file, so they are not listed here.
__all__ = ['tokenizer', 'vocab_size', 'embedding_size', 'num_heads', 'embeddings_fname', 'model',
           'intf', 'DocumentAttentionClassifier', 'get_label_and_weights', 'visualize_attention',
           'predict_and_visualize']
# %% ../Attention Classifier Pytorch Student.ipynb 2
import gradio as gr
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import json
np.random.seed(42)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from collections import Counter
import random
from torch import optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer
# Attention plotting
import matplotlib.pyplot as plt
# %% ../Attention Classifier Pytorch Student.ipynb 4
# Load the word <-> index mappings and build a tokenizer that splits text into
# runs of word characters. Inference has to tokenize and index text the same
# way the embeddings were trained (presumably the earlier word2vec step --
# verify against that notebook), otherwise ids point at the wrong rows.
tokenizer = RegexpTokenizer(r'\w+')
# JSON text is UTF-8; pass the encoding explicitly so vocabulary entries with
# non-ASCII characters load the same way regardless of the platform default.
with open('word_to_index.json', 'r', encoding='utf-8') as f:
    word_to_index = json.load(f)
# NOTE: JSON object keys are always strings, so index_to_word is keyed by
# str(index), not by int.
with open('index_to_word.json', 'r', encoding='utf-8') as f:
    index_to_word = json.load(f)
# %% ../Attention Classifier Pytorch Student.ipynb 6
class DocumentAttentionClassifier(nn.Module):
    '''
    Binary document classifier built on pre-trained word embeddings.

    Each attention head is one row of a trainable matrix. A head scores every
    token by its dot product with the token's embedding, the scores are
    soft-maxed over the sequence, and the resulting weights pool the token
    embeddings into one vector per head. The per-head vectors are laid side by
    side and mapped by a linear layer to a single sigmoid probability.
    '''

    def __init__(self, vocab_size, embedding_size, num_heads, embeddings_fname):
        '''
        Creates the new classifier model.

        vocab_size: number of rows in the word-embedding table.
        embedding_size: length of each word embedding.
        num_heads: number of attention heads.
        embeddings_fname: filename of the saved pytorch parameters used to
            initialize the word Embedding. The file may hold either the
            Embedding's state dict (a dict with a 'weight' entry) or the bare
            weight tensor.
        '''
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_heads = num_heads

        # Word embeddings start from the saved parameters, not from scratch.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        # map_location='cpu' lets a file saved from a GPU run load on a
        # CPU-only host; load_state_dict copies the values into the existing
        # parameter, so the storage device of the loaded object is irrelevant.
        pretrained = torch.load(embeddings_fname, map_location='cpu')
        if not isinstance(pretrained, dict):
            # A bare weight tensor was saved: wrap it in the state-dict layout.
            pretrained = {'weight': pretrained}
        self.embeddings.load_state_dict(pretrained)

        # Attention heads as rows of a matrix, registered as a Parameter so
        # the optimizer updates them. Keeping them in one tensor lets forward()
        # score all heads with a single matrix multiplication.
        self.attention_heads = nn.Parameter(torch.randn(num_heads, embedding_size))

        # Maps the concatenated per-head vectors to one output activation.
        self.output_layer = nn.Linear(num_heads * embedding_size, 1)

    def forward(self, word_ids):
        '''
        Classifies a batch of documents.

        word_ids: integer tensor of word ids, shape [batch_size, seq_len].

        Returns a tuple (prediction, attention_weights):
            prediction: sigmoid outputs, shape [batch_size].
            attention_weights: each head's distribution over the tokens,
                shape [batch_size, seq_len, num_heads]; the values sum to 1
                along the seq_len dimension for every head.
        '''
        # [batch_size, seq_len, embedding_size]
        embeddings = self.embeddings(word_ids)

        # 'r' vectors: dot product of every head with every word embedding.
        # [batch_size, seq_len, num_heads]
        attention_scores = torch.matmul(embeddings, self.attention_heads.t())

        # 'a' vectors: softmax over the tokens (dim=1), independently per head.
        # [batch_size, seq_len, num_heads]
        attention_weights = F.softmax(attention_scores, dim=1)

        # Attention-weighted sum of the word embeddings for each head.
        # [batch_size, num_heads, embedding_size]
        attended_embeddings = torch.matmul(attention_weights.transpose(1, 2), embeddings)

        # Lay the heads' vectors side by side as one long vector per document.
        # [batch_size, num_heads * embedding_size]
        concatenated = attended_embeddings.reshape(attended_embeddings.size(0), -1)

        # [batch_size, 1] -> sigmoid -> [batch_size]
        output = self.output_layer(concatenated)
        prediction = torch.sigmoid(output).squeeze(1)

        # The attention weights are returned as well; they are needed for
        # visualization.
        return prediction, attention_weights
# %% ../Attention Classifier Pytorch Student.ipynb 32
# Hyperparameters for rebuilding the classifier. They have to agree with the
# checkpoints loaded below, otherwise load_state_dict rejects the shapes.
vocab_size = len(word_to_index)  # one embedding row per vocabulary entry
embedding_size = 50  # the size used in the word2vec model
num_heads = 5  # number of attention heads
embeddings_fname = 'model_weights_target.pt'
model = DocumentAttentionClassifier(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    num_heads=num_heads,
    embeddings_fname=embeddings_fname,
)
# %% ../Attention Classifier Pytorch Student.ipynb 33
# Restore the trained classifier parameters; map_location='cpu' keeps the
# checkpoint loadable on a machine without a GPU.
model.load_state_dict(torch.load('attention_clf_model.pt', map_location='cpu'))
# %% ../Attention Classifier Pytorch Student.ipynb 34
def get_label_and_weights(text, model):
    '''
    Classifies the text (requires tokenizing, etc.) and returns (1) the classification label,
    (2) the tokenized words in the model's vocabulary,
    and (3) the attention weights over the in-vocab tokens as a numpy array. Note that the
    attention weights will be a matrix, depending on how many heads were used in training.

    The text is lower-cased and tokenized with the module-level tokenizer.
    Out-of-vocabulary tokens are fed to the model as '<UNK>' but are left out
    of both the returned tokens and the returned weights, so row i of the
    weight matrix always belongs to tokens[i]. The matrix has shape
    (num_in_vocab_tokens, num_heads); when '<UNK>' rows were dropped, a head's
    column no longer sums to 1.

    model is expected to return (prediction, attention_weights) for a
    [1, seq_len] tensor of word ids, with attention_weights shaped
    [1, seq_len, num_heads]. The model is moved to the CPU and put in eval
    mode as a side effect.
    '''
    unk_id = word_to_index['<UNK>']
    word_ids = [word_to_index.get(token, unk_id)
                for token in tokenizer.tokenize(text.lower())]

    device = 'cpu'
    model = model.to(device)
    # An explicit integer dtype keeps the tensor a valid index tensor even
    # when the text yields no tokens (an empty list would default to float).
    token_ids_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

    # Forward pass through the model, without dropout or gradient tracking
    model.eval()
    with torch.no_grad():
        output, attention_weights = model(token_ids_tensor)

    # Convert output to label
    if output.item() > 0.5:
        predicted_label = "That's positive and so are you!"
    else:
        predicted_label = "Why so negative?"

    # Positions of the tokens the vocabulary knows. The same positions select
    # both the words and the attention rows so the two stay aligned.
    known_positions = [pos for pos, wid in enumerate(word_ids) if wid != unk_id]
    # index_to_word comes from JSON, so its keys are strings
    tokens = [index_to_word.get(str(word_ids[pos])) for pos in known_positions]

    # [1, seq_len, num_heads] -> [seq_len, num_heads] -> in-vocab rows only
    attention_weights_numpy = attention_weights.cpu().numpy().squeeze(0)
    return predicted_label, tokens, attention_weights_numpy[known_positions]
# %% ../Attention Classifier Pytorch Student.ipynb 36
def visualize_attention(words, attention_weights):
    '''
    Makes a heatmap figure that visualizes the attention weights for an item.
    Attention weights should be a numpy array that has the shape (num_words, num_heads)

    The heatmap has one column per word and one row per attention head. The
    figure is written to a temporary PNG file, shown with plt.show(), and then
    closed. Returns the path of the PNG file as a string; the file is not
    deleted automatically, so the caller owns it.
    '''
    import tempfile  # local import: only this function needs it

    num_heads = attention_weights.shape[1]

    fig, ax = plt.subplots()
    # Rescale image size based on the input length (never zero-width)
    fig.set_size_inches((max(len(words), 1), 4))

    # Transpose so that heads run down the y axis and words along the x axis
    im = ax.imshow(attention_weights.T)

    # One tick per word, labelled with the word itself and rotated so that
    # neighbouring labels do not overlap
    ax.set_xticks(np.arange(len(words)))
    ax.set_xticklabels(labels=words, fontsize=16, rotation=45, ha="right",
                       rotation_mode="anchor")

    # One tick per head, labelled with the head index
    ax.set_yticks(np.arange(num_heads))
    ax.set_yticklabels(labels=list(range(num_heads)), fontsize=16)
    ax.set_ylabel('Attention Head', fontsize=16)

    # Add a color bar to show probability scaling
    cb = fig.colorbar(im, ax=ax, label='Probability', pad=0.01)
    cb.ax.tick_params(labelsize=16)
    cb.set_label(label='Probability', size=16)

    fig.tight_layout()

    # Reserve a file name, then let matplotlib write the image to it
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as handle:
        image_path = handle.name
    fig.savefig(image_path)

    plt.show()
    # Close the figure explicitly: pyplot keeps every open figure alive, which
    # would grow memory with each call in a long-running process.
    plt.close(fig)
    return image_path
# %% ../Attention Classifier Pytorch Student.ipynb 38
# s = 'Just as I remembered it, one of my favorites from childhood! Great condition, very happy to have this to share with my daughter. Packaging was so nice and was received quickly.'
# pred, tokens, attn = get_label_and_weights(s, model)
# visualize_attention(tokens, attn)
# print(pred)
# %% ../Attention Classifier Pytorch Student.ipynb 39
def predict_and_visualize(s):
    '''
    Callback for the Gradio interface: classifies the text s with the
    module-level model and plots its attention weights.

    Returns a tuple (label, image) where label is the label string from
    get_label_and_weights and image is whatever visualize_attention returns
    for the tokens and weights; the interface uses it as its image output.
    '''
    label, words, weights = get_label_and_weights(s, model)
    return label, visualize_attention(words, weights)
# %% ../Attention Classifier Pytorch Student.ipynb 40
# Gradio front end: one text box in; the predicted label (text) and the
# attention heatmap (image) out, both produced by predict_and_visualize.
intf = gr.Interface(
    fn=predict_and_visualize,
    inputs="text",
    outputs=["text", "image"],
    title="Text Review Classifier with Attention Visualization",
    description="Enter a review to see the prediction and attention visualization.",
    examples=[
        "The book was amazing!",
        "Today's Weather is pretty bad!",
        "How are you feeling",
    ],
)
# Starts the web server as soon as this module runs; share=True additionally
# requests a public share link.
intf.launch(share=True)
|