File size: 13,354 Bytes
97364d6
 
 
 
 
 
 
 
0991538
97364d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1160b0d
97364d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0012aca
97364d6
 
 
 
 
 
 
 
 
 
 
ebc7672
 
97364d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebc7672
 
 
 
97364d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d2ca7a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# AUTOGENERATED! DO NOT EDIT! File to edit: ../Attention Classifier Pytorch Student.ipynb.

# %% auto 0
# Public names exported by this module. Only names that are actually assigned
# at module level are listed: the demo variables 's', 'pred', 'tokens' and
# 'attn' come from a cell that is commented out, and exporting undefined names
# makes `from <module> import *` raise AttributeError.
__all__ = ['tokenizer', 'vocab_size', 'embedding_size', 'num_heads', 'embeddings_fname', 'model',
           'intf', 'DocumentAttentionClassifier', 'get_label_and_weights', 'visualize_attention',
           'predict_and_visualize']

# %% ../Attention Classifier Pytorch Student.ipynb 2
import gradio as gr
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import json

np.random.seed(42)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from collections import Counter
import random
from torch import optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer

# Attention plotting
import matplotlib.pyplot as plt

# %% ../Attention Classifier Pytorch Student.ipynb 4
# Load the word-to-index mapping we used for word2vec and use the same type
# of tokenizer. We'll need to use this to tokenize in the same way and keep 
# the same word-to-id mapping

# Word-level tokenizer: each maximal run of word characters becomes one token.
tokenizer = RegexpTokenizer(r'\w+')

# Vocabulary lookups. JSON object keys are always strings, so index_to_word is
# keyed by the *string* form of each id. The encoding is pinned to UTF-8 so the
# vocabulary is read the same way regardless of the platform's default locale.
with open('word_to_index.json', 'r', encoding='utf-8') as f:
    word_to_index = json.load(f)
with open('index_to_word.json', 'r', encoding='utf-8') as f:
    index_to_word = json.load(f)

# %% ../Attention Classifier Pytorch Student.ipynb 6
class DocumentAttentionClassifier(nn.Module):
    '''
    Binary document classifier that pools word embeddings with several
    learned attention heads.

    Each head is a single vector of length embedding_size. A head scores every
    token by its dot product with the token's embedding, the scores are
    softmax-normalised over the tokens, and the resulting weights are used to
    average the token embeddings. The per-head averages are laid side by side
    and fed to one linear unit followed by a sigmoid.
    '''

    def __init__(self, vocab_size, embedding_size, num_heads, embeddings_fname):
        '''
        Creates the new classifier model.

        vocab_size and embedding_size give the shape of the word Embedding
        table, and num_heads is the number of attention heads.

        embeddings_fname is the filename of the saved (word2vec-based)
        parameters used to initialise the word Embedding. The file may hold
        either the bare weight tensor or a state dict for an Embedding (a
        mapping with a 'weight' entry).
        '''
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_heads = num_heads

        # Start from the pre-trained embeddings rather than from scratch.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        # map_location='cpu' keeps a file that was saved from a GPU loadable
        # on a CPU-only machine.
        pretrained = torch.load(embeddings_fname, map_location='cpu')
        if isinstance(pretrained, dict):
            embedding_state = pretrained
        else:
            embedding_state = {'weight': pretrained}
        self.embeddings.load_state_dict(embedding_state)

        # Attention heads stored as the rows of one trainable matrix, so all
        # heads are evaluated with a single matrix multiplication in forward().
        self.attention_heads = nn.Parameter(torch.randn(num_heads, embedding_size))

        # Maps the concatenated per-head vectors to a single output activation.
        self.output_layer = nn.Linear(num_heads * embedding_size, 1)

    def forward(self, word_ids):
        '''
        Scores a batch of documents.

        word_ids is an integer tensor of shape [batch_size, seq_len].

        Returns a tuple (prediction, attention_weights):
          * prediction has shape [batch_size] and holds the sigmoid output.
          * attention_weights has shape [batch_size, seq_len, num_heads]; for
            each document and head the weights sum to 1 over the tokens.

        No padding mask is applied, so every position in word_ids takes part
        in the softmax.
        '''
        # [batch_size, seq_len, embedding_size]
        word_embeds = self.embeddings(word_ids)

        # 'r' vectors: dot product of every head with every word embedding.
        # [batch_size, seq_len, num_heads]
        head_matrix = self.attention_heads.unsqueeze(0).transpose(1, 2)
        attention_scores = torch.matmul(word_embeds, head_matrix)

        # 'a' vectors: softmax over the token axis (dim=1), giving each head a
        # probability distribution over the tokens of a document.
        attention_weights = F.softmax(attention_scores, dim=1)

        # Attention-weighted sum of the word embeddings, one vector per head.
        # [batch_size, num_heads, embedding_size]
        attended = torch.matmul(attention_weights.transpose(1, 2), word_embeds)

        # Lay the heads side by side as one long vector per document.
        # [batch_size, num_heads * embedding_size]
        concatenated = attended.reshape(attended.size(0), -1)

        # [batch_size, 1] -> sigmoid -> [batch_size]
        output = self.output_layer(concatenated)
        prediction = torch.sigmoid(output).squeeze(1)

        # The weights are returned as well so callers can visualise them.
        return prediction, attention_weights

# %% ../Attention Classifier Pytorch Student.ipynb 32
# Parameters for model initialization
embeddings_fname = 'model_weights_target.pt'
num_heads = 5  # number of attention heads
embedding_size = 50  # the size used in word2vec model
vocab_size = len(word_to_index)  # one embedding row per vocabulary entry
model = DocumentAttentionClassifier(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    num_heads=num_heads,
    embeddings_fname=embeddings_fname,
)


# %% ../Attention Classifier Pytorch Student.ipynb 33
# Replace the freshly initialised parameters with the saved classifier
# checkpoint, mapping its tensors onto the CPU.
model.load_state_dict(torch.load('attention_clf_model.pt', map_location='cpu'))


# %% ../Attention Classifier Pytorch Student.ipynb 34
def get_label_and_weights(text, model):
    '''
    Classifies the text (requires tokenizing, etc.) and returns a 3-tuple:

    (1) the classification label as a display string,
    (2) the tokens of the text that are in the model's vocabulary, and
    (3) the attention weights over those in-vocab tokens as a numpy array of
        shape (num_in_vocab_tokens, num_heads).

    Out-of-vocabulary tokens are still fed to the model as '<UNK>', so the
    prediction is made on the full text; only their rows are dropped from the
    returned weights, which keeps row i of the weights aligned with token i.
    Because of that, a head's returned weights can sum to less than 1 when the
    text contains out-of-vocabulary words.
    '''
    unk_id = word_to_index['<UNK>']

    # Lower-case and tokenize the text, mapping unknown words to '<UNK>'.
    word_ids = [word_to_index.get(token, unk_id)
                for token in tokenizer.tokenize(text.lower())]

    device = 'cpu'
    model = model.to(device)
    # Explicit integer dtype: the ids index an embedding table, and an empty
    # id list would otherwise be turned into a float tensor.
    word_ids_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

    # Forward pass without gradient tracking.
    model.eval()
    with torch.no_grad():
        output, attention_weights = model(word_ids_tensor)

    if output.item() > 0.5:
        predicted_label = "That's positive and so are you!"
    else:
        predicted_label = "Why so negative?"

    # Drop the batch axis: (seq_len, num_heads).
    weights = attention_weights.cpu().numpy().squeeze(0)

    # Keep only the in-vocab positions, in both the tokens and the weights.
    in_vocab = np.array([wid != unk_id for wid in word_ids], dtype=bool)
    tokens = [index_to_word.get(str(wid)) for wid in word_ids if wid != unk_id]

    return predicted_label, tokens, weights[in_vocab]

# %% ../Attention Classifier Pytorch Student.ipynb 36
def visualize_attention(words, attention_weights):
    '''
    Makes a heatmap figure that visualizes the attention weights for an item,
    saves it as a PNG file and returns the path of that file.

    Attention weights should be a numpy array that has the shape
    (num_words, num_heads); words supplies the x-axis labels. The heatmap has
    one row per head and one column per word.

    The PNG is written to a new temporary file that is not deleted
    automatically; the caller owns it.
    '''
    import tempfile

    num_heads = attention_weights.shape[1]

    fig, ax = plt.subplots()
    # Rescale the image width with the input length, but keep it at least one
    # inch wide so an empty word list does not produce a zero-width figure.
    fig.set_size_inches((max(len(words), 1), 4))
    im = ax.imshow(attention_weights.T)

    ax.set_xticks(np.arange(len(words)))
    ax.set_yticks(np.arange(num_heads))

    # Word labels along x (rotated so long words do not overlap) and head
    # indices along y.
    ax.set_xticklabels(labels=words, fontsize=16, rotation=45, ha="right",
                       rotation_mode="anchor")
    ax.set_yticklabels(labels=range(num_heads), fontsize=16)
    ax.set_ylabel('Attention Head', fontsize=16)

    # Add a color bar to show probability scaling
    cb = fig.colorbar(im, ax=ax, pad=0.01)
    cb.ax.tick_params(labelsize=16)
    cb.set_label(label='Probability', size=16)
    fig.tight_layout()

    # Reserve a file name, then let matplotlib write the image to it.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
        image_path = tmp.name
    fig.savefig(image_path)

    plt.show()
    # Release the figure; pyplot otherwise keeps every figure alive, which
    # accumulates when this is called once per request.
    plt.close(fig)

    return image_path

# %% ../Attention Classifier Pytorch Student.ipynb 38
# s = 'Just as I remembered it, one of my favorites from childhood! Great condition, very happy to have this to share with my daughter. Packaging was so nice and was received quickly.'
# pred, tokens, attn = get_label_and_weights(s, model)
# visualize_attention(tokens, attn)
# print(pred)

# %% ../Attention Classifier Pytorch Student.ipynb 39
def predict_and_visualize(s):
    '''
    Callback for the Gradio interface: classifies the text s and renders its
    attention heatmap.

    Returns a 2-tuple of the label string produced by get_label_and_weights
    and whatever visualize_attention returns for the tokens and weights.
    '''
    label, words, weights = get_label_and_weights(s, model)
    return label, visualize_attention(words, weights)


# %% ../Attention Classifier Pytorch Student.ipynb 40

# Demo UI: one text box in; the predicted label and the attention image out.
intf = gr.Interface(
    fn=predict_and_visualize,
    title="Text Review Classifier with Attention Visualization",
    description="Enter a review to see the prediction and attention visualization.",
    inputs="text",
    outputs=["text", "image"],
    examples=[
        "The book was amazing!",
        "Today's Weather is pretty bad!",
        "How are you feeling",
    ],
)

# Starts the app as soon as this module is executed; share=True additionally
# requests a public Gradio share link.
intf.launch(share=True)