Spaces:

DdIiVvYyAaMm
/

TextSentenceClassifier

Sleeping

Divyam Sharma

public Link generation

0d2ca7a about 2 years ago

13.4 kB

	# AUTOGENERATED! DO NOT EDIT! File to edit: ../Attention Classifier Pytorch Student.ipynb.

	# %% auto 0
	# Public names of this module. 's', 'pred', 'tokens' and 'attn' are left out
	# on purpose: the only cell that assigns them is commented out below, and
	# listing names that do not exist makes `from <module> import *` raise
	# AttributeError.
	__all__ = ['tokenizer', 'vocab_size', 'embedding_size', 'num_heads', 'embeddings_fname', 'model',
	           'intf', 'DocumentAttentionClassifier', 'get_label_and_weights', 'visualize_attention',
	           'predict_and_visualize']

	# %% ../Attention Classifier Pytorch Student.ipynb 2
	import gradio as gr
	import numpy as np
	import torch
	from torch.utils.data import Dataset, DataLoader
	import json

	np.random.seed(42)
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.nn import init
	from collections import Counter
	import random
	from torch import optim
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Sort of smart tokenization
	from nltk.tokenize import RegexpTokenizer

	# Attention plotting
	import matplotlib.pyplot as plt

	# %% ../Attention Classifier Pytorch Student.ipynb 4
	# Load the word-to-index mapping we used for word2vec and use the same type
	# of tokenizer. We'll need to use this to tokenize in the same way and keep
	# the same word-to-id mapping

	# Split on runs of word characters; punctuation and whitespace are dropped.
	tokenizer = RegexpTokenizer(r'\w+')

	# Vocabulary lookups in both directions. The encoding is given explicitly so
	# that loading does not depend on the platform's default locale encoding.
	# Keys of index_to_word are strings because JSON object keys are always strings.
	with open('word_to_index.json', 'r', encoding='utf-8') as f:
	    word_to_index = json.load(f)
	with open('index_to_word.json', 'r', encoding='utf-8') as f:
	    index_to_word = json.load(f)

	# %% ../Attention Classifier Pytorch Student.ipynb 6
	class DocumentAttentionClassifier(nn.Module):
	    '''
	    Binary document classifier that pools word embeddings with several
	    attention heads.

	    Each head is one row of a trainable (num_heads, embedding_size) matrix.
	    A head scores every token by a dot product with the token's embedding,
	    the scores are softmax-normalised over the sequence, and the resulting
	    weights are used to average the token embeddings. The per-head averages
	    are laid side by side and mapped to a single sigmoid output.
	    '''

	    def __init__(self, vocab_size, embedding_size, num_heads, embeddings_fname):
	        '''
	        Creates the new classifier model.

	        vocab_size: number of rows in the word embedding table.
	        embedding_size: length of each word embedding (and of each head).
	        num_heads: number of attention heads.
	        embeddings_fname: file readable by torch.load that holds the
	            pre-trained word embeddings, either as a state dict for an
	            nn.Embedding (a mapping with a 'weight' entry) or as the bare
	            weight tensor.
	        '''
	        super(DocumentAttentionClassifier, self).__init__()

	        # Save the input arguments to the state
	        self.vocab_size = vocab_size
	        self.embedding_size = embedding_size
	        self.num_heads = num_heads

	        # Word embeddings start from the pre-trained values rather than from
	        # scratch. map_location='cpu' lets a file that was saved from a GPU
	        # be read on a machine without one.
	        self.embeddings = nn.Embedding(vocab_size, embedding_size)
	        pretrained_embeddings = torch.load(embeddings_fname, map_location='cpu')
	        if not isinstance(pretrained_embeddings, dict):
	            # A bare weight tensor was saved; wrap it into a state dict.
	            pretrained_embeddings = {'weight': pretrained_embeddings}
	        self.embeddings.load_state_dict(pretrained_embeddings)

	        # Attention heads as rows of one matrix, registered as a Parameter so
	        # that the optimizer updates them and forward() can score all heads
	        # with a single matrix product.
	        self.attention_heads = nn.Parameter(torch.randn(num_heads, embedding_size))

	        # Maps the concatenated per-head vectors to one output activation.
	        self.output_layer = nn.Linear(num_heads * embedding_size, 1)

	    def forward(self, word_ids):
	        '''
	        Runs the classifier on a batch of token-id sequences.

	        word_ids: integer tensor of shape [batch_size, seq_len].

	        Returns (prediction, attention_weights): prediction has shape
	        [batch_size] and holds sigmoid outputs; attention_weights has shape
	        [batch_size, seq_len, num_heads] and, for every head, sums to 1
	        over the seq_len dimension.
	        '''
	        # [batch_size, seq_len, embedding_size]
	        embeddings = self.embeddings(word_ids)

	        # 'r' vectors: dot product of every head with every word embedding.
	        # [batch_size, seq_len, num_heads]
	        attention_scores = torch.matmul(
	            embeddings, self.attention_heads.unsqueeze(0).transpose(1, 2))

	        # 'a' vectors: softmax over the tokens (dim=1), separately per head.
	        # [batch_size, seq_len, num_heads]
	        attention_weights = F.softmax(attention_scores, dim=1)

	        # Attention-weighted sum of the word embeddings for each head.
	        # [batch_size, num_heads, embedding_size]
	        attended_embeddings = torch.matmul(attention_weights.transpose(1, 2), embeddings)

	        # Lay the heads' vectors side by side as one long vector per document.
	        # [batch_size, num_heads * embedding_size]
	        concatenated = attended_embeddings.reshape(attended_embeddings.size(0), -1)

	        # [batch_size, 1]
	        output = self.output_layer(concatenated)

	        # Sigmoid for binary classification; drop the trailing size-1 dimension.
	        # [batch_size]
	        prediction = torch.sigmoid(output).squeeze(1)

	        return prediction, attention_weights

	# %% ../Attention Classifier Pytorch Student.ipynb 32
	# Settings used to build the classifier. The vocabulary size comes from the
	# loaded word_to_index mapping.
	vocab_size = len(word_to_index)
	embedding_size = 50  # the size used in word2vec model
	num_heads = 5  # number of attention heads
	embeddings_fname = 'model_weights_target.pt'
	model = DocumentAttentionClassifier(
	    vocab_size=vocab_size,
	    embedding_size=embedding_size,
	    num_heads=num_heads,
	    embeddings_fname=embeddings_fname,
	)


	# %% ../Attention Classifier Pytorch Student.ipynb 33
	# Overwrite the freshly initialised parameters with the saved classifier
	# checkpoint, mapping its tensors onto the CPU.
	model.load_state_dict(torch.load('attention_clf_model.pt', map_location='cpu'))


	# %% ../Attention Classifier Pytorch Student.ipynb 34
	def get_label_and_weights(text, model):
	    '''
	    Classifies the text (requires tokenizing, etc.) and returns (1) the classification label,
	    (2) the tokenized words in the model's vocabulary,
	    and (3) the attention weights over the in-vocab tokens as a numpy array. Note that the
	    attention weights will be a matrix, depending on how many heads were used in training.

	    The model is run on every token, with out-of-vocabulary tokens mapped to
	    '<UNK>'. The '<UNK>' positions are then dropped from both the returned
	    words and the returned weights, so row i of the weights always belongs to
	    word i. Because rows are dropped after the softmax, a head's returned
	    weights may sum to less than 1.

	    Raises ValueError if the text contains no tokens at all.
	    '''
	    unk_id = word_to_index['<UNK>']

	    # Lower-case and tokenize, mapping unknown words to the <UNK> id
	    word_ids = [word_to_index.get(token, unk_id)
	                for token in tokenizer.tokenize(text.lower())]
	    if not word_ids:
	        # An empty id list would become a float tensor and fail inside the
	        # embedding lookup with an unhelpful error.
	        raise ValueError('text contains no tokens to classify')

	    device = 'cpu'
	    model = model.to(device)
	    # A batch holding one sequence: shape [1, seq_len]
	    token_ids_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

	    # Forward pass through the model
	    model.eval()
	    with torch.no_grad():
	        output, attention_weights = model(token_ids_tensor)

	    # Convert output to a user-facing label
	    if output.item() > 0.5:
	        predicted_label = "That's positive and so are you!"
	    else:
	        predicted_label = "Why so negative?"

	    # True at the positions whose token is in the vocabulary
	    in_vocab = np.array([wid != unk_id for wid in word_ids], dtype=bool)

	    # Convert token IDs back to tokens (index_to_word is keyed by strings)
	    tokens = [index_to_word.get(str(wid)) for wid in word_ids if wid != unk_id]

	    # [seq_len, num_heads], then keep only the rows of in-vocabulary tokens
	    attention_weights_numpy = attention_weights.cpu().numpy().squeeze(0)

	    return predicted_label, tokens, attention_weights_numpy[in_vocab]

	# %% ../Attention Classifier Pytorch Student.ipynb 36
	def visualize_attention(words, attention_weights):
	    '''
	    Makes a heatmap figure that visualizes the attention weights for an item.
	    Attention weights should be a numpy array that has the shape (num_words, num_heads)

	    The figure is written to a temporary PNG file and closed. Returns the
	    path of that file, or None when `words` is empty and there is nothing
	    to draw. The file is not deleted here; the caller owns it.
	    '''
	    if len(words) == 0:
	        return None

	    import tempfile  # local import: only needed for the output file

	    num_heads = attention_weights.shape[1]

	    fig, ax = plt.subplots()
	    # Rescale image size based on the input length
	    fig.set_size_inches((len(words), 4))
	    # Transposed so that heads are rows and words are columns
	    im = ax.imshow(attention_weights.T)

	    # One tick per word / per head, then attach the labels
	    ax.set_xticks(np.arange(len(words)))
	    ax.set_yticks(np.arange(num_heads))
	    ax.set_xticklabels(labels=words, fontsize=16)
	    ax.set_yticklabels(labels=[str(h) for h in range(num_heads)], fontsize=16)
	    ax.set_ylabel('Attention Head', fontsize=16)

	    # Rotate the word labels and set their alignment.
	    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
	             rotation_mode="anchor")

	    # Add a color bar to show probability scaling
	    cb = fig.colorbar(im, ax=ax, label='Probability', pad=0.01)
	    cb.ax.tick_params(labelsize=16)
	    cb.set_label(label='Probability', size=16)
	    fig.tight_layout()

	    # Reserve a file name, then let matplotlib write the image to it
	    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as handle:
	        image_path = handle.name
	    fig.savefig(image_path, bbox_inches='tight')

	    # Close the figure so that repeated calls do not accumulate open figures
	    plt.close(fig)

	    return image_path

	# %% ../Attention Classifier Pytorch Student.ipynb 38
	# s = 'Just as I remembered it, one of my favorites from childhood! Great condition, very happy to have this to share with my daughter. Packaging was so nice and was received quickly.'
	# pred, tokens, attn = get_label_and_weights(s, model)
	# visualize_attention(tokens, attn)
	# print(pred)

	# %% ../Attention Classifier Pytorch Student.ipynb 39
	def predict_and_visualize(s):
	    '''
	    Classifies `s` with the module-level `model` and returns a pair
	    (label, image) for the interface: the label from
	    get_label_and_weights, and whatever visualize_attention returns for
	    the tokens and attention weights of that same call.
	    '''
	    label, words, weights = get_label_and_weights(s, model)
	    return label, visualize_attention(words, weights)


	# %% ../Attention Classifier Pytorch Student.ipynb 40

	# Web UI: one text box in; the predicted label (text) and the attention
	# heatmap (image) out.
	intf = gr.Interface(
	    fn=predict_and_visualize,
	    inputs="text",
	    outputs=["text", "image"],
	    title="Text Review Classifier with Attention Visualization",
	    description="Enter a review to see the prediction and attention visualization.",
	    examples=["The book was amazing!", "Today's Weather is pretty bad!", "How are you feeling"],
	)

	# Start serving as soon as the module is executed; share=True also asks
	# Gradio for a public link.
	intf.launch(share=True)