MarieAngeA13 committed on
Commit d5435a2 · Parent: 3d89881

Upload copie_de_08_sentiment_analysis_with_bert.py

copie_de_08_sentiment_analysis_with_bert.py ADDED
# -*- coding: utf-8 -*-
"""Copie_de_08_sentiment_analysis_with_bert.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zHnnWVxTXMeLoDe2L-hV_LzK6S7Flgps
"""

!nvidia-smi

"""## Setup

We'll need [the Transformers library](https://huggingface.co/transformers/) by Hugging Face:
"""

!pip install -q -U watermark

!pip install -qq transformers

# Commented out IPython magic to ensure Python compatibility.
# %reload_ext watermark
# %watermark -v -p numpy,pandas,torch,transformers

# Commented out IPython magic to ensure Python compatibility.
#@title Setup & Config
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

df = pd.read_csv("reviews.csv")
df.head()

df.shape

df.info()

print(df.score)

sns.countplot(x='score', data=df)
plt.xlabel('review score');

def to_sentiment(rating):
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

df['sentiment'] = df.score.apply(to_sentiment)
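
# Quick check of the mapping above: to_sentiment(1) -> 0 (negative),
# to_sentiment(3) -> 1 (neutral), to_sentiment(5) -> 2 (positive),
# matching the order of class_names below.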

class_names = ['negative', 'neutral', 'positive']

print(df.sentiment)

ax = sns.countplot(x='sentiment', data=df)
plt.xlabel('review sentiment')
ax.set_xticklabels(class_names);

PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

tokenizer.sep_token, tokenizer.sep_token_id

tokenizer.cls_token, tokenizer.cls_token_id

tokenizer.pad_token, tokenizer.pad_token_id

tokenizer.unk_token, tokenizer.unk_token_id
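
# In bert-base-cased's vocabulary these should resolve to:
# [SEP] -> 102, [CLS] -> 101, [PAD] -> 0, [UNK] -> 100.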

encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
)
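
# Note: pad_to_max_length is deprecated in recent transformers releases. A
# version-dependent sketch of the modern equivalent would replace it with
# padding='max_length' and pass truncation=True explicitly.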

encoding.keys()

print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

token_lens = []

for txt in df.content:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

# sns.distplot is deprecated in recent seaborn releases; histplot with a KDE
# overlay is the modern equivalent.
sns.histplot(token_lens, kde=True)
plt.xlim([0, 256]);
plt.xlabel('Token count');

MAX_LEN = 160
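# 160 tokens covers the bulk of the length distribution plotted above; longer
# reviews get truncated and shorter ones padded to this length.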

class GPReviewDataset(Dataset):

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

df_train.shape, df_val.shape, df_test.shape
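
# The held-out 10% is split in half, so validation and test each end up with
# 5% of the full dataset.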

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )
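
# As written, all three loaders iterate in dataset order; for training you
# would normally also pass shuffle=True when building the training loader.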

BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

data = next(iter(train_data_loader))
data.keys()

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

last_hidden_state, pooled_output = bert_model(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask'],
    return_dict=False
)
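
# Since transformers v4 the model returns a ModelOutput object by default;
# return_dict=False restores the (last_hidden_state, pooler_output) tuple
# being unpacked here.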

last_hidden_state.shape

bert_model.config.hidden_size

pooled_output.shape

class SentimentClassifier(nn.Module):

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        returned = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = returned["pooler_output"]
        output = self.drop(pooled_output)
        return self.out(output)

model = SentimentClassifier(len(class_names))
model = model.to(device)

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape)  # batch size x seq length
print(attention_mask.shape)  # batch size x seq length

F.softmax(model(input_ids, attention_mask), dim=1)

"""### Training"""

EPOCHS = 6

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
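# Note: transformers' AdamW (correct_bias=False mimics the original BERT
# optimizer) is deprecated and removed in the newest releases; with a current
# install you would swap in torch.optim.AdamW, which takes no correct_bias
# argument.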
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)
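
# The model outputs raw logits and nn.CrossEntropyLoss applies log-softmax
# internally, which is why F.softmax appears only when inspecting
# probabilities, never in the loss computation.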

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)
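
# scheduler.step() runs once per batch, which is why total_steps above is
# len(train_data_loader) * EPOCHS: the linear schedule adjusts the learning
# rate on every optimizer step, not once per epoch.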

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)
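
# model.eval() disables dropout and torch.no_grad() turns off gradient
# tracking, so evaluation is deterministic and uses less memory.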

# The original cell used the %%time magic and was commented out wholesale by
# the Colab export, which would leave `history` undefined below; the loop is
# restored here as plain Python.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )

    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Checkpoint the weights whenever validation accuracy improves.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

print(history['train_acc'])

# The accuracies are tensors on the GPU, so move them to the CPU before
# plotting.
list_of_train_accuracy = [t.cpu().numpy() for t in history['train_acc']]
list_of_train_accuracy

print(history['val_acc'])

list_of_val_accuracy = [t.cpu().numpy() for t in history['val_acc']]
list_of_val_accuracy

plt.plot(list_of_train_accuracy, label='train accuracy')
plt.plot(list_of_val_accuracy, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

# !gdown --id 1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA

# model = SentimentClassifier(len(class_names))
# model.load_state_dict(torch.load('best_model_state.bin'))
# model = model.to(device)

test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
)

print('\n')
print('Test accuracy:', test_acc.item())

def get_predictions(model, data_loader):
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values

y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)

print(classification_report(y_test, y_pred, target_names=class_names))

def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment');

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

idx = 2

review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
    'class_names': class_names,
    'values': y_pred_probs[idx]
})

print("\n".join(wrap(review_text)))
print()
print(f'True sentiment: {class_names[true_sentiment]}')

sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('sentiment')
plt.xlabel('probability')
plt.xlim([0, 1]);

review_text = "I hate you!!!"

encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction]}')
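
# A small convenience wrapper, a sketch assembled only from pieces already
# defined above, bundling the encode -> forward -> argmax steps for one string:
def predict_sentiment(text):
    enc = tokenizer.encode_plus(
        text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        logits = model(enc['input_ids'].to(device), enc['attention_mask'].to(device))
    return class_names[logits.argmax(dim=1).item()]

# e.g. predict_sentiment("Great app, works perfectly") would be expected to
# return 'positive' once the model is trained.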

"""## References

- [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
- [L11 Language Models - Alec Radford (OpenAI)](https://www.youtube.com/watch?v=BnpB3GrpsfM)
- [The Illustrated BERT, ELMo, and co.](https://jalammar.github.io/illustrated-bert/)
- [BERT Fine-Tuning Tutorial with PyTorch](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)
- [How to Fine-Tune BERT for Text Classification?](https://arxiv.org/pdf/1905.05583.pdf)
- [Huggingface Transformers](https://huggingface.co/transformers/)
- [BERT Explained: State of the art language model for NLP](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270)
"""