MarieAngeA13 committed on
Commit d5435a2 · Parent: 3d89881

Upload copie_de_08_sentiment_analysis_with_bert.py

copie_de_08_sentiment_analysis_with_bert.py ADDED
# -*- coding: utf-8 -*-
"""Copie_de_08_sentiment_analysis_with_bert.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zHnnWVxTXMeLoDe2L-hV_LzK6S7Flgps
"""

!nvidia-smi

"""## Setup

We'll need [the Transformers library](https://huggingface.co/transformers/) by Hugging Face:
"""

!pip install -q -U watermark

!pip install -qq transformers

# Commented out IPython magic to ensure Python compatibility.
# %reload_ext watermark
# %watermark -v -p numpy,pandas,torch,transformers

# Commented out IPython magic to ensure Python compatibility.
#@title Setup & Config
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

df = pd.read_csv("reviews.csv")
df.head()

df.shape

df.info()

print(df.score)

sns.countplot(x='score', data=df)
plt.xlabel('review score');

def to_sentiment(rating):
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

df['sentiment'] = df.score.apply(to_sentiment)
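
# Quick check of the mapping above: to_sentiment(1) -> 0 (negative),
# to_sentiment(3) -> 1 (neutral), to_sentiment(5) -> 2 (positive),
# matching the order of class_names below.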

class_names = ['negative', 'neutral', 'positive']

print(df.sentiment)

ax = sns.countplot(x='sentiment', data=df)
plt.xlabel('review sentiment')
ax.set_xticklabels(class_names);

PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

tokenizer.sep_token, tokenizer.sep_token_id

tokenizer.cls_token, tokenizer.cls_token_id

tokenizer.pad_token, tokenizer.pad_token_id

tokenizer.unk_token, tokenizer.unk_token_id
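
# In bert-base-cased's vocabulary these should resolve to:
# [SEP] -> 102, [CLS] -> 101, [PAD] -> 0, [UNK] -> 100.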

encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
)
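
# Note: pad_to_max_length is deprecated in recent transformers releases. A
# version-dependent sketch of the modern equivalent would replace it with
# padding='max_length' and pass truncation=True explicitly.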

encoding.keys()

print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

token_lens = []

for txt in df.content:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

# sns.distplot is deprecated in recent seaborn releases; histplot with a KDE
# overlay is the modern equivalent.
sns.histplot(token_lens, kde=True)
plt.xlim([0, 256]);
plt.xlabel('Token count');

MAX_LEN = 160
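# 160 tokens covers the bulk of the length distribution plotted above; longer
# reviews get truncated and shorter ones padded to this length.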

class GPReviewDataset(Dataset):

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

df_train.shape, df_val.shape, df_test.shape
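
# The held-out 10% is split in half, so validation and test each end up with
# 5% of the full dataset.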

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )
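
# As written, all three loaders iterate in dataset order; for training you
# would normally also pass shuffle=True when building the training loader.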

BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

data = next(iter(train_data_loader))
data.keys()

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

last_hidden_state, pooled_output = bert_model(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask'],
    return_dict=False
)
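
# Since transformers v4 the model returns a ModelOutput object by default;
# return_dict=False restores the (last_hidden_state, pooler_output) tuple
# being unpacked here.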

last_hidden_state.shape

bert_model.config.hidden_size

pooled_output.shape

class SentimentClassifier(nn.Module):

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        returned = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = returned["pooler_output"]
        output = self.drop(pooled_output)
        return self.out(output)

model = SentimentClassifier(len(class_names))
model = model.to(device)

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape)  # batch size x seq length
print(attention_mask.shape)  # batch size x seq length

F.softmax(model(input_ids, attention_mask), dim=1)

"""### Training"""

EPOCHS = 6

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
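# Note: transformers' AdamW (correct_bias=False mimics the original BERT
# optimizer) is deprecated and removed in the newest releases; with a current
# install you would swap in torch.optim.AdamW, which takes no correct_bias
# argument.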
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)
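
# The model outputs raw logits and nn.CrossEntropyLoss applies log-softmax
# internally, which is why F.softmax appears only when inspecting
# probabilities, never in the loss computation.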

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)
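
# scheduler.step() runs once per batch, which is why total_steps above is
# len(train_data_loader) * EPOCHS: the linear schedule adjusts the learning
# rate on every optimizer step, not once per epoch.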

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)
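
# model.eval() disables dropout and torch.no_grad() turns off gradient
# tracking, so evaluation is deterministic and uses less memory.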

# The original cell used the %%time magic and was commented out wholesale by
# the Colab export, which would leave `history` undefined below; the loop is
# restored here as plain Python.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )

    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Checkpoint the weights whenever validation accuracy improves.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

print(history['train_acc'])

# The accuracies are tensors on the GPU, so move them to the CPU before
# plotting.
list_of_train_accuracy = [t.cpu().numpy() for t in history['train_acc']]
list_of_train_accuracy

print(history['val_acc'])

list_of_val_accuracy = [t.cpu().numpy() for t in history['val_acc']]
list_of_val_accuracy

plt.plot(list_of_train_accuracy, label='train accuracy')
plt.plot(list_of_val_accuracy, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

# !gdown --id 1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA

# model = SentimentClassifier(len(class_names))
# model.load_state_dict(torch.load('best_model_state.bin'))
# model = model.to(device)

test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
)

print('\n')
print('Test accuracy:', test_acc.item())

def get_predictions(model, data_loader):
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values

y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)

print(classification_report(y_test, y_pred, target_names=class_names))

def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment');

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

idx = 2

review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
    'class_names': class_names,
    'values': y_pred_probs[idx]
})

print("\n".join(wrap(review_text)))
print()
print(f'True sentiment: {class_names[true_sentiment]}')

sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('sentiment')
plt.xlabel('probability')
plt.xlim([0, 1]);

review_text = "I hate you!!!"

encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction]}')
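
# A small convenience wrapper, a sketch assembled only from pieces already
# defined above, bundling the encode -> forward -> argmax steps for one string:
def predict_sentiment(text):
    enc = tokenizer.encode_plus(
        text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        logits = model(enc['input_ids'].to(device), enc['attention_mask'].to(device))
    return class_names[logits.argmax(dim=1).item()]

# e.g. predict_sentiment("Great app, works perfectly") would be expected to
# return 'positive' once the model is trained.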

"""## References

- [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
- [L11 Language Models - Alec Radford (OpenAI)](https://www.youtube.com/watch?v=BnpB3GrpsfM)
- [The Illustrated BERT, ELMo, and co.](https://jalammar.github.io/illustrated-bert/)
- [BERT Fine-Tuning Tutorial with PyTorch](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)
- [How to Fine-Tune BERT for Text Classification?](https://arxiv.org/pdf/1905.05583.pdf)
- [Huggingface Transformers](https://huggingface.co/transformers/)
- [BERT Explained: State of the art language model for NLP](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270)
"""