Muskan Sharma committed on
Commit
7aa5855
·
1 Parent(s): 3d9ad3b

Added the bert directions model using the small dataset and gradio app file

Browse files
BERT DEMO/bert_fine_tuning_for_directions.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""Bert_fine_tuning for directions.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1iR42JiG66KlXsFg1CXNUXLfOEgKhuX72

# IMPORTANT NOTE

This is the file where I am trying to make the BERT working. I am following this tutorial: https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=6DV0Rtetxgd4

# Set-up the Environment
"""

# NOTE: the leading "!" lines below are Colab/IPython shell magics, so this
# file only runs inside a notebook environment, not as a plain Python script.
!pip install -q transformers datasets

!pip install -q gradio

# Authenticate with the Hugging Face Hub (prompts interactively for a token).
! huggingface-cli login

# Create the Hub repository that the dataset/model artifacts live under.
! huggingface-cli repo create ROSITA123

"""# Code

"""
27
+
28
# Explicit imports instead of `from datasets import *`: the rest of this file
# only ever uses these two names, and a wildcard import pollutes the module
# namespace (and hides where names come from).
from datasets import load_dataset, DatasetDict

# Download the raw prompt/direction dataset from the Hugging Face Hub.
ds = load_dataset('ROSITA123/dataset_directions_second_try')

# First split: 80% train, 20% held out for test + validation.
train_testvalid = ds['train'].train_test_split(test_size=0.2)

# Second split: divide the held-out 20% in half -> 10% test, 10% validation.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

# having a look at the dataset structure
dataset
41
+
42
+ """creating a list that contains the labels, as well as 2 dictionaries that map labels to integers and back."""
43
+
44
+ labels = [label for label in dataset['train'].features.keys() if label not in ['prompt']]
45
+ id2label = {idx:label for idx, label in enumerate(labels)}
46
+ label2id = {label:idx for idx, label in enumerate(labels)}
47
+ labels
48
+
49
+ """# Pre-processing
50
+
51
+
52
+
53
+ """
54
+
55
+ from transformers import AutoTokenizer
56
+ import numpy as np
57
+
58
+ # Assuming labels is defined somewhere in your code
59
+ #labels = ['label1', 'label2', 'label3']
60
+
61
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
62
+
63
def preprocess_data(examples):
    """Tokenize a batch of prompts and attach a multi-hot label matrix.

    Relies on the module-level `tokenizer` and `labels` list. Returns the
    tokenizer encoding with an extra "labels" entry holding, per example,
    a float list of length len(labels).
    """
    prompts = examples["prompt"]
    # Fixed-length encoding so every example yields identically-shaped tensors.
    encoding = tokenizer(prompts, padding="max_length", truncation=True, max_length=128)

    # Build the (batch_size, num_labels) indicator matrix column by column.
    label_matrix = np.zeros((len(prompts), len(labels)))
    for column, label_name in enumerate(labels):
        label_matrix[:, column] = examples[label_name]

    encoding["labels"] = label_matrix.tolist()
    return encoding
79
+
80
# Apply the tokenization/label preprocessing to every split in one batched pass.
encoded_dataset = dataset.map(preprocess_data, batched=True)

# Sanity-check the first training example: keys, decoded text, label vector.
example = encoded_dataset['train'][0]
print(example.keys())

tokenizer.decode(example['input_ids'])

example['labels']

# Names of the labels that are switched on (1.0) for this example.
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

# format the dataset columns as PyTorch tensors
encoded_dataset.set_format("torch")
93
+
94
+ """# Defining Model
95
+
96
+ """
97
+
98
+ from transformers import AutoModelForSequenceClassification
99
+
100
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
101
+ problem_type="multi_label_classification",
102
+ num_labels=len(labels),
103
+ id2label=id2label,
104
+ label2id=label2id)
105
+
106
+ """# Training the model"""
107
+
108
+ !pip install -q accelerate -U
109
+
110
+ batch_size = 8
111
+ metric_name = "f1"
112
+
113
+ """**# The instructions to run the args correctly:**
114
+
115
+ Run pip install accelerate -U in a cell
116
+
117
+ In the top menu click Runtime → Restart Runtime
118
+
119
+ Do not rerun any cells with !pip install in them
120
+ Rerun all the other code cells and you should be good to go!
121
+ """
122
+
123
+ from transformers import TrainingArguments, Trainer
124
+
125
+ args = TrainingArguments(
126
+ f"ROSITA-second-attempt",
127
+ evaluation_strategy = "epoch",
128
+ save_strategy = "epoch",
129
+ learning_rate=2e-5,
130
+ per_device_train_batch_size=batch_size,
131
+ per_device_eval_batch_size=batch_size,
132
+ num_train_epochs=5,
133
+ weight_decay=0.01,
134
+ load_best_model_at_end=True,
135
+ metric_for_best_model=metric_name,
136
+ #push_to_hub=True,
137
+ )
138
+
139
+ """This part is needed to compute metrics of the model"""
140
+
141
+ from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
142
+ from transformers import EvalPrediction
143
+ import torch
144
+
145
+ # source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
146
+ def multi_label_metrics(predictions, labels, threshold=0.5):
147
+ # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
148
+ sigmoid = torch.nn.Sigmoid()
149
+ probs = sigmoid(torch.Tensor(predictions))
150
+ # next, use threshold to turn them into integer predictions
151
+ y_pred = np.zeros(probs.shape)
152
+ y_pred[np.where(probs >= threshold)] = 1
153
+ # finally, compute metrics
154
+ y_true = labels
155
+ f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
156
+ roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
157
+ accuracy = accuracy_score(y_true, y_pred)
158
+ # return as dictionary
159
+ metrics = {'f1': f1_micro_average,
160
+ 'roc_auc': roc_auc,
161
+ 'accuracy': accuracy}
162
+ return metrics
163
+
164
def compute_metrics(p: EvalPrediction):
    """Trainer hook: unwrap the EvalPrediction and delegate to multi_label_metrics."""
    # Some models return a tuple (logits, extra outputs); keep only the logits.
    preds = p.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    return multi_label_metrics(predictions=preds, labels=p.label_ids)
171
+
172
+ """verifying a batch as well as the forward tensor"""
173
+
174
+ encoded_dataset['train'][0]['labels'].type()
175
+
176
+ encoded_dataset['train']['input_ids'][0]
177
+
178
+ #forward pass
179
+ outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
180
+ outputs
181
+
182
+ """Training the model"""
183
+
184
+ trainer = Trainer(
185
+ model,
186
+ args,
187
+ train_dataset=encoded_dataset["train"],
188
+ eval_dataset=encoded_dataset["validation"],
189
+ tokenizer=tokenizer,
190
+ compute_metrics=compute_metrics
191
+ )
192
+
193
+ trainer.train()
194
+
195
+ """# Evaluate
196
+
197
+ """
198
+
199
+ trainer.evaluate()
200
+
201
+ """# Inference
202
+
203
+ """
204
+
205
+ text = "Go lower"
206
+
207
+ encoding = tokenizer(text, return_tensors="pt")
208
+ encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
209
+
210
+ outputs = trainer.model(**encoding)
211
+
212
+ logits = outputs.logits
213
+ logits.shape
214
+
215
+ # apply sigmoid + threshold
216
+ sigmoid = torch.nn.Sigmoid()
217
+ probs = sigmoid(logits.squeeze().cpu())
218
+ predictions = np.zeros(probs.shape)
219
+ predictions[np.where(probs >= 0.5)] = 1
220
+ # turn predicted id's into actual label names
221
+ predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
222
+ print(predicted_labels)