BERTNN commited on
Commit
f50871b
·
1 Parent(s): 3719e8f

Upload 2 files

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. BERTNN_model +3 -0
  3. predefined_bertnn.py +716 -0
.gitattributes CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ BERTNN_model filter=lfs diff=lfs merge=lfs -text
BERTNN_model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3234a877d983726862db4eebe40e200ae6752c23302c0be8a20e47b5a0a0c412
3
+ size 1341231081
predefined_bertnn.py ADDED
@@ -0,0 +1,716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
## Library imports.
from datetime import datetime
import numpy as np
import pandas as pd
import random
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt

# Shared WordPiece tokenizer used throughout this module.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')  # bert-large-uncased

import itertools
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
from random import sample
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F

## Select the compute device: prefer CUDA, fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

## Seed every RNG in play so runs are reproducible.
rnd_st = 42
np.random.seed(rnd_st)
random.seed(rnd_st)
torch.manual_seed(rnd_st)
torch.cuda.manual_seed(rnd_st)
# Force deterministic cuDNN kernels (disables autotuning).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
37
+
38
## ****** Load and normalize the reference EPA dictionary ******** ##
def load_dictionary(file):
    """Load an EPA dictionary CSV and prepare it for BERT processing.

    Adds: `index_in_dic` (row position used as a stable id), `term2`
    (the raw term before de-underscoring) and `len_Bert` (WordPiece
    token count of the cleaned term).
    """
    out = pd.read_csv(file).reset_index().rename(columns={"index": 'index_in_dic'})
    out['term2'] = out['term']                   # keep the original spelling
    out.term = out.term.str.replace("_", " ")    # multi-word terms use "_" in the CSV
    # Number of WordPiece tokens per term (uses the module-level tokenizer).
    out['len_Bert'] = out['term'].map(lambda t: len(tokenizer.tokenize(t)))
    return out
46
+
47
## Load the three reference dictionaries.
Modifiers = load_dictionary("FullSurveyorInteract_Modifiers.csv")
# Behaviors=load_dictionary("FullSurveyorInteract_Behaviors.csv")
Behaviors = load_dictionary("FullSurveyorInteract_Behaviors_3rd.csv")
Identities = load_dictionary("FullSurveyorInteract_Identities.csv")

## Standardize the E/P/A columns of each dictionary; keep the fitted
## scalers so predictions can later be mapped back to the original scale.
n_Modifiers = Modifiers.copy()
n_Behaviors = Behaviors.copy()
n_Identities = Identities.copy()

scaler_B, scaler_M, scaler_I = StandardScaler(), StandardScaler(), StandardScaler()

n_Behaviors[['E','P','A']] = scaler_B.fit_transform(Behaviors[['E','P','A']])
n_Modifiers[['E','P','A']] = scaler_M.fit_transform(Modifiers[['E','P','A']])
n_Identities[['E','P','A']] = scaler_I.fit_transform(Identities[['E','P','A']])

# Ref: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
rnd_st = 42

# Reload the BERT tokenizer with explicit lower-casing.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
69
+
70
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, MAX_LEN=40):
    """Perform required preprocessing steps for pretrained BERT.

    @param data (list/np.array): texts to be processed. NOTE: only the
        FIRST text is converted to tensors (see final indexing below);
        every caller in this file passes a one-element list, and the
        resulting 1-D tensors are later stacked by DataLoader.
    @param MAX_LEN (int): pad/truncate length.
    @return input_ids (torch.Tensor): token ids of shape (MAX_LEN,).
    @return attention_masks (torch.Tensor): matching attention mask.
    """
    input_ids = []
    attention_masks = []

    for sent in data:
        # `encode_plus` tokenizes, adds [CLS]/[SEP], pads/truncates to
        # MAX_LEN, maps tokens to ids and builds the attention mask.
        encoded_sent = tokenizer.encode_plus(
            text=sent,                     # Preprocess sentence
            add_special_tokens=True,       # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,            # Max length to truncate/pad
            padding='max_length',
            truncation=True,               # BUGFIX: explicitly truncate over-long
                                           # inputs (modern transformers only warns
                                           # and does NOT truncate by default)
            return_attention_mask=True     # Return attention mask
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Keep only the first sentence; callers pass one sentence at a time.
    input_ids = torch.tensor(input_ids[0])
    attention_masks = torch.tensor(attention_masks[0])

    return input_ids, attention_masks
108
+
109
+
110
+ # # Convert other data types to torch.Tensor
111
+ # from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
112
def gnrtr2(Identity, Behavior, Modifier):
    """Yield one random training sample.

    Draws a "modifier1 identity1 behavior modifier2 identity2" event and
    yields (input_ids, attention_mask, 15 EPA targets, dictionary indices).
    """
    ident1 = Identity.sample(axis=0)
    ident2 = Identity.sample(axis=0)
    behav = Behavior.sample(axis=0)
    modif1 = Modifier.sample(axis=0)
    modif2 = Modifier.sample(axis=0)

    # Event text in MABMO order.
    sents = ' '.join(map(str, (list(modif1.term) + list(ident1.term) + list(behav.term)
                               + list(modif2.term) + list(ident2.term))))

    rows = (modif1, ident1, behav, modif2, ident2)
    # 15 regression targets: E/P/A for each of the five words, in order.
    values = np.concatenate([r[['E', 'P', 'A']].to_numpy() for r in rows], axis=1)[0]
    # Dictionary row ids, used later to recover the sampled terms.
    indexx = torch.tensor([int(r['index_in_dic'].to_numpy()[0]) for r in rows])
    ys = torch.tensor(values)
    inputs, masks = preprocessing_for_bert([sents])
    yield inputs, masks, ys, indexx
130
+
131
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
def dta_ldr2(I, B, M, batch_size=32):
    """Draw `batch_size` random gnrtr2 samples and collate them into one batch."""
    samples = [next(gnrtr2(I, B, M)) for _ in range(batch_size)]
    # One full-size batch: DataLoader simply stacks the per-sample tensors.
    return next(iter(DataLoader(samples, batch_size=batch_size)))
135
+
136
+
137
+
138
+ # # Convert other data types to torch.Tensor
139
+
140
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
def dta_ldr(I, B, M, batch_size=32):
    """Build one training batch of `batch_size` random samples.

    BUGFIX: the original body called the undefined name `gnrtr`, which
    raised NameError at call time; the generator defined in this module
    is `gnrtr2` (same one dta_ldr2 uses).
    """
    samples = [next(gnrtr2(I, B, M)) for _ in range(batch_size)]
    return next(iter(DataLoader(samples, batch_size=batch_size)))
144
class BertRegressor(nn.Module):
    """BERT encoder with a small feed-forward head for EPA regression."""

    def __init__(self, freeze_bert=False):
        """
        @param freeze_bert (bool): when True, BERT weights are frozen and
            only the regression head remains trainable.
        """
        super().__init__()
        # Encoder width, hidden width of the head, and number of targets
        # (5 words x 3 EPA dimensions = 15 outputs).
        D_in, H, D_out = 1024, 120, 15

        # Pretrained encoder.
        self.bert = BertModel.from_pretrained('bert-large-uncased')

        # NOTE: the layer order below must not change -- saved checkpoints
        # key the head parameters by their position in this Sequential.
        self.regressor = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(D_in, H),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(H, D_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Run BERT and regress EPA values from the pooled [CLS] output.

        @param input_ids (torch.Tensor): (batch_size, max_length) token ids.
        @param attention_mask (torch.Tensor): matching attention mask.
        @return (torch.Tensor): (batch_size, 15) predictions.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The pooled [CLS] representation feeds the regression head.
        return self.regressor(encoded.pooler_output)
197
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Build the BERT regressor, its AdamW optimizer and a linear LR schedule."""
    bert_regressor = BertRegressor(freeze_bert=False)
    bert_regressor.to(device)   # move to GPU when available

    optimizer = AdamW(
        bert_regressor.parameters(),
        lr=2e-5,              # small LR for fine-tuning
        eps=1e-8,             # default epsilon
        weight_decay=0.001    # decoupled weight decay
    )

    # Fixed scheduler horizon: batches are generated on the fly, so there
    # is no dataloader length to multiply by `epochs`.
    total_steps = 100000
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,   # default value
        num_training_steps=total_steps)
    return bert_regressor, optimizer, scheduler
223
import random
import time

# Regression criterion shared by training and evaluation.
loss_fn = nn.MSELoss()

def set_seed(seed_value=42):
    """Seed every RNG (python, numpy, torch CPU/GPU) for reproducibility."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed_value)
236
+
237
def train(model, I_trn,B_trn,M_trn,I_tst,B_tst,M_tst,
          batch_size_tst=32, batch_size=50,batch_epochs=400, evaluation=False,batch_size_trn=32):
    """Train the BERT regressor on freshly sampled event batches.

    @param model: the BertRegressor to train.
    @param I_trn/B_trn/M_trn: normalized identity/behavior/modifier frames to sample from.
    @param I_tst/B_tst/M_tst: frames used for validation batches.
    @param batch_size_tst (int): validation batch size.
    @param batch_size (int): unused here -- kept for caller compatibility.
    @param batch_epochs (int): number of training batches to draw.
    @param evaluation (bool): run an extra validation pass after the loop.
    @param batch_size_trn (int): training batch size.

    NOTE(review): relies on module-level `optimizer`, `scheduler`, `loss_fn`,
    `device`, `evaluate` and `dta_ldr` -- confirm initialize_model() ran first.
    """
    print("Start training...\n")
    # Header of the progress table.
    print(f" {'Batch':^5} | {'Train Loss':^12} | {'Val Loss':^10} | {'Elapsed':^9}")
    print("-"*50)
    # Timing / loss accumulators for the current 50-batch reporting window.
    t0_batch = time.time()
    batch_loss, batch_counts = 0, 0
    model.train()
    # Each "batch" is sampled on the fly -- there is no fixed dataset.
    for batch in range(batch_epochs): #298
        batch_counts +=1
        # Hard stop (unreachable with the default batch_epochs=400).
        if ((batch==(704))):break #457#383#1451#246
        # Sample one training batch and move every tensor to the device.
        b_input_ids, b_attn_mask, b_ys,_ = tuple(t.to(device) for t in dta_ldr(I=I_trn,B=B_trn,M=M_trn,batch_size=batch_size_trn))
        model.zero_grad()
        preds = model(b_input_ids, b_attn_mask)
        loss = loss_fn(preds.float(), b_ys.float())
        batch_loss += loss.item()
        loss.backward()
        # Clip gradient norm to 1.0 to prevent "exploding gradients".
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Report train/validation loss every 50 batches.
        if (batch_counts % 50 == 0 and batch_counts != 0) : #or(batch>585)
            time_elapsed = time.time() - t0_batch
            val_loss = evaluate(model, Ie=I_tst,Be=B_tst,Me=M_tst,batch_size_e=batch_size_tst)
            print(f"{batch+ 1:^7}|{batch_loss / batch_counts:^12.6f} | {val_loss:^10.6f} | {time_elapsed:^9.2f}") #| {step:^7}
            print("-"*50)
            # Reset the reporting window.
            batch_loss, batch_counts = 0, 0
            t0_batch = time.time()

    if evaluation == True:
        # Optional extra validation pass after training.
        val_loss = evaluate(model, Ie=I_tst,Be=B_tst,Me=M_tst,batch_size_e=batch_size_tst)
        if val_loss<0.32:
            print('\n Consider this one with val:', val_loss,' at:',batch,'\n')
        print("-"*50)

    val_loss = evaluate(model, Ie=I_tst,Be=B_tst,Me=M_tst,batch_size_e=batch_size_tst)
    # NOTE(review): when batch_epochs is a multiple of 50 the window counters
    # were just reset, so `batch_loss / batch_counts` divides by zero here and
    # `time_elapsed` is unbound if no report ever fired -- confirm intended.
    print(f"{batch+ 1:^7}|{batch_loss / batch_counts:^12.6f} | {val_loss:^10.6f} | {time_elapsed:^9.2f}") #| {step:^7}
    print("Training complete!")
323
+
324
+
325
def evaluate(model, Ie, Be, Me, batch_size_e):
    """Measure mean MSE of `model` on one freshly sampled validation batch.

    @param Ie/Be/Me: normalized identity/behavior/modifier frames to sample from.
    @param batch_size_e (int): validation batch size.
    @return float: mean loss over the sampled batch(es).
    """
    model.eval()   # disable dropout at test time

    losses = []
    for _ in range(1):   # a single sampled batch per call
        batch = dta_ldr2(Ie, Be, Me, batch_size_e)
        b_input_ids, b_attn_mask, b_ys, _ = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            preds = model(b_input_ids, b_attn_mask)
        losses.append(loss_fn(preds, b_ys).item())

    # Mean loss over the validation batches.
    return np.mean(losses)
354
+
355
+
356
## Reseed all RNGs before inference for reproducibility.
rnd_st = 42
np.random.seed(rnd_st)
random.seed(rnd_st)
torch.manual_seed(rnd_st)
torch.cuda.manual_seed(rnd_st)
# Deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


## ****** Load pre-trained model ******** ##
bert_regressor = BertRegressor()
bert_regressor.load_state_dict(torch.load("BERTNN_model", map_location=torch.device(device)))
bert_regressor.eval()   # inference mode: dropout disabled
370
+
371
+
372
+
373
+
374
def bert_predict(model, test_dataloader):
    """Forward one prepared batch through `model` and return its predictions.

    @param test_dataloader: a collated batch (inputs, masks, ...) as built by
        dta_ldr2 / ldr_new / sent_ldr -- only the first two tensors are used.
    @return torch.Tensor: (batch_size, 15) predictions.
    """
    model.eval()   # dropout off at test time

    all_preds = []
    for _ in range(1):   # a single pre-collated batch
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in test_dataloader)[:2]
        with torch.no_grad():
            all_preds.append(model(b_input_ids, b_attn_mask))

    # Concatenate predictions from each batch.
    return torch.cat(all_preds, dim=0)
396
+
397
## ****** The following function inverses the normalization function and uses original dictionaries to represent MABMO events ******** ##

def out_df(data, predictions, df_beh=Behaviors, df_ident=Identities, df_mod=Modifiers):
    """De-normalize a batch of predictions and decode the sampled terms.

    @param data: collated batch (inputs, masks, targets, indices).
    @param predictions (torch.Tensor): (batch, 15) model output on the training scale.
    @param df_beh/df_ident/df_mod: dictionaries used to resolve indices to terms.
    @return DataFrame with predicted + true EPA triples and the five terms.

    FIX: dropped the `inplace=False` argument of set_axis -- it was removed
    in pandas 2.x and omitting it is the pandas 1.x default, so behavior is
    unchanged. Repeated slice/merge code is folded into loops.
    """
    # Inverse-transform each 3-column EPA slice with the scaler that was
    # fitted on the matching dictionary, in M, I, B, M, I order; true
    # targets (data[2]) are de-normalized with the same scalers.
    slices = [(scaler_M, 0, 3), (scaler_I, 3, 6), (scaler_B, 6, 9),
              (scaler_M, 9, 12), (scaler_I, 12, 15)]
    parts = []
    for scl, a, b in slices:
        parts.append(pd.DataFrame(scl.inverse_transform(predictions[:, a:b].cpu())))
        parts.append(pd.DataFrame(scl.inverse_transform(data[2][:, a:b])))
    parts.append(pd.DataFrame(np.array(data[3])))   # dictionary indices

    df2 = pd.concat(parts, axis=1).set_axis(
        ['EEMA', 'EPMA', 'EAMA','EM1', 'PM1', 'AM1',
         'EEA', 'EPA', 'EAA','EA', 'PA', 'AA',
         'EEB', 'EPB', 'EAB','EB', 'PB', 'AB',
         'EEMO', 'EPMO', 'EAMO','EM2', 'PM2', 'AM2',
         'EEO', 'EPO', 'EAO','EO', 'PO', 'AO',
         'idx_ModA','idx_Act','idx_Beh','idx_ModO','idx_Obj'], axis=1)

    # Resolve each dictionary index back to its term.
    lookups = [(df_mod, 'idx_ModA', 'ModA'),
               (df_ident, 'idx_Act', 'Actor'),
               (df_beh, 'idx_Beh', 'Behavior'),
               (df_mod, 'idx_ModO', 'ModO'),
               (df_ident, 'idx_Obj', 'Object')]
    for src, idx_col, name in lookups:
        df2 = pd.merge(df2, src[['term', 'index_in_dic']], left_on=[idx_col],
                       right_on=["index_in_dic"], how='left'
                       ).rename(columns={"term": name}).drop(['index_in_dic'], axis=1)

    return df2[['EEMA','EPMA', 'EAMA', 'EEA', 'EPA', 'EAA', 'EEB', 'EPB', 'EAB',
                'EEMO', 'EPMO', 'EAMO', 'EEO', 'EPO', 'EAO','EM1', 'PM1', 'AM1',
                'EA', 'PA', 'AA', 'EB', 'PB','AB', 'EM2', 'PM2', 'AM2', 'EO',
                'PO', 'AO', 'ModA','Actor','Behavior', 'ModO', 'Object']]
430
+
431
def get_output(I_b=n_Identities, B_b=n_Behaviors, M_b=n_Modifiers, batch_sz=3000, batch_num=10):
    """Sample `batch_num` batches, run the model, and stack the decoded results."""
    df = pd.DataFrame()
    for _ in range(batch_num):
        q = dta_ldr2(I=I_b, B=B_b, M=M_b, batch_size=batch_sz)
        preds = bert_predict(bert_regressor.to(device), q)
        decoded = out_df(data=q, predictions=preds)
        df = pd.concat([df, decoded], axis=0)
    return df
439
+
440
+
441
def gen_new(Identity, Behavior, Modifier, n_df, word_type):
    """Yield one sample where the `word_type` word is drawn from `n_df`.

    NOTE: the fixed random_state values below make most draws deterministic
    (every call repeats the same event apart from modif1 when word_type is
    not 'modifier').
    """
    if word_type == 'identity':
        ident1 = n_df.sample(axis=0, random_state=56)
    else:
        ident1 = Identity.sample(axis=0, random_state=6)
    ident2 = Identity.sample(axis=0, random_state=6)
    if word_type == 'behavior':
        behav = n_df.sample(axis=0, random_state=5)
    else:
        behav = Behavior.sample(axis=0, random_state=5)
    if word_type == 'modifier':
        modif1 = n_df.sample(axis=0, random_state=55)
    else:
        modif1 = Modifier.sample(axis=0)
    modif2 = Modifier.sample(axis=0, random_state=96)

    # Event text in MABMO order.
    sents = ' '.join(map(str, (list(modif1.term) + list(ident1.term) + list(behav.term)
                               + list(modif2.term) + list(ident2.term))))

    rows = (modif1, ident1, behav, modif2, ident2)
    values = np.concatenate([r[['E', 'P', 'A']].to_numpy() for r in rows], axis=1)[0]
    indexx = torch.tensor([int(r['index_in_dic'].to_numpy()[0]) for r in rows])
    ys = torch.tensor(values)
    inputs, masks = preprocessing_for_bert([sents])
    yield inputs, masks, ys, indexx
474
def ldr_new(I, B, M, N_df, WT, batch_size=32):
    """Collate `batch_size` gen_new samples into a single batch."""
    samples = [next(gen_new(I, B, M, N_df, WT)) for _ in range(batch_size)]
    return next(iter(DataLoader(samples, batch_size=batch_size)))
477
+
478
def gen_new(Identity, Behavior, Modifier, n_df, word_type):
    """Yield one random sample whose ACTOR-side `word_type` slot is drawn
    from the custom frame `n_df` instead of the reference dictionary."""
    modif1 = Modifier.sample(axis=0)
    modif2 = Modifier.sample(axis=0)
    ident1 = Identity.sample(axis=0)
    ident2 = Identity.sample(axis=0)
    behav = Behavior.sample(axis=0)

    # Substitute the actor-side slot with the custom word.
    if word_type == 'identity':
        ident1 = n_df.sample(axis=0)
    if word_type == 'behavior':
        behav = n_df.sample(axis=0)
    if word_type == 'modifier':
        modif1 = n_df.sample(axis=0)

    # Event text in MABMO order.
    sents = ' '.join(map(str, (list(modif1.term) + list(ident1.term) + list(behav.term)
                               + list(modif2.term) + list(ident2.term))))

    rows = (modif1, ident1, behav, modif2, ident2)
    values = np.concatenate([r[['E', 'P', 'A']].to_numpy() for r in rows], axis=1)[0]
    indexx = torch.tensor([int(r['index_in_dic'].to_numpy()[0]) for r in rows])
    ys = torch.tensor(values)
    inputs, masks = preprocessing_for_bert([sents])
    yield inputs, masks, ys, indexx
503
+
504
+
505
def gen_alt(Identity, Behavior, Modifier, n_df, word_type):
    """Yield one random sample whose OBJECT-side `word_type` slot is drawn
    from the custom frame `n_df` (cf. gen_new for the actor side)."""
    modif1 = Modifier.sample(axis=0)
    modif2 = Modifier.sample(axis=0)
    ident1 = Identity.sample(axis=0)
    ident2 = Identity.sample(axis=0)
    behav = Behavior.sample(axis=0)

    # Substitute the object-side slot with the custom word.
    if word_type == 'identity':
        ident2 = n_df.sample(axis=0)
    if word_type == 'behavior':
        behav = n_df.sample(axis=0)
    if word_type == 'modifier':
        modif2 = n_df.sample(axis=0)

    # Event text in MABMO order.
    sents = ' '.join(map(str, (list(modif1.term) + list(ident1.term) + list(behav.term)
                               + list(modif2.term) + list(ident2.term))))

    rows = (modif1, ident1, behav, modif2, ident2)
    values = np.concatenate([r[['E', 'P', 'A']].to_numpy() for r in rows], axis=1)[0]
    indexx = torch.tensor([int(r['index_in_dic'].to_numpy()[0]) for r in rows])
    ys = torch.tensor(values)
    inputs, masks = preprocessing_for_bert([sents])
    yield inputs, masks, ys, indexx
528
+
529
def ldr_new(I, B, M, N_df, WT, batch_size=32, alt=0):
    """Collate one batch of custom-word samples.

    `alt` selects object-side substitution (gen_alt) over the default
    actor-side substitution (gen_new).
    """
    gen = gen_alt if alt else gen_new
    samples = [next(gen(I, B, M, N_df, WT)) for _ in range(batch_size)]
    return next(iter(DataLoader(samples, batch_size=batch_size)))
535
+
536
# Default output columns when `cus_col` is off.
cols = ['EEMA', 'EPMA', 'EAMA', 'EEA', 'EPA', 'EAA', 'EEB', 'EPB', 'EAB',
        'EEMO', 'EPMO', 'EAMO', 'EEO', 'EPO', 'EAO', 'ModA', 'Actor', 'Behavior', 'ModO', 'Object']

def get_output_new(w, wt, I_b=n_Identities, B_b=n_Behaviors, M_b=n_Modifiers,
                   batch_sz=300, batch_num=1, columnss=cols, cus_col=1):
    """Predict EPA profiles for a NEW word `w` of type `wt` across random events.

    The new word is injected via a placeholder dictionary row (sentinel EPA
    of 10, index 4000) and the matching dictionary is extended so out_df can
    resolve the index back to the term. With `cus_col` set, only the columns
    relevant to `wt` are returned.
    """
    df = pd.DataFrame()
    for i in range(batch_num):
        # Placeholder dictionary row for the new word.
        new_df = pd.DataFrame({'index_in_dic': 4000, 'term': w, 'E': 10, 'P': 10, 'A': 10,
                               'E2': 10, 'P2': 10, 'A2': 10, 'term2': w, 'len_Bert': 3}, index=[0])
        q = ldr_new(I=I_b, B=B_b, M=M_b, N_df=new_df, WT=wt, batch_size=batch_sz)
        preds = bert_predict(bert_regressor.to(device), q)
        if wt == 'identity':
            df2 = out_df(data=q, predictions=preds,
                         df_ident=pd.concat([Identities, new_df], axis=0))
            if cus_col:
                columnss = ['EEA', 'EPA', 'EAA', 'ModA', 'Actor', 'Behavior', 'ModO', 'Object']
        if wt == 'behavior':
            df2 = out_df(data=q, predictions=preds,
                         df_beh=pd.concat([Behaviors, new_df], axis=0))
            if cus_col:
                columnss = ['EEB', 'EPB', 'EAB', 'ModA', 'Actor', 'Behavior', 'ModO', 'Object']
        if wt == 'modifier':
            df2 = out_df(data=q, predictions=preds,
                         df_mod=pd.concat([Modifiers, new_df], axis=0))
            if cus_col:
                columnss = ['EEMA', 'EPMA', 'EAMA', 'ModA', 'Actor', 'Behavior', 'ModO', 'Object']
        df = pd.concat([df, df2], axis=0)
    return df[columnss]
562
+
563
def gen_new(Identity, Behavior, Modifier, n_df, word_type):
    """Yield one random sample; the actor-side `word_type` slot is drawn
    from `n_df` instead of its reference dictionary.

    (Final definition -- shadows the earlier gen_new variants.)
    """
    # Sampling order matches the original (ident1, ident2, behav, modif1, modif2).
    ident1 = n_df.sample(axis=0) if word_type == 'identity' else Identity.sample(axis=0)
    ident2 = Identity.sample(axis=0)
    behav = n_df.sample(axis=0) if word_type == 'behavior' else Behavior.sample(axis=0)
    modif1 = n_df.sample(axis=0) if word_type == 'modifier' else Modifier.sample(axis=0)
    modif2 = Modifier.sample(axis=0)

    # Event text in MABMO order.
    sents = ' '.join(map(str, (list(modif1.term) + list(ident1.term) + list(behav.term)
                               + list(modif2.term) + list(ident2.term))))

    rows = (modif1, ident1, behav, modif2, ident2)
    values = np.concatenate([r[['E', 'P', 'A']].to_numpy() for r in rows], axis=1)[0]
    indexx = torch.tensor([int(r['index_in_dic'].to_numpy()[0]) for r in rows])
    ys = torch.tensor(values)
    inputs, masks = preprocessing_for_bert([sents])
    yield inputs, masks, ys, indexx
594
+
595
def get_output_agg(w, wt, I_b=n_Identities, B_b=n_Behaviors, M_b=n_Modifiers,
                   batch_sz=300, batch_num=1):
    """Aggregate predictions for a new word `w` of type `wt` over BOTH the
    actor-side and object-side slots of sampled events.

    BUGFIX: the original default arguments referenced the undefined names
    `n_I_v`/`n_B_v`/`n_M_v`, which raised NameError when the module was
    loaded; the normalized dictionaries defined above are used instead,
    matching get_output_new.

    For identities/modifiers, half the batch places the new word in the
    actor slot and half in the object slot (via ldr_new's `alt` flag); the
    two halves are renamed to a common EE/EP/EA/E/P/A schema and stacked.
    Behaviors occupy a single slot, so one full batch suffices.
    """
    df = pd.DataFrame()
    for i in range(batch_num):
        # Placeholder dictionary row for the new word (sentinel EPA of 10).
        new_df = pd.DataFrame({'index_in_dic': 4000, 'term': w, 'E': 10, 'P': 10, 'A': 10,
                               'E2': 10, 'P2': 10, 'A2': 10, 'term2': w, 'len_Bert': 3}, index=[0])
        batch1 = int(batch_sz / 2)
        batch2 = batch_sz - batch1
        if wt == 'behavior':
            q = ldr_new(I=I_b, B=B_b, M=M_b, N_df=new_df, WT=wt, batch_size=batch_sz)
            preds = bert_predict(bert_regressor.to(device), q)
            df2 = out_df(data=q, predictions=preds,
                         df_beh=pd.concat([Behaviors, new_df], axis=0))
            df = pd.concat([df, df2], axis=0)[['EEB','EPB','EAB','EB','PB', 'AB','ModA', 'Actor', 'Behavior', 'ModO', 'Object']].rename(columns={'EEB':'EE','EPB':'EP','EAB':'EA','EB':'E','PB':'P', 'AB':'A'})
        if wt == 'identity':
            # Actor slot.
            q = ldr_new(I=I_b, B=B_b, M=M_b, N_df=new_df, WT=wt, batch_size=batch1)
            preds = bert_predict(bert_regressor.to(device), q)
            df2 = out_df(data=q, predictions=preds,
                         df_ident=pd.concat([Identities, new_df], axis=0))
            df_act = pd.concat([df, df2], axis=0)
            df_act = df_act.copy()[['EEA','EPA','EAA','EA','PA', 'AA','ModA', 'Actor', 'Behavior', 'ModO', 'Object']].rename(columns={'EEA':'EE','EPA':'EP','EAA':'EA','EA':'E','PA':'P', 'AA':'A'})
            # Object slot (ldr_new must accept the `alt` keyword).
            q = ldr_new(I=I_b, B=B_b, M=M_b, N_df=new_df, WT=wt, batch_size=batch2, alt=1)
            preds = bert_predict(bert_regressor.to(device), q)
            df2 = out_df(data=q, predictions=preds,
                         df_ident=pd.concat([Identities, new_df], axis=0))
            df_obj = pd.concat([df, df2], axis=0)
            df_obj = df_obj.copy()[['EEO','EPO','EAO','EO', 'PO', 'AO','ModA', 'Actor', 'Behavior', 'ModO', 'Object']].rename(columns={'EEO':'EE','EPO':'EP','EAO':'EA','EO':'E','PO':'P', 'AO':'A'})
            df = pd.concat([df_act, df_obj], axis=0)
        if wt == 'modifier':
            # Actor-side modifier slot.
            q = ldr_new(I=I_b, B=B_b, M=M_b, N_df=new_df, WT=wt, batch_size=batch1)
            preds = bert_predict(bert_regressor.to(device), q)
            df2 = out_df(data=q, predictions=preds,
                         df_mod=pd.concat([Modifiers, new_df], axis=0))
            df_act = pd.concat([df, df2], axis=0)
            df_act = df_act.copy()[['EEMA', 'EPMA', 'EAMA','EM1', 'PM1', 'AM1','ModA', 'Actor', 'Behavior', 'ModO', 'Object']].rename(columns={'EEMA':'EE','EPMA':'EP','EAMA':'EA','EM1':'E','PM1':'P', 'AM1':'A'})
            # Object-side modifier slot.
            q = ldr_new(I=I_b, B=B_b, M=M_b, N_df=new_df, WT=wt, batch_size=batch2, alt=1)
            preds = bert_predict(bert_regressor.to(device), q)
            df2 = out_df(data=q, predictions=preds,
                         df_mod=pd.concat([Modifiers, new_df], axis=0))
            df_obj = pd.concat([df, df2], axis=0)
            df_obj = df_obj.copy()[['EEMO', 'EPMO', 'EAMO','EM2', 'PM2', 'AM2','ModA', 'Actor', 'Behavior', 'ModO', 'Object']].rename(columns={'EEMO':'EE','EPMO':'EP','EAMO':'EA','EM2':'E','PM2':'P', 'AM2':'A'})
            df = pd.concat([df_act, df_obj], axis=0)
    return df
637
+
638
def ldr_new(I, B, M, N_df, WT, batch_size=32, alt=0):
    """Collate one batch of custom-word samples (final definition).

    BUGFIX: restores the `alt` keyword (object-side substitution via
    gen_alt) that this redefinition dropped -- get_output_agg calls
    ldr_new(..., alt=1), which raised TypeError against the alt-less
    signature. `alt=0` reproduces the original behavior exactly.
    """
    gen = gen_alt if alt else gen_new
    samples = [next(gen(I, B, M, N_df, WT)) for _ in range(batch_size)]
    return next(iter(DataLoader(samples, batch_size=batch_size)))
641
+
642
+
643
+
644
def sent_gen(sentence):
    """Yield a prediction-only sample for a free-form sentence.

    Targets and indices are all-ones placeholders -- bert_predict only
    consumes the first two elements of the batch.
    """
    indexx = torch.tensor([1] * 12)   # placeholder indices
    ys = torch.tensor([1] * 12)       # placeholder targets
    inputs, masks = preprocessing_for_bert([sentence])
    yield inputs, masks, ys, indexx
650
def sent_ldr(sent2, batch_size=1):
    """Wrap a single free-form sentence into a collated one-item batch."""
    samples = [next(sent_gen(sent2)) for _ in range(batch_size)]
    return next(iter(DataLoader(samples, batch_size=batch_size)))
653
def EPA_sents(sent):
    """Predict de-normalized EPA triples for each word slot of `sent`.

    @param sent (str): free-form "modifier identity behavior modifier identity" text.
    @return 1-row DataFrame of 15 EPA values (M, I, B, M, I order), rounded
        to 2 decimals.

    FIX: dropped the `inplace=False` argument of set_axis, which pandas 2.x
    removed; omitting it is the pandas 1.x default, so behavior is unchanged.
    """
    q = sent_ldr(sent)
    predictions = bert_predict(bert_regressor.to(device), q)
    # Inverse-transform each 3-column slice with the scaler fitted on the
    # matching dictionary (M, I, B, M, I order).
    slices = [(scaler_M, 0, 3), (scaler_I, 3, 6), (scaler_B, 6, 9),
              (scaler_M, 9, 12), (scaler_I, 12, 15)]
    parts = [pd.DataFrame(scl.inverse_transform(predictions[:, a:b].cpu()))
             for scl, a, b in slices]
    df_out = pd.concat(parts, axis=1).set_axis(
        ['EEMA', 'EPMA', 'EAMA',
         'EEA', 'EPA', 'EAA', 'EEB', 'EPB', 'EAB',
         'EEMO', 'EPMO', 'EAMO', 'EEO', 'EPO', 'EAO'], axis=1)
    return df_out.round(decimals=2)
665
+
666
# Ref: https://stackoverflow.com/questions/28778668/freeze-header-in-pandas-dataframe

from ipywidgets import interact, IntSlider
from IPython.display import display

def freeze_header(df, num_rows=30, num_columns=10, step_rows=1,
                  step_columns=1):
    """
    Freeze the headers (column and index names) of a Pandas DataFrame. A widget
    enables to slide through the rows and columns.

    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame to display
    num_rows : int, optional
        Number of rows to display
    num_columns : int, optional
        Number of columns to display
    step_rows : int, optional
        Step in the rows
    step_columns : int, optional
        Step in the columns

    Returns
    -------
    Displays the DataFrame with the widget (notebook-only; returns None)
    """
    # Two sliders choose the end of the visible window; @interact renders
    # the inner function immediately and re-runs it on every slider change.
    @interact(last_row=IntSlider(min=min(num_rows, df.shape[0]),
                                 max=df.shape[0],
                                 step=step_rows,
                                 description='rows',
                                 readout=False,
                                 disabled=False,
                                 continuous_update=True,
                                 orientation='horizontal',
                                 slider_color='purple'),
              last_column=IntSlider(min=min(num_columns, df.shape[1]),
                                    max=df.shape[1],
                                    step=step_columns,
                                    description='columns',
                                    readout=False,
                                    disabled=False,
                                    continuous_update=True,
                                    orientation='horizontal',
                                    slider_color='purple'))
    def _freeze_header(last_row, last_column):
        # Show the window of `num_rows` x `num_columns` cells ending at the
        # slider positions, clamped at the top-left of the frame.
        display(df.iloc[max(0, last_row-num_rows):last_row,
                        max(0, last_column-num_columns):last_column])
713
+
714
+
715
+
716
+