moeen-m commited on
Commit
16f0d30
·
1 Parent(s): fa218dd

Upload predefined.py

Browse files
Files changed (1) hide show
  1. predefined.py +623 -0
predefined.py ADDED
@@ -0,0 +1,623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Notebook setup: install dependencies and fetch the model repository
# (IPython `!`/`%` magics — this file is a notebook export, not plain Python).
!pip install transformers

!pip install git-lfs
!git lfs install
!git clone https://huggingface.co/moeenm/BERTNN
%cd BERTNN
## Import required libraries
from datetime import datetime
import numpy as np
import pandas as pd
import random
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt
# NOTE(review): this tokenizer is re-created further down with
# do_lower_case=True; this first instance is effectively superseded.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')#bert-large-uncased
import itertools
from sklearn.preprocessing import StandardScaler
from itertools import cycle,islice
from random import sample
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
24
+
25
+
26
# Select the compute device: prefer a CUDA GPU when one is present.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")
if has_gpu:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
34
+
35
+ # Load and normalize the refrence EPA dictionary
36
def load_dictionary(file):
    """Load an EPA dictionary CSV and add helper columns.

    Keeps the dictionary row number in 'index_in_dic', preserves the raw
    term in 'term2', replaces underscores in 'term' with spaces, and
    records each term's BERT sub-token count in 'len_Bert'.
    """
    frame = pd.read_csv(file).reset_index().rename(columns={"index": 'index_in_dic'})
    frame['term2'] = frame['term']
    frame.term = frame.term.str.replace("_", " ")
    # Sub-token count per term (uses the module-level BERT tokenizer).
    frame['len_Bert'] = frame['term'].map(lambda t: len(tokenizer.tokenize(t)))
    return frame
43
+
44
# Load the three EPA dictionaries (modifiers, behaviors, identities).
Modifiers =load_dictionary("FullSurveyorInteract_Modifiers.csv")
Behaviors=load_dictionary("FullSurveyorInteract_Behaviors.csv")
Identities=load_dictionary("FullSurveyorInteract_Identities.csv")

# Normalized copies: the E/P/A columns are z-scored per dictionary.  The
# fitted scalers are kept so model outputs can be inverse-transformed back
# to raw EPA values later (see out_df / EPA_sents).
n_Modifiers = Modifiers.copy()
n_Behaviors =Behaviors.copy()
n_Identities = Identities.copy()

scaler_B = StandardScaler()
scaler_M = StandardScaler()
scaler_I = StandardScaler()

n_Behaviors[['E','P','A']] = scaler_B.fit_transform(Behaviors[['E','P','A']])
n_Modifiers[['E','P','A']] = scaler_M.fit_transform(Modifiers[['E','P','A']])
n_Identities[['E','P','A']] = scaler_I.fit_transform(Identities[['E','P','A']])


# Ref: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

# Random-state constant (declared here; individual calls may use their own).
rnd_st=42

# Load the BERT tokenizer (re-created here with explicit lower-casing;
# replaces the earlier instance created at import time).
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
67
+
68
+ # Create a function to tokenize a set of texts
69
def preprocessing_for_bert(data,MAX_LEN=40):
    """Perform required preprocessing steps for pretrained BERT.
    @param data (np.array): Array of texts to be processed.
    @param MAX_LEN (int): length each sentence is padded/truncated to.
    @return input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return attention_masks (torch.Tensor): Tensor of indices specifying which
    tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        # (1) Tokenize the sentence
        # (2) Add the `[CLS]` and `[SEP]` token to the start and end
        # (3) Truncate/Pad sentence to max length
        # (4) Map tokens to their IDs
        # (5) Create attention mask
        # (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,  # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,  # Max length to truncate/pad
            # pad_to_max_length=True,  # Pad sentence to max length
            padding='max_length',
            #return_tensors='pt',  # Return PyTorch tensor
            return_attention_mask=True  # Return attention mask
        )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors.
    # NOTE(review): only the FIRST sentence's encodings are kept ([0]), so the
    # returned tensors are 1-D of length MAX_LEN.  Every caller in this file
    # passes a single-element list and the downstream DataLoader collation
    # relies on this shape — confirm before "fixing" to full-batch tensors.
    input_ids = torch.tensor(input_ids[0])
    attention_masks = torch.tensor(attention_masks[0])

    return input_ids, attention_masks
108
+
109
+
110
+
111
+
112
+
113
+
114
+ # # Convert other data types to torch.Tensor
115
+ # from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
116
def gnrtr2(Identity,Behavior,Modifier):
    """Yield one random training sample for the EPA regressor.

    Draws two identities, one behavior and two modifiers, forms the event
    sentence "mod1 id1 beh mod2 id2", and yields
    (input_ids, attention_mask, targets(15,), dictionary_indices(5,)).
    """
    # Draw one random row from each dictionary.  The call order below is
    # kept identical to the original so seeded runs stay reproducible.
    ident1 = Identity.sample(axis=0)
    ident2 = Identity.sample(axis=0)
    behav = Behavior.sample(axis=0)
    modif1 = Modifier.sample(axis=0)
    modif2 = Modifier.sample(axis=0)

    # Slots in sentence/target order: mod1, id1, beh, mod2, id2.
    ordered = (modif1, ident1, behav, modif2, ident2)
    sents = ' '.join(str(t) for row in ordered for t in row.term)
    # Ground-truth EPA values, concatenated slot by slot -> shape (15,).
    values = np.concatenate([row[['E', 'P', 'A']].to_numpy() for row in ordered],
                            axis=1)[0]
    # Dictionary row index of each sampled term (used to recover terms later).
    indexx = torch.tensor([row['index_in_dic'].to_numpy()[0] for row in ordered])
    ys = torch.tensor(values)
    inputs, masks = preprocessing_for_bert([sents])
    yield inputs, masks, ys, indexx
143
+ # For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
144
def dta_ldr2(I,B,M,batch_size=32):
    """Collate `batch_size` random gnrtr2 samples into one batch tuple.

    For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
    """
    samples = [next(gnrtr2(I, B, M)) for _ in range(batch_size)]
    # Dataset size equals batch_size, so the loader yields exactly one batch.
    return next(iter(DataLoader(samples, batch_size=batch_size)))
147
+
148
+
149
+
150
+ # # Convert other data types to torch.Tensor
151
+
152
+ # For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
153
def dta_ldr(I,B,M,batch_size=32):
    """Collate `batch_size` random training samples into one batch tuple.

    Fix: the original called an undefined name `gnrtr` (a NameError at call
    time); the sampling generator defined in this file is `gnrtr2`, which is
    what this loader now uses.
    """
    samples = [next(gnrtr2(I, B, M)) for _ in range(batch_size)]
    # Dataset size equals batch_size, so the loader yields exactly one batch.
    return next(iter(DataLoader(samples, batch_size=batch_size)))
156
class BertRegressor(nn.Module):
    """BERT encoder with a small feed-forward head for EPA regression."""

    def __init__(self, freeze_bert=False):
        """
        @param freeze_bert (bool): Set `False` to fine-tune the BERT model.
        """
        super(BertRegressor, self).__init__()
        # Encoder width, head hidden size, and number of regression targets
        # (five slots x three EPA dimensions = 15).
        D_in, H, D_out = 1024, 120, 15

        # Pretrained encoder.
        self.bert = BertModel.from_pretrained('bert-large-uncased')

        # Regression head.  The Sequential layout (and hence the state_dict
        # keys regressor.0 ... regressor.5) is kept identical to the released
        # checkpoint so load_state_dict keeps working.
        self.regressor = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(D_in, H),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(H, D_out),
        )

        # Optionally freeze the encoder so only the head trains.
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Run BERT and regress 15 values from the `[CLS]` embedding.

        @param input_ids (torch.Tensor): shape (batch_size, max_length)
        @param attention_mask (torch.Tensor): shape (batch_size, max_length)
        @return (torch.Tensor): shape (batch_size, 15)
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Last hidden state of the [CLS] token feeds the regression head.
        cls_state = outputs[0][:, 0, :]
        return self.regressor(cls_state)
211
+ from transformers import AdamW, get_linear_schedule_with_warmup
212
+
213
def initialize_model(epochs=4):
    """Initialize the regressor, its AdamW optimizer and LR scheduler.

    @return (model, optimizer, scheduler) ready for training on `device`.
    """
    bert_regressor = BertRegressor(freeze_bert=False)
    # Move the model to the GPU/CPU chosen at import time.
    bert_regressor.to(device)

    optimizer = AdamW(
        bert_regressor.parameters(),
        lr=2e-5,            # small LR, standard for BERT fine-tuning
        eps=1e-8,           # default epsilon value
        weight_decay=0.001  # decoupled weight decay
    )

    # Fixed step budget: batches are generated on the fly, so there is no
    # dataloader length to multiply by `epochs`.
    total_steps = 100000

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # default value
        num_training_steps=total_steps)
    return bert_regressor, optimizer, scheduler
237
+ import random
238
+ import time
239
+
240
# Specify loss function: mean-squared error over the 15 EPA targets.
loss_fn = nn.MSELoss()
242
+
243
def set_seed(seed_value=42):
    """Seed the python, numpy and torch RNGs for reproducibility.

    @param seed_value (int): seed applied to every RNG.
    """
    # One seed for every generator the pipeline touches (CPU and CUDA).
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed_value)
250
+
251
def train(model, I_trn=None, B_trn=None, M_trn=None, batch_size_trn=32,
          I_tst=None, B_tst=None, M_tst=None, batch_size_tst=32,
          batch_size=50, batch_epochs=400, evaluation=False):
    """Train the BertRegressor on batches generated on the fly.

    Fixes over the original:
      * the original defaults referenced undefined names (n_I_train,
        n_B_train, ...), which raised NameError when the module was loaded;
        omitted arguments now fall back to the full normalized dictionaries
        defined in this module.
      * `time_elapsed` is initialized up front and the final summary guards
        against division by zero (batch_counts is 0 right after a
        50-batch reset, e.g. with the default batch_epochs=400).

    Relies on module-level `optimizer`, `scheduler`, `loss_fn` and `device`.

    @param model: BertRegressor to train (updated in place).
    @param I_trn/B_trn/M_trn: normalized training dictionaries (default: full).
    @param I_tst/B_tst/M_tst: normalized validation dictionaries (default: full).
    @param batch_size_trn/batch_size_tst (int): samples per generated batch.
    @param batch_epochs (int): number of training batches to run.
    @param evaluation (bool): run an extra validation pass after training.
    """
    # Fall back to the full normalized dictionaries defined in this module.
    I_trn = n_Identities if I_trn is None else I_trn
    B_trn = n_Behaviors if B_trn is None else B_trn
    M_trn = n_Modifiers if M_trn is None else M_trn
    I_tst = n_Identities if I_tst is None else I_tst
    B_tst = n_Behaviors if B_tst is None else B_tst
    M_tst = n_Modifiers if M_tst is None else M_tst

    print("Start training...\n")
    # Header of the result table.
    print(f" {'Batch':^5} | {'Train Loss':^12} | {'Val Loss':^10} | {'Elapsed':^9}")
    print("-"*50)

    t0_batch = time.time()
    time_elapsed = 0.0            # defined up front so the final print is safe
    batch_loss, batch_counts = 0, 0
    model.train()

    for batch in range(batch_epochs):
        batch_counts += 1
        if batch == 456:          # hard stop kept from the original script
            break
        # Generate a fresh random batch and move it to the device.
        b_input_ids, b_attn_mask, b_ys, _ = tuple(
            t.to(device) for t in dta_ldr(I=I_trn, B=B_trn, M=M_trn,
                                          batch_size=batch_size_trn))
        # Zero out any previously calculated gradients.
        model.zero_grad()
        preds = model(b_input_ids, b_attn_mask)
        loss = loss_fn(preds.float(), b_ys.float())
        batch_loss += loss.item()
        loss.backward()
        # Clip gradient norm to 1.0 to prevent "exploding gradients".
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and the learning rate.
        optimizer.step()
        scheduler.step()

        # Report train/val loss every 50 batches, then reset the trackers.
        if batch_counts % 50 == 0:
            time_elapsed = time.time() - t0_batch
            val_loss = evaluate(model, Ie=I_tst, Be=B_tst, Me=M_tst,
                                batch_size_e=batch_size_tst)
            print(f"{batch+ 1:^7}|{batch_loss / batch_counts:^12.6f} | {val_loss:^10.6f} | {time_elapsed:^9.2f}")
            print("-"*50)
            batch_loss, batch_counts = 0, 0
            t0_batch = time.time()

    if evaluation == True:
        # Extra validation pass after training.
        val_loss = evaluate(model, Ie=I_tst, Be=B_tst, Me=M_tst,
                            batch_size_e=batch_size_tst)
        if val_loss < 0.32:
            print('\n Consider this one with val:', val_loss, ' at:', batch, '\n')
        print("-"*50)

    # Final summary line; skip the loss average when no batches have
    # accumulated since the last reset (avoids ZeroDivisionError).
    val_loss = evaluate(model, Ie=I_tst, Be=B_tst, Me=M_tst,
                        batch_size_e=batch_size_tst)
    if batch_counts:
        print(f"{batch+ 1:^7}|{batch_loss / batch_counts:^12.6f} | {val_loss:^10.6f} | {time_elapsed:^9.2f}")
    print("Training complete!")
337
+
338
+
339
def evaluate(model, Ie,Be,Me,batch_size_e):
    """Return the mean MSE of `model` on one freshly generated batch.

    @param model: BertRegressor to evaluate.
    @param Ie/Be/Me: normalized validation dictionaries to sample from.
    @param batch_size_e (int): number of samples in the validation batch.
    """
    # Dropout layers must be disabled at test time.
    model.eval()

    losses = []
    # A single generated batch, matching the original's range(1) loop.
    for _ in range(1):
        b_input_ids, b_attn_mask, b_ys, _ = tuple(
            t.to(device) for t in dta_ldr2(Ie, Be, Me, batch_size_e))
        with torch.no_grad():
            preds = model(b_input_ids, b_attn_mask)
        losses.append(loss_fn(preds, b_ys).item())

    # Average loss over the validation batches.
    return np.mean(losses)
368
+
369
+
370
+
371
# Instantiate the regressor and restore the released fine-tuned weights.
bert_regressor = BertRegressor()
# map_location lets a checkpoint saved on GPU load onto the current device.
bert_regressor.load_state_dict(torch.load("MAMBO_Cluster_deflection_training_impression_change",map_location=torch.device(device)))
# Inference-only from here on: switch dropout layers off.
bert_regressor.eval()
374
+
375
+
376
+
377
+
378
def bert_predict(model, test_dataloader):
    """Forward the trained model over one prepared batch and return predictions.

    @param model: trained BertRegressor.
    @param test_dataloader: batch tuple (inputs, masks, ...) as produced by
        dta_ldr2 / ldr_new / sent_ldr; only the first two elements are used.
    @return all_preds (torch.Tensor): (batch, 15) predictions.
    """
    # Dropout layers are disabled during test time.
    model.eval()

    all_preds = []
    # Single pre-collated batch, matching the original's range(1) loop.
    for _ in range(1):
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in test_dataloader)[:2]
        with torch.no_grad():
            batch_preds = model(b_input_ids, b_attn_mask)
        all_preds.append(batch_preds)

    # Concatenate predictions from each batch.
    return torch.cat(all_preds, dim=0)
400
def out_df(data,predictions,df_beh=Behaviors,df_ident=Identities,df_mod=Modifiers):
    """Pair predicted and true EPA values in one labeled DataFrame.

    @param data: batch tuple from the data loaders — data[2] holds the true
        (normalized) EPA targets, data[3] the dictionary indices.
    @param predictions (torch.Tensor): (batch, 15) model output.
    @param df_beh/df_ident/df_mod: dictionaries used to map indices back to
        terms (overridable so injected new words can be resolved).
    @return df2 (pd.DataFrame): predicted EPA (EE*/EP*/EA* columns), true EPA,
        and the five resolved terms per row.
    """
    # Column slots: 0-2 actor modifier, 3-5 actor identity, 6-8 behavior,
    # 9-11 object modifier, 12-14 object identity.  Each slot is
    # de-normalized with the scaler fitted on its own dictionary.
    df2=pd.concat([pd.DataFrame(scaler_M.inverse_transform(predictions[:,0:3].cpu())),
                   pd.DataFrame(scaler_M.inverse_transform(data[2][:,0:3])),
                   pd.DataFrame(scaler_I.inverse_transform(predictions[:,3:6].cpu())),
                   pd.DataFrame(scaler_I.inverse_transform(data[2][:,3:6])),
                   pd.DataFrame(scaler_B.inverse_transform(predictions[:,6:9].cpu())),
                   pd.DataFrame(scaler_B.inverse_transform(data[2][:,6:9])),
                   pd.DataFrame(scaler_M.inverse_transform(predictions[:,9:12].cpu())),
                   pd.DataFrame(scaler_M.inverse_transform(data[2][:,9:12])),
                   pd.DataFrame(scaler_I.inverse_transform(predictions[:,12:15].cpu())),
                   pd.DataFrame(scaler_I.inverse_transform(data[2][:,12:15])),pd.DataFrame(np.array(data[3]))
                   ],axis=1).set_axis(['EEMA', 'EPMA', 'EAMA','EM1', 'PM1', 'AM1',
                                       'EEA', 'EPA', 'EAA','EA', 'PA', 'AA',
                                       'EEB', 'EPB', 'EAB','EB', 'PB', 'AB',
                                       'EEMO', 'EPMO', 'EAMO','EM2', 'PM2', 'AM2',
                                       'EEO', 'EPO', 'EAO','EO', 'PO', 'AO',
                                       'idx_ModA','idx_Act','idx_Beh','idx_ModO','idx_Obj'], axis=1, inplace=False)
    # Resolve each index column back to its dictionary term.
    df2=pd.merge(df2, df_mod[['term','index_in_dic']], left_on= ['idx_ModA'], right_on = ["index_in_dic"],
                 how='left').rename(columns={"term": 'ModA'}).drop(['index_in_dic'], axis=1)
    df2=pd.merge(df2, df_ident[['term','index_in_dic']], left_on= ['idx_Act'], right_on = ["index_in_dic"],
                 how='left').rename(columns={"term": 'Actor'}).drop(['index_in_dic'], axis=1)
    df2=pd.merge(df2, df_beh[['term','index_in_dic']], left_on= ['idx_Beh'], right_on = ["index_in_dic"],
                 how='left').rename(columns={"term": 'Behavior'}).drop(['index_in_dic'], axis=1)
    df2=pd.merge(df2, df_mod[['term','index_in_dic']], left_on= ['idx_ModO'], right_on = ["index_in_dic"],
                 how='left').rename(columns={"term": 'ModO'}).drop(['index_in_dic'], axis=1)
    df2=pd.merge(df2, df_ident[['term','index_in_dic']], left_on= ['idx_Obj'], right_on = ["index_in_dic"],
                 how='left').rename(columns={"term": 'Object'}).drop(['index_in_dic'], axis=1)

    # Fixed display order: predicted EPA, true EPA, then the resolved terms.
    df2=df2[['EEMA','EPMA', 'EAMA', 'EEA', 'EPA', 'EAA', 'EEB', 'EPB', 'EAB','EEMO', 'EPMO', 'EAMO', 'EEO', 'EPO', 'EAO','EM1', 'PM1', 'AM1','EA', 'PA', 'AA', 'EB', 'PB','AB', 'EM2', 'PM2', 'AM2', 'EO',
             'PO', 'AO', 'ModA','Actor','Behavior', 'ModO', 'Object']]
    return(df2)
431
+
432
def get_output(I_b=n_Identities,B_b=n_Behaviors,M_b=n_Modifiers,batch_sz=3000,batch_num=10):
    """Generate `batch_num` random batches, predict, and stack the results.

    @return pd.DataFrame: out_df rows concatenated over all batches.
    """
    df = pd.DataFrame()
    for _ in range(batch_num):
        # Fresh random batch, predictions, and a labeled results table.
        batch = dta_ldr2(I=I_b, B=B_b, M=M_b, batch_size=batch_sz)
        preds = bert_predict(bert_regressor.to(device), batch)
        df = pd.concat([df, out_df(data=batch, predictions=preds)], axis=0)
    return df
440
def gen_new(Identity,Behavior,Modifier,n_df,word_type):
    # NOTE(review): this definition is shadowed by a second `gen_new` defined
    # later in this file (which samples without fixed random_state values),
    # so this deterministic version is dead code at runtime.
    """Yield one sample whose `word_type` slot is drawn from `n_df`.

    The fixed random_state values make every draw deterministic.
    @param n_df: single-row frame holding the injected new word.
    @param word_type (str): 'identity', 'behavior' or 'modifier'.
    """
    if word_type=='identity':
        ident1=n_df.sample(axis = 0,random_state=56)
    else:ident1=Identity.sample(axis = 0,random_state=6)
    ident2=Identity.sample(axis = 0,random_state=6)
    if word_type=='behavior':
        behav=n_df.sample(axis = 0,random_state=5)
    else: behav=Behavior.sample(axis = 0,random_state=5)
    if word_type=='modifier':
        modif1=n_df.sample(axis = 0,random_state=55)
    else: modif1=Modifier.sample(axis = 0)
    modif2=Modifier.sample(axis = 0,random_state=96)
    id1=list(ident1.term)
    id2=list(ident2.term)
    beh=list(behav.term)
    mod1=list(modif1.term)
    mod2=list(modif2.term)
    # wrdvc_ident1=gs_model.get_vector((list(ident1.trm_org))[0], norm=True)
    # Event sentence in slot order: mod1 id1 beh mod2 id2.
    sents=' '.join(map(str, (mod1+id1+beh+mod2+id2)))
    # True EPA values concatenated slot by slot -> shape (15,).
    values=np.concatenate([(modif1[['E','P','A']]).to_numpy(),
                           (ident1[['E','P','A']]).to_numpy(),
                           (behav[['E','P','A']]).to_numpy(),
                           (modif2[['E','P','A']]).to_numpy(),
                           (ident2[['E','P','A']]).to_numpy()], axis=1)[0]
    # print(values)
    #indexx=[(ident1['index_in_dic']).to_numpy()][0][0]
    # Dictionary row index of each sampled term.
    indexx=torch.tensor([[(modif1['index_in_dic']).to_numpy()][0][0],
                         [(ident1['index_in_dic']).to_numpy()][0][0],
                         [(behav['index_in_dic']).to_numpy()][0][0],
                         [(modif2['index_in_dic']).to_numpy()][0][0],
                         [(ident2['index_in_dic']).to_numpy()][0][0]])
    ys= torch.tensor(values)


    inputs, masks = preprocessing_for_bert([sents])
    # data=TensorDataset(inputs, masks, ys)

    yield inputs, masks, ys,indexx #torch.tensor(sents),
478
def ldr_new(I,B,M,N_df,WT,batch_size=32):
    # NOTE(review): shadowed by an identical `ldr_new` redefinition later in
    # this file; kept byte-identical here.
    # Collate `batch_size` gen_new samples into a single batch tuple.
    dt_ldr= [x for x in DataLoader([next(gen_new(I,B,M,N_df,WT)) for x in range(batch_size)], batch_size=batch_size)][0]
    return(dt_ldr)
481
+
482
+
483
+
484
def get_output_new(w,wt,I_b=n_Identities,B_b=n_Behaviors,M_b=n_Modifiers,batch_sz=300,batch_num=1):
    """Predict EPA profiles for batches that include a new word `w`.

    Fix: the original left `df2` unbound (raising NameError) whenever `wt`
    was not one of the three supported slot types; that is now an explicit
    ValueError raised up front.

    @param w (str): the new term to inject.
    @param wt (str): slot type — 'identity', 'behavior' or 'modifier'.
    @param I_b/B_b/M_b: normalized dictionaries to sample the other slots from.
    @raise ValueError: if `wt` is not a supported slot type.
    @return pd.DataFrame: out_df rows concatenated over all batches.
    """
    if wt not in ('identity', 'behavior', 'modifier'):
        raise ValueError(
            f"wt must be 'identity', 'behavior' or 'modifier', got {wt!r}")

    df = pd.DataFrame()
    for i in range(batch_num):
        # Placeholder row for the new word; the EPA values (10s) are dummies
        # since the true values are unknown.
        new_df = pd.DataFrame({'index_in_dic':1000,'term':w,'E':10,'P':10,'A':10,
                               'E2':10,'P2':10,'A2':10,'term2':w,'len_Bert':3}, index=[0])
        q = ldr_new(I=I_b, B=B_b, M=M_b, N_df=new_df, WT=wt, batch_size=batch_sz)
        preds = bert_predict(bert_regressor.to(device), q)

        # Extend the matching dictionary with the new word so out_df can
        # resolve its index (1000) back to the term.
        if wt == 'identity':
            df2 = out_df(data=q, predictions=preds,
                         df_ident=pd.concat([Identities, new_df], axis=0))
        elif wt == 'behavior':
            df2 = out_df(data=q, predictions=preds,
                         df_beh=pd.concat([Behaviors, new_df], axis=0))
        else:  # 'modifier'
            df2 = out_df(data=q, predictions=preds,
                         df_mod=pd.concat([Modifiers, new_df], axis=0))

        df = pd.concat([df, df2], axis=0)
    return df
505
+
506
def gen_new(Identity,Behavior,Modifier,n_df,word_type):
    """Yield one random sample whose `word_type` slot is drawn from `n_df`.

    Identical to gnrtr2 except that one slot (identity-1, behavior, or
    modifier-1, chosen by `word_type`) is sampled from the new-word frame.
    """
    # Sampling call order kept identical to the original for reproducibility.
    ident1 = (n_df if word_type == 'identity' else Identity).sample(axis=0)
    ident2 = Identity.sample(axis=0)
    behav = (n_df if word_type == 'behavior' else Behavior).sample(axis=0)
    modif1 = (n_df if word_type == 'modifier' else Modifier).sample(axis=0)
    modif2 = Modifier.sample(axis=0)

    # Slots in sentence/target order: mod1, id1, beh, mod2, id2.
    ordered = (modif1, ident1, behav, modif2, ident2)
    sents = ' '.join(str(t) for row in ordered for t in row.term)
    # True EPA values concatenated slot by slot -> shape (15,).
    values = np.concatenate([row[['E', 'P', 'A']].to_numpy() for row in ordered],
                            axis=1)[0]
    # Dictionary row index of each sampled term.
    indexx = torch.tensor([row['index_in_dic'].to_numpy()[0] for row in ordered])
    ys = torch.tensor(values)
    inputs, masks = preprocessing_for_bert([sents])
    yield inputs, masks, ys, indexx
542
def ldr_new(I,B,M,N_df,WT,batch_size=32):
    """Collate `batch_size` gen_new samples into a single batch tuple."""
    samples = [next(gen_new(I, B, M, N_df, WT)) for _ in range(batch_size)]
    # Dataset size equals batch_size, so the loader yields exactly one batch.
    return next(iter(DataLoader(samples, batch_size=batch_size)))
545
+
546
+
547
+
548
def sent_gen(sentence):
    """Wrap a free-form sentence as a model input sample.

    Targets and indices are dummy all-ones tensors — they are unused at
    inference but keep the tuple shape the loaders expect.
    """
    ys = torch.ones(12, dtype=torch.long)
    indexx = torch.ones(12, dtype=torch.long)
    inputs, masks = preprocessing_for_bert([sentence])
    yield inputs, masks, ys, indexx
554
def sent_ldr(sent2,batch_size=1):
    """Collate `batch_size` copies of a sentence sample into one batch."""
    samples = [next(sent_gen(sent2)) for _ in range(batch_size)]
    # Dataset size equals batch_size, so the loader yields exactly one batch.
    return next(iter(DataLoader(samples, batch_size=batch_size)))
557
def EPA_sents(sent):
    """Predict and de-normalize the 15 EPA values for a raw sentence.

    @param sent (str): free-form event sentence.
    @return pd.DataFrame: one row of predicted EPA values, rounded to 2 dp.
    """
    batch = sent_ldr(sent)
    predictions = bert_predict(bert_regressor.to(device), batch)
    # Undo normalization slot by slot: modifier1, identity1 (actor),
    # behavior, modifier2, identity2 (object).
    slot_scalers = (scaler_M, scaler_I, scaler_B, scaler_M, scaler_I)
    parts = [pd.DataFrame(sc.inverse_transform(predictions[:, 3*i:3*i+3].cpu()))
             for i, sc in enumerate(slot_scalers)]
    df_out = pd.concat(parts, axis=1).set_axis(
        ['EEMA', 'EPMA', 'EAMA',
         'EEA', 'EPA', 'EAA', 'EEB', 'EPB', 'EAB',
         'EEMO', 'EPMO', 'EAMO', 'EEO', 'EPO', 'EAO'], axis=1, inplace=False)
    return df_out.round(decimals=2)
569
+
570
+ # Ref: https://stackoverflow.com/questions/28778668/freeze-header-in-pandas-dataframe
571
+
572
+ from ipywidgets import interact, IntSlider
573
+ from IPython.display import display
574
+
575
def freeze_header(df, num_rows=30, num_columns=10, step_rows=1,
                  step_columns=1):
    """
    Freeze the headers (column and index names) of a Pandas DataFrame. A widget
    enables to slide through the rows and columns.

    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame to display
    num_rows : int, optional
        Number of rows to display
    num_columns : int, optional
        Number of columns to display
    step_rows : int, optional
        Step in the rows
    step_columns : int, optional
        Step in the columns

    Returns
    -------
    Displays the DataFrame with the widget
    """
    # Two sliders (rows/columns) drive a windowed view of df; `interact`
    # re-renders the slice on every slider move.
    @interact(last_row=IntSlider(min=min(num_rows, df.shape[0]),
                                 max=df.shape[0],
                                 step=step_rows,
                                 description='rows',
                                 readout=False,
                                 disabled=False,
                                 continuous_update=True,
                                 orientation='horizontal',
                                 slider_color='purple'),
              last_column=IntSlider(min=min(num_columns, df.shape[1]),
                                    max=df.shape[1],
                                    step=step_columns,
                                    description='columns',
                                    readout=False,
                                    disabled=False,
                                    continuous_update=True,
                                    orientation='horizontal',
                                    slider_color='purple'))
    def _freeze_header(last_row, last_column):
        # Show a num_rows x num_columns window ending at the slider positions.
        display(df.iloc[max(0, last_row-num_rows):last_row,
                        max(0, last_column-num_columns):last_column])
619
+
620
+
621
+
622
# Column order for displaying prediction tables: the 15 predicted EPA
# columns (EE*/EP*/EA* per slot) followed by the five resolved terms.
cols=['EEMA', 'EPMA', 'EAMA', 'EEA', 'EPA', 'EAA', 'EEB', 'EPB', 'EAB',
      'EEMO', 'EPMO', 'EAMO', 'EEO', 'EPO', 'EAO', 'ModA', 'Actor', 'Behavior', 'ModO', 'Object']