# Token length that convert_example_to_feature pads/truncates each sentence to.
MAX_LEN = 70

# Encode a single sample text so the tokenizer output can be inspected.
# NOTE(review): this call uses max_length=50 rather than MAX_LEN (70) --
# confirm whether the mismatch is intentional.
bertInput = bert_tokenizer.encode_plus(
    data.Text[id_data],
    add_special_tokens=True,
    padding="max_length",
    truncation="longest_first",
    max_length=50,
    return_attention_mask=True,
    return_token_type_ids=True,
)

# List the fields present in the sample encoding.
bertInput.keys()
|
|
def convert_example_to_feature(sentence):
    """Encode a single sentence with ``bert_tokenizer.encode_plus``.

    Special tokens are added, and the sequence is padded to / truncated at
    ``MAX_LEN`` (``padding="max_length"``, ``truncation="longest_first"``).
    Both the attention mask and the token type ids are requested.

    Args:
        sentence: Text handed to ``encode_plus`` as its first argument.

    Returns:
        Whatever ``bert_tokenizer.encode_plus`` returns for these options.
    """
    tokenizer_options = {
        "add_special_tokens": True,
        "padding": "max_length",
        "truncation": "longest_first",
        "max_length": MAX_LEN,
        "return_attention_mask": True,
        "return_token_type_ids": True,
    }
    return bert_tokenizer.encode_plus(sentence, **tokenizer_options)
| |
| |
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Regroup one example's fields into a ``(features, label)`` pair.

    Args:
        input_ids: Value stored under the ``"input_ids"`` key.
        attention_masks: Value stored under the ``"attention_mask"`` key.
        token_type_ids: Value stored under the ``"token_type_ids"`` key.
        label: Returned unchanged as the second item of the pair.

    Returns:
        A 2-tuple ``(features, label)`` where ``features`` is a dict with the
        keys ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"``.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label
|
|
|
|
def encode(data):
    """Tokenize every row of ``data`` and wrap the result in a tf.data dataset.

    Args:
        data: Object whose ``to_numpy()`` yields two-item rows; each row is
            unpacked as ``(label, sentence)``, in that order.

    Returns:
        The dataset produced by ``tf.data.Dataset.from_tensor_slices`` after
        mapping ``map_example_to_dict`` over it, so each element is a
        ``(features_dict, label)`` pair.
    """
    # One (encoding, [label]) pair per row; each label is wrapped in a
    # single-element list before being handed to TensorFlow.
    encoded_rows = [
        (convert_example_to_feature(sentence), [label])
        for label, sentence in data.to_numpy()
    ]

    def column(key):
        # Collect one tokenizer field across all rows, preserving row order.
        return [encoding[key] for encoding, _ in encoded_rows]

    # Tuple order matches map_example_to_dict's positional parameters:
    # input_ids, attention_masks, token_type_ids, label.
    tensors = (
        column("input_ids"),
        column("attention_mask"),
        column("token_type_ids"),
        [wrapped_label for _, wrapped_label in encoded_rows],
    )
    return tf.data.Dataset.from_tensor_slices(tensors).map(map_example_to_dict)
| |
|
|
# Training hyperparameters. Only BATCH_SIZE is used in this part of the file;
# EPOCHS and LEARNING_RATE are presumably consumed by training code elsewhere.
EPOCHS = 2
BATCH_SIZE = 64
LEARNING_RATE = 5e-5

# Encode each split with encode() and group the examples into batches.
train_encode = encode(df_train).batch(BATCH_SIZE)
test_encode = encode(df_test).batch(BATCH_SIZE)
val_encode = encode(df_val).batch(BATCH_SIZE)