# Maximum token sequence length used when padding/truncating tokenizer output.
MAX_LEN = 70

# Tokenize a single sample row to inspect what the tokenizer produces.
# max_length now uses MAX_LEN (it was hard-coded to 50) so this preview has
# the same sequence length as the features built by convert_example_to_feature.
bertInput = bert_tokenizer.encode_plus(
    data.Text[id_data],
    add_special_tokens=True,
    padding='max_length',
    truncation='longest_first',
    max_length=MAX_LEN,
    return_attention_mask=True,
    return_token_type_ids=True,
)

# Bare expression kept for notebook-style inspection of the returned keys.
bertInput.keys()

def convert_example_to_feature(sentence):
    """Tokenize a single sentence with the module-level ``bert_tokenizer``.

    The tokenizer is asked to add special tokens, pad to ``max_length``,
    truncate with the ``longest_first`` strategy at ``MAX_LEN`` tokens, and
    to return both the attention mask and the token type ids.

    Args:
        sentence: The text to tokenize (passed straight to ``encode_plus``).

    Returns:
        Whatever ``bert_tokenizer.encode_plus`` returns for these options.
    """
    tokenizer_options = {
        'add_special_tokens': True,
        'padding': 'max_length',
        'truncation': 'longest_first',
        'max_length': MAX_LEN,
        'return_attention_mask': True,
        'return_token_type_ids': True,
    }
    return bert_tokenizer.encode_plus(sentence, **tokenizer_options)
    
    
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack one example into a ``(features, label)`` pair.

    Args:
        input_ids: Token ids for the example.
        attention_masks: Attention mask for the example.
        token_type_ids: Token type (segment) ids for the example.
        label: Target label, returned unchanged.

    Returns:
        A 2-tuple ``(features, label)`` where ``features`` is a dict keyed by
        ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"``.
    """
    features = {
        # Used as the token embedding input.
        "input_ids": input_ids,
        # Used as the segment embedding input.
        "token_type_ids": token_type_ids,
        # Filters which positions the model takes into account.
        "attention_mask": attention_masks,
    }
    return features, label


def encode(data):
    """Tokenize every row of ``data`` and wrap the result in a tf.data dataset.

    Each row yielded by ``data.to_numpy()`` is unpacked as
    ``(label, sentence)``, so the rows must hold exactly those two values in
    that order (presumably ``data`` is a two-column DataFrame; verify against
    the caller).

    Args:
        data: Object exposing ``to_numpy()`` whose rows are
            ``(label, sentence)`` pairs.

    Returns:
        A ``tf.data.Dataset`` built from the input ids, attention masks,
        token type ids and labels, mapped through ``map_example_to_dict``.
    """
    encoded_rows = []
    labels = []
    for label, sentence in data.to_numpy():
        encoded_rows.append(convert_example_to_feature(sentence))
        # Each label is wrapped in its own single-element list.
        labels.append([label])

    ids = [row['input_ids'] for row in encoded_rows]
    segments = [row['token_type_ids'] for row in encoded_rows]
    masks = [row['attention_mask'] for row in encoded_rows]

    # Tuple order must match map_example_to_dict's positional parameters:
    # (input_ids, attention_masks, token_type_ids, label).
    dataset = tf.data.Dataset.from_tensor_slices((ids, masks, segments, labels))
    return dataset.map(map_example_to_dict)
  

# Hyperparameters. Only BATCH_SIZE is used in this section; the others are
# presumably consumed by training code further down.
BATCH_SIZE = 64
EPOCHS = 2
LEARNING_RATE = 5e-5

# Encode each split and group it into batches of BATCH_SIZE examples.
train_encode = encode(df_train).batch(batch_size=BATCH_SIZE)
test_encode = encode(df_test).batch(batch_size=BATCH_SIZE)
val_encode = encode(df_val).batch(batch_size=BATCH_SIZE)