Teslim Olunlade committed on
Commit
5b67bcb
·
1 Parent(s): e4000d0

Training tweaks

Browse files
Files changed (1) hide show
  1. app/train.py +11 -8
app/train.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/env python3
2
 
3
- import numpy as np
 
4
  import tensorflow as tf
5
  from datasets import load_dataset
6
  from tensorflow.keras.optimizers import Adam
@@ -18,7 +19,7 @@ label2id = {label: id for id, label in enumerate(labels)}
18
  id2label = {id: label for id, label in enumerate(labels)}
19
 
20
  dataset = load_dataset("csv", data_files="train.csv")
21
- tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
22
 
23
 
24
  def process_data(row):
@@ -27,19 +28,19 @@ def process_data(row):
27
 
28
  encoding = tokenizer(text, padding="max_length", truncation=True)
29
 
30
- labels_matrix = np.zeros((len(text), len(labels)))
31
 
32
  # fill numpy array
33
- for id, label in enumerate(labels):
34
- labels_matrix[:, id] = labels_batch[label]
35
 
36
- encoding["labels"] = labels_matrix.tolist()
37
 
38
  return encoding
39
 
40
 
41
  model = TFAutoModelForSequenceClassification.from_pretrained(
42
- "bert-base-cased",
43
  problem_type="multi_label_classification",
44
  num_labels=len(labels),
45
  label2id=label2id,
@@ -47,7 +48,9 @@ model = TFAutoModelForSequenceClassification.from_pretrained(
47
  )
48
 
49
  encoded = dataset.map(
50
- process_data, batched=True, remove_columns=[*labels, "id", "comment_text"]
 
 
51
  )
52
 
53
  tf_dataset = model.prepare_tf_dataset(
 
1
  #!/usr/bin/env python3
2
 
3
+ import multiprocessing
4
+
5
  import tensorflow as tf
6
  from datasets import load_dataset
7
  from tensorflow.keras.optimizers import Adam
 
19
  id2label = {id: label for id, label in enumerate(labels)}
20
 
21
  dataset = load_dataset("csv", data_files="train.csv")
22
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
23
 
24
 
25
  def process_data(row):
 
28
 
29
  encoding = tokenizer(text, padding="max_length", truncation=True)
30
 
31
+ label_arr = [0] * len(labels)
32
 
33
  # fill label array
34
+ for id, label in enumerate(labels_batch):
35
+ label_arr[id] = labels_batch[label]
36
 
37
+ encoding["labels"] = label_arr
38
 
39
  return encoding
40
 
41
 
42
  model = TFAutoModelForSequenceClassification.from_pretrained(
43
+ "bert-base-uncased",
44
  problem_type="multi_label_classification",
45
  num_labels=len(labels),
46
  label2id=label2id,
 
48
  )
49
 
50
  encoded = dataset.map(
51
+ process_data,
52
+ remove_columns=["id", "comment_text"],
53
+ num_proc=int(multiprocessing.cpu_count()),
54
  )
55
 
56
  tf_dataset = model.prepare_tf_dataset(