Teslim Olunlade committed on
Commit
131f8ea
·
1 Parent(s): 75c005d

Successfully training

Browse files
Files changed (2) hide show
  1. app/.gitignore +2 -0
  2. app/train.py +23 -27
app/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ out/
2
+ train.csv
app/train.py CHANGED
@@ -1,36 +1,34 @@
1
  #!/usr/bin/env python3
2
 
3
- from collections import defaultdict
4
-
5
- import pandas as pd
6
- import tensorflow as tf
7
- from transformers import (
8
- AutoTokenizer,
9
- TFAutoModelForSequenceClassification,
10
- TFTrainer,
11
- TFTrainingArguments,
12
- )
13
 
14
  labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
15
  label2id = {label: id for id, label in enumerate(labels)}
16
  id2label = {id: label for id, label in enumerate(labels)}
17
 
18
- data = pd.read_csv("./train.csv")
19
-
20
- batch_encodings = defaultdict(list)
21
- batch_labels = list()
22
  tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
23
 
24
- for _, row in data.sample(n=3).iterrows():
 
25
  text = row["comment_text"]
26
- text_labels = {k: row[k] for k in row.keys() if k in labels}
27
 
28
  encoding = tokenizer(text, padding="max_length", truncation=True)
29
 
30
- batch_labels.append([text_labels[id2label[id]] for id in range(len(labels))])
 
 
 
 
 
 
 
 
31
 
32
- for key in encoding.keys():
33
- batch_encodings[key].append(encoding[key])
34
 
35
  model = TFAutoModelForSequenceClassification.from_pretrained(
36
  "bert-base-cased",
@@ -40,15 +38,13 @@ model = TFAutoModelForSequenceClassification.from_pretrained(
40
  id2label=id2label,
41
  )
42
 
43
- training_args = TFTrainingArguments(output_dir="test_trainer")
44
- train_dataset = tf.data.Dataset.from_tensor_slices(
45
- (dict(batch_encodings), batch_labels)
46
  )
47
 
48
- trainer = TFTrainer(
49
- model=model,
50
- args=training_args,
51
- train_dataset=train_dataset,
52
  )
53
 
54
- trainer.train()
 
 
1
  #!/usr/bin/env python3
2
 
3
import numpy as np
from datasets import load_dataset
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
 
 
 
 
 
 
7
 
8
# The six Jigsaw toxic-comment categories, plus lookup tables mapping
# label names to integer ids and back.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
label2id = {}
id2label = {}
for idx, name in enumerate(labels):
    label2id[name] = idx
    id2label[idx] = name

# Load the training CSV as a Hugging Face dataset (single "train" split).
dataset = load_dataset("csv", data_files="train.csv")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
14
 
15
+
16
def process_data(row):
    """Tokenize a batch of comments and attach a multi-hot label matrix.

    `row` is a batch dict from `datasets.Dataset.map(..., batched=True)`:
    every key maps to a list of column values, so `row["comment_text"]`
    is a list of strings and each label column is a list of 0/1 values.

    Returns the tokenizer encoding with an added "labels" entry holding a
    (batch_size, num_labels) matrix as nested lists of floats.
    """
    text = row["comment_text"]

    encoding = tokenizer(text, padding="max_length", truncation=True)

    # Multi-hot target matrix: one row per example, one column per label.
    # NOTE(review): len(text) is the batch size only because map() runs
    # with batched=True — on a single example it would be the char count.
    labels_matrix = np.zeros((len(text), len(labels)))
    # `col` instead of `id` to avoid shadowing the builtin; read each
    # label column straight from the batch dict (same KeyError behavior
    # as the former intermediate dict if a label column were missing).
    for col, label in enumerate(labels):
        labels_matrix[:, col] = row[label]

    encoding["labels"] = labels_matrix.tolist()
    return encoding
31
 
 
 
32
 
33
  model = TFAutoModelForSequenceClassification.from_pretrained(
34
  "bert-base-cased",
 
38
  id2label=id2label,
39
  )
40
 
41
# Tokenize the dataset and drop the raw columns; only the model inputs
# (input_ids, attention_mask, ...) and "labels" remain after mapping.
encoded = dataset.map(
    process_data, batched=True, remove_columns=[*labels, "id", "comment_text"]
)

# Build a batched, shuffled tf.data pipeline from the encoded train split.
tf_dataset = model.prepare_tf_dataset(
    encoded["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
)

# Multi-label classification: each of the six labels is an independent
# binary target (process_data builds a multi-hot matrix), so the loss must
# be binary — not categorical — cross-entropy, and from_logits=True because
# the sequence-classification head emits raw logits, not probabilities.
model.compile(optimizer=Adam(3e-5), loss=BinaryCrossentropy(from_logits=True))
model.fit(tf_dataset)