ajm19826 committed on
Commit
3da7b20
·
verified ·
1 Parent(s): 8549401

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +60 -0
train.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForSequenceClassification,
5
+ Trainer,
6
+ TrainingArguments
7
+ )
8
+ import pandas as pd
9
+
10
+ # Load dataset
11
+ df = pd.read_csv("data/vibes.csv")
12
+ dataset = load_dataset("csv", data_files="data/vibes.csv")
13
+
14
+ labels = ["negative", "neutral", "positive"]
15
+ label2id = {l: i for i, l in enumerate(labels)}
16
+ id2label = {i: l for l, i in label2id.items()}
17
+
18
+ def encode_labels(example):
19
+ example["label"] = label2id[example["label"]]
20
+ return example
21
+
22
+ dataset = dataset.map(encode_labels)
23
+
24
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
25
+
26
+ def tokenize(batch):
27
+ return tokenizer(batch["text"], truncation=True, padding=True)
28
+
29
+ dataset = dataset.map(tokenize, batched=True)
30
+ dataset = dataset["train"].train_test_split(test_size=0.2)
31
+
32
+ model = AutoModelForSequenceClassification.from_pretrained(
33
+ "distilbert-base-uncased",
34
+ num_labels=3,
35
+ id2label=id2label,
36
+ label2id=label2id
37
+ )
38
+
39
+ training_args = TrainingArguments(
40
+ output_dir="./model",
41
+ evaluation_strategy="epoch",
42
+ per_device_train_batch_size=8,
43
+ per_device_eval_batch_size=8,
44
+ num_train_epochs=5,
45
+ save_strategy="epoch",
46
+ logging_dir="./logs",
47
+ logging_steps=10
48
+ )
49
+
50
+ trainer = Trainer(
51
+ model=model,
52
+ args=training_args,
53
+ train_dataset=dataset["train"],
54
+ eval_dataset=dataset["test"],
55
+ tokenizer=tokenizer
56
+ )
57
+
58
+ trainer.train()
59
+ trainer.save_model("./model")
60
+ tokenizer.save_pretrained("./model")