ajm19826 committed on
Commit
c569edd
·
verified ·
1 Parent(s): dd77dc3

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +54 -0
train.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from datasets import Dataset
3
+ from transformers import (
4
+ AutoTokenizer,
5
+ AutoModelForSequenceClassification,
6
+ Trainer,
7
+ TrainingArguments
8
+ )
9
+
10
# Fine-tune distilbert-base-uncased as an intent classifier.
#
# Reads data/intents.csv (columns used: "text" and "intent"), holds out 20% of
# the rows for per-epoch evaluation, trains for 6 epochs, and writes the final
# model and tokenizer to ./model.

# Fixed seed for the train/test split so evaluation numbers are comparable
# between runs (the original split was re-drawn randomly on every run).
SPLIT_SEED = 42


def _pick_kwarg(callable_obj, preferred, legacy):
    """Return ``preferred`` if ``callable_obj`` accepts it, else ``legacy``.

    Compatibility shim for keyword arguments that were renamed between
    library releases: the new name is used whenever the installed version
    exposes it, and the old name is used otherwise.
    """
    import inspect  # local import: only needed by this shim

    accepted = inspect.signature(callable_obj).parameters
    return preferred if preferred in accepted else legacy


# Load data
df = pd.read_csv("data/intents.csv")

# Rows missing either column cannot be tokenized or labelled, and a NaN intent
# makes sorted() below fail when mixed with strings, so drop them up front.
row_count = len(df)
df = df.dropna(subset=["text", "intent"]).reset_index(drop=True)
if len(df) != row_count:
    print(f"Dropped {row_count - len(df)} row(s) with missing text/intent")

# Sorting makes the intent -> id assignment independent of CSV row order.
labels = sorted(df.intent.unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

df["label"] = df.intent.map(label2id)
dataset = Dataset.from_pandas(df)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch, truncating over-long inputs.

    No padding is applied here: padding inside ``Dataset.map`` would pad every
    row to the longest example of its map batch and store that padding in the
    dataset. The Trainer pads each training batch itself when it is given the
    tokenizer.
    """
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# `evaluation_strategy` was renamed to `eval_strategy`; newer transformers
# releases reject the old spelling, older ones only know the old spelling.
eval_strategy_kwarg = _pick_kwarg(
    TrainingArguments, "eval_strategy", "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    logging_steps=10,
    save_strategy="epoch",
    **{eval_strategy_kwarg: "epoch"},
)

# Same story for Trainer: `tokenizer=` was superseded by `processing_class=`.
tokenizer_kwarg = _pick_kwarg(Trainer, "processing_class", "tokenizer")

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()
trainer.save_model("./model")
tokenizer.save_pretrained("./model")