Yanael committed on
Commit
fa1b8fd
·
1 Parent(s): 1051566

Upload train.py

Browse files
Files changed (1) hide show
  1. train.py +90 -0
train.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ os.environ["HF_HOME"] = "/home/jovyan/work/learn-ml/huggingface"
4
+
5
+ import torch
6
+ from torch.utils.data import DataLoader
7
+
8
+ from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler
9
+ from datasets import load_dataset, load_metric
10
+ from accelerate import Accelerator
11
+
12
+ from tqdm.auto import tqdm
13
+
14
+
15
+
16
# Pretrained checkpoint used for both the tokenizer and the classifier.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE: the model itself is deliberately NOT instantiated here. The
# original script built it at this point and then rebuilt it from the
# same checkpoint right before accelerator.prepare() further down,
# discarding this first copy unused — so the redundant (and expensive)
# first load is removed.

# GLUE MRPC: sentence-pair paraphrase detection, 2 labels.
raw_datasets = load_dataset("glue", "mrpc")
25
def tokenize_function(example):
    """Tokenize one MRPC example (or a batch of them, under batched map).

    Uses the module-level ``tokenizer`` on the sentence pair; truncates
    to the model's maximum length but leaves padding to the collator.
    """
    encoded = tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
    )
    return encoded
27
+
28
# Tokenize every split; batched=True feeds the fast tokenizer many
# examples per call.
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)

# Pads each batch dynamically to its longest sequence — cheaper than
# padding the whole dataset to one global max length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Debug peek: lengths of the first 8 (still unpadded) training examples,
# confirming tokenization produced variable-length inputs.
samples = tokenized_dataset["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ("idx", "sentence1", "sentence2")}
print([len(x) for x in samples["input_ids"]])

# Keep only the columns the model's forward() accepts, rename the target
# column to the name transformers expects ("labels"), and make
# __getitem__ return torch tensors.
tokenized_dataset = tokenized_dataset.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
# The original had a bare `tokenized_dataset.column_names["train"]`
# expression (notebook residue with no effect in a script); print it so
# the intended sanity check actually shows up.
print(tokenized_dataset.column_names["train"])
43
+
44
+
45
# Batches of 8. Training data is reshuffled every epoch; validation is
# iterated in order. The collator pads each batch on the fly.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)

eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)
52
+
53
+
54
+
55
# Accelerator abstracts device placement (CPU / single GPU / multi-GPU)
# and mixed precision.
accelerator = Accelerator()

# Fresh 2-label classification head on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# transformers.AdamW is deprecated (and removed in recent transformers
# releases); torch.optim.AdamW is the officially recommended drop-in
# replacement. `torch` is already imported at the top of the file.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# prepare() moves the model/optimizer to the right device and wraps the
# dataloaders so the same loop works in distributed setups.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)
63
+
64
# Total number of optimization steps drives the LR schedule: linear
# decay from the initial LR down to 0 over the whole run, no warmup.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)
74
+
75
+
76
# One progress tick per optimization step, across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()

# Standard loop: forward, backward (via accelerator, so it also works
# under mixed precision / distributed training), optimizer step, LR
# schedule step, gradient reset.
for _epoch in range(num_epochs):
    for batch in train_dataloader:
        loss = model(**batch).loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)