FranzderPapst committed on
Commit
6bce4f8
1 Parent(s): 88ab85c

Upload training_script.txt

Files changed (1)
  1. training_script.txt +118 -0
training_script.txt ADDED
# -*- coding: utf-8 -*-
"""ZGBot Training Script

File can be executed here (link to Google Colab):
https://colab.research.google.com/drive/1Dyn37CljZnYaQ1dXs3rCOQmdpioKB6-t
"""

# Install dependencies (Colab shell commands).
!pip install datasets wandb evaluate accelerate -qU
!pip install transformers

# Log in to the Hugging Face Hub and to Weights & Biases.
from huggingface_hub import notebook_login
notebook_login()

import wandb
wandb.login()

# Load a 5,000-example slice of SQuAD and split it 80/20 into train and test sets.
from datasets import load_dataset
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize question/context pairs and map each character-level answer span
    to start/end token positions."""
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token indices that belong to the context.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label the span (0, 0).
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise locate the start and end token indices of the answer.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


dataset = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# The Trainer needs the torch extras (including accelerate) installed.
!pip install transformers[torch]

import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    # For question answering the Trainer yields logits as (start_logits, end_logits)
    # and labels as (start_positions, end_positions); stack and flatten both so the
    # accuracy metric scores how often the predicted start/end token indices are exact.
    logits, labels = eval_pred
    predictions = np.argmax(np.stack(logits), axis=-1).reshape(-1)
    references = np.stack(labels).reshape(-1)
    return metric.compute(predictions=predictions, references=references)


args = TrainingArguments(
    output_dir="MA-saemi-5",
    report_to="wandb",
    evaluation_strategy="steps",
    learning_rate=3e-5,
    max_steps=3000,
    logging_steps=100,
    eval_steps=250,
    save_steps=10000,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    run_name="training5",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
wandb.finish()
trainer.push_to_hub()
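
For reference, once training has finished and the checkpoint has been pushed, the model can be queried with the transformers question-answering pipeline. The sketch below is not part of the uploaded script; the repository id FranzderPapst/MA-saemi-5 is only an assumption based on the output_dir above and the committer's username, so adjust it to wherever push_to_hub actually uploaded the model.

# Minimal inference sketch (not part of training_script.txt).
# The repo id is an assumption: output_dir "MA-saemi-5" pushed under the
# committer's namespace; replace it with the actual Hub repository if different.
from transformers import pipeline

qa = pipeline("question-answering", model="FranzderPapst/MA-saemi-5")
result = qa(
    question="What dataset was the model fine-tuned on?",
    context="The model was fine-tuned on a 5,000-example subset of the SQuAD dataset.",
)
print(result)  # dict with 'answer', 'score', 'start', 'end'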