gustoudu81 commited on
Commit
aea6eb7
·
verified ·
1 Parent(s): 28d6d4a

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,3 +1,38 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BerTELEO
2
+
3
+ A BERT model pre-trained on short DNA sequences of the teleo marker, derived from zhihan1996/DNABERT-2-117M.
4
+ Use this model for teleo sequence embedding.
5
+
6
+ Paper not yet released.
7
+
8
+ How to use:
9
+
10
+
11
+ ```python
12
+
13
+ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
14
+ import torch
15
+
16
+ model_id = "gustoudu81/BerTeleo"
17
+
18
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
19
+ model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
20
+
21
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
+ model = model.to(device).eval()
23
+
24
+ inputs = tokenizer("ACGTACGTACGT", return_tensors="pt")
25
+ inputs = {k: v.to(device) for k, v in inputs.items()}
26
+
27
+ with torch.no_grad():
28
+ hidden_states = model(**inputs)[0]
29
+
30
+
31
+ # embedding with mean pooling
32
+ embedding_mean = torch.mean(hidden_states[0], dim=0)
33
+ print(embedding_mean.shape) # expect to be 768
34
+
35
+ # embedding with max pooling
36
+ embedding_max = torch.max(hidden_states[0], dim=0)[0]
37
+ print(embedding_max.shape) # expect to be 768
38
+ ```
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alibi_starting_size": 512,
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "gustoudu81/DNABERT-2-117M-tritonfix--configuration_bert.BertConfig",
9
+ "AutoModel": "gustoudu81/DNABERT-2-117M-tritonfix--bert_layers.BertModel",
10
+ "AutoModelForMaskedLM": "gustoudu81/DNABERT-2-117M-tritonfix--bert_layers.BertForMaskedLM",
11
+ "AutoModelForSequenceClassification": "gustoudu81/DNABERT-2-117M-tritonfix--bert_layers.BertForSequenceClassification"
12
+ },
13
+ "classifier_dropout": null,
14
+ "gradient_checkpointing": false,
15
+ "hidden_act": "gelu",
16
+ "hidden_dropout_prob": 0.1,
17
+ "hidden_size": 768,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "layer_norm_eps": 1e-12,
21
+ "max_position_embeddings": 512,
22
+ "model_type": "bert",
23
+ "num_attention_heads": 12,
24
+ "num_hidden_layers": 12,
25
+ "pad_token_id": 0,
26
+ "position_embedding_type": "absolute",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.51.3",
29
+ "type_vocab_size": 2,
30
+ "use_cache": true,
31
+ "vocab_size": 4096
32
+ }
generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.51.3"
5
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:021362bcea4fcb6ffd4d39a71b75cc6e7a0b2b8c9f9a3e592a1155cd27f3effc
3
+ size 936711947
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8556c570222ca9bfb38ddcfacd463dc6f7e7e6920228ad95c5de80e366a2f54
3
+ size 468345303
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers
3
+ einops
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c974aa6e6f13f300b7ec93fe25ab0069903d684b1fb3bbb8a54b0173af962d2d
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88ae2ad01e8731312f091eddfc4ec4e31d4cd78d95e717f33bff72c55a9c1dd0
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "extra_special_tokens": {},
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 1000000000000000019884624838656,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "tokenizer_class": "PreTrainedTokenizer",
52
+ "unk_token": "[UNK]"
53
+ }
trainer_state.json ADDED
@@ -0,0 +1,1122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 67000,
3
+ "best_metric": 1.3910651206970215,
4
+ "best_model_checkpoint": "/home/auguste/Desktop/eDNA/TeleoClassification/scripts/DNABert2/experiments/masking_training/outputs/masking_teleo/checkpoints/checkpoint-67000",
5
+ "epoch": 108.06451612903226,
6
+ "eval_steps": 1000,
7
+ "global_step": 67000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0016129032258064516,
14
+ "grad_norm": 107.56168365478516,
15
+ "learning_rate": 2e-05,
16
+ "loss": 7.9233,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 1.6129032258064515,
21
+ "grad_norm": 16.32627296447754,
22
+ "learning_rate": 1.9785161290322584e-05,
23
+ "loss": 3.0779,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 1.6129032258064515,
28
+ "eval_loss": 2.738837718963623,
29
+ "eval_model_preparation_time": 0.0012,
30
+ "eval_runtime": 0.2955,
31
+ "eval_samples_per_second": 1766.611,
32
+ "eval_steps_per_second": 111.682,
33
+ "step": 1000
34
+ },
35
+ {
36
+ "epoch": 3.225806451612903,
37
+ "grad_norm": 12.881124496459961,
38
+ "learning_rate": 1.9570107526881724e-05,
39
+ "loss": 2.506,
40
+ "step": 2000
41
+ },
42
+ {
43
+ "epoch": 3.225806451612903,
44
+ "eval_loss": 2.4902684688568115,
45
+ "eval_model_preparation_time": 0.0012,
46
+ "eval_runtime": 0.2968,
47
+ "eval_samples_per_second": 1758.892,
48
+ "eval_steps_per_second": 111.194,
49
+ "step": 2000
50
+ },
51
+ {
52
+ "epoch": 4.838709677419355,
53
+ "grad_norm": 12.914713859558105,
54
+ "learning_rate": 1.935505376344086e-05,
55
+ "loss": 2.734,
56
+ "step": 3000
57
+ },
58
+ {
59
+ "epoch": 4.838709677419355,
60
+ "eval_loss": 2.305058479309082,
61
+ "eval_model_preparation_time": 0.0012,
62
+ "eval_runtime": 0.3006,
63
+ "eval_samples_per_second": 1736.761,
64
+ "eval_steps_per_second": 109.795,
65
+ "step": 3000
66
+ },
67
+ {
68
+ "epoch": 6.451612903225806,
69
+ "grad_norm": 13.617836952209473,
70
+ "learning_rate": 1.914e-05,
71
+ "loss": 2.2267,
72
+ "step": 4000
73
+ },
74
+ {
75
+ "epoch": 6.451612903225806,
76
+ "eval_loss": 2.3899621963500977,
77
+ "eval_model_preparation_time": 0.0012,
78
+ "eval_runtime": 0.3004,
79
+ "eval_samples_per_second": 1737.628,
80
+ "eval_steps_per_second": 109.85,
81
+ "step": 4000
82
+ },
83
+ {
84
+ "epoch": 8.064516129032258,
85
+ "grad_norm": 11.493875503540039,
86
+ "learning_rate": 1.892494623655914e-05,
87
+ "loss": 2.1095,
88
+ "step": 5000
89
+ },
90
+ {
91
+ "epoch": 8.064516129032258,
92
+ "eval_loss": 2.1791865825653076,
93
+ "eval_model_preparation_time": 0.0012,
94
+ "eval_runtime": 0.2998,
95
+ "eval_samples_per_second": 1740.976,
96
+ "eval_steps_per_second": 110.062,
97
+ "step": 5000
98
+ },
99
+ {
100
+ "epoch": 9.67741935483871,
101
+ "grad_norm": 16.104379653930664,
102
+ "learning_rate": 1.870989247311828e-05,
103
+ "loss": 1.9622,
104
+ "step": 6000
105
+ },
106
+ {
107
+ "epoch": 9.67741935483871,
108
+ "eval_loss": 2.0534751415252686,
109
+ "eval_model_preparation_time": 0.0012,
110
+ "eval_runtime": 0.3144,
111
+ "eval_samples_per_second": 1660.314,
112
+ "eval_steps_per_second": 104.962,
113
+ "step": 6000
114
+ },
115
+ {
116
+ "epoch": 11.290322580645162,
117
+ "grad_norm": 15.933501243591309,
118
+ "learning_rate": 1.8494838709677422e-05,
119
+ "loss": 1.8713,
120
+ "step": 7000
121
+ },
122
+ {
123
+ "epoch": 11.290322580645162,
124
+ "eval_loss": 2.1255111694335938,
125
+ "eval_model_preparation_time": 0.0012,
126
+ "eval_runtime": 0.31,
127
+ "eval_samples_per_second": 1684.034,
128
+ "eval_steps_per_second": 106.462,
129
+ "step": 7000
130
+ },
131
+ {
132
+ "epoch": 12.903225806451612,
133
+ "grad_norm": 9.397466659545898,
134
+ "learning_rate": 1.8279784946236562e-05,
135
+ "loss": 1.7906,
136
+ "step": 8000
137
+ },
138
+ {
139
+ "epoch": 12.903225806451612,
140
+ "eval_loss": 1.9397249221801758,
141
+ "eval_model_preparation_time": 0.0012,
142
+ "eval_runtime": 0.3026,
143
+ "eval_samples_per_second": 1724.803,
144
+ "eval_steps_per_second": 109.039,
145
+ "step": 8000
146
+ },
147
+ {
148
+ "epoch": 14.516129032258064,
149
+ "grad_norm": 14.291478157043457,
150
+ "learning_rate": 1.8064731182795702e-05,
151
+ "loss": 1.7149,
152
+ "step": 9000
153
+ },
154
+ {
155
+ "epoch": 14.516129032258064,
156
+ "eval_loss": 1.8910889625549316,
157
+ "eval_model_preparation_time": 0.0012,
158
+ "eval_runtime": 0.3013,
159
+ "eval_samples_per_second": 1732.385,
160
+ "eval_steps_per_second": 109.519,
161
+ "step": 9000
162
+ },
163
+ {
164
+ "epoch": 16.129032258064516,
165
+ "grad_norm": 15.776030540466309,
166
+ "learning_rate": 1.784967741935484e-05,
167
+ "loss": 1.634,
168
+ "step": 10000
169
+ },
170
+ {
171
+ "epoch": 16.129032258064516,
172
+ "eval_loss": 1.893878698348999,
173
+ "eval_model_preparation_time": 0.0012,
174
+ "eval_runtime": 0.3023,
175
+ "eval_samples_per_second": 1726.506,
176
+ "eval_steps_per_second": 109.147,
177
+ "step": 10000
178
+ },
179
+ {
180
+ "epoch": 17.741935483870968,
181
+ "grad_norm": 12.53177547454834,
182
+ "learning_rate": 1.763462365591398e-05,
183
+ "loss": 1.5991,
184
+ "step": 11000
185
+ },
186
+ {
187
+ "epoch": 17.741935483870968,
188
+ "eval_loss": 1.8701565265655518,
189
+ "eval_model_preparation_time": 0.0012,
190
+ "eval_runtime": 0.3035,
191
+ "eval_samples_per_second": 1720.089,
192
+ "eval_steps_per_second": 108.741,
193
+ "step": 11000
194
+ },
195
+ {
196
+ "epoch": 19.35483870967742,
197
+ "grad_norm": 13.62909984588623,
198
+ "learning_rate": 1.741956989247312e-05,
199
+ "loss": 1.5008,
200
+ "step": 12000
201
+ },
202
+ {
203
+ "epoch": 19.35483870967742,
204
+ "eval_loss": 1.7572582960128784,
205
+ "eval_model_preparation_time": 0.0012,
206
+ "eval_runtime": 0.3051,
207
+ "eval_samples_per_second": 1710.701,
208
+ "eval_steps_per_second": 108.148,
209
+ "step": 12000
210
+ },
211
+ {
212
+ "epoch": 20.967741935483872,
213
+ "grad_norm": 13.886764526367188,
214
+ "learning_rate": 1.720451612903226e-05,
215
+ "loss": 1.4469,
216
+ "step": 13000
217
+ },
218
+ {
219
+ "epoch": 20.967741935483872,
220
+ "eval_loss": 1.7456613779067993,
221
+ "eval_model_preparation_time": 0.0012,
222
+ "eval_runtime": 0.3048,
223
+ "eval_samples_per_second": 1712.389,
224
+ "eval_steps_per_second": 108.254,
225
+ "step": 13000
226
+ },
227
+ {
228
+ "epoch": 22.580645161290324,
229
+ "grad_norm": 16.04749870300293,
230
+ "learning_rate": 1.6989462365591397e-05,
231
+ "loss": 1.404,
232
+ "step": 14000
233
+ },
234
+ {
235
+ "epoch": 22.580645161290324,
236
+ "eval_loss": 1.7826714515686035,
237
+ "eval_model_preparation_time": 0.0012,
238
+ "eval_runtime": 0.3034,
239
+ "eval_samples_per_second": 1720.509,
240
+ "eval_steps_per_second": 108.768,
241
+ "step": 14000
242
+ },
243
+ {
244
+ "epoch": 24.193548387096776,
245
+ "grad_norm": 14.932185173034668,
246
+ "learning_rate": 1.6774408602150537e-05,
247
+ "loss": 1.3552,
248
+ "step": 15000
249
+ },
250
+ {
251
+ "epoch": 24.193548387096776,
252
+ "eval_loss": 1.7234201431274414,
253
+ "eval_model_preparation_time": 0.0012,
254
+ "eval_runtime": 0.3061,
255
+ "eval_samples_per_second": 1705.173,
256
+ "eval_steps_per_second": 107.798,
257
+ "step": 15000
258
+ },
259
+ {
260
+ "epoch": 25.806451612903224,
261
+ "grad_norm": 8.178566932678223,
262
+ "learning_rate": 1.6559354838709676e-05,
263
+ "loss": 1.313,
264
+ "step": 16000
265
+ },
266
+ {
267
+ "epoch": 25.806451612903224,
268
+ "eval_loss": 1.8201613426208496,
269
+ "eval_model_preparation_time": 0.0012,
270
+ "eval_runtime": 0.3194,
271
+ "eval_samples_per_second": 1634.314,
272
+ "eval_steps_per_second": 103.319,
273
+ "step": 16000
274
+ },
275
+ {
276
+ "epoch": 27.419354838709676,
277
+ "grad_norm": 16.086894989013672,
278
+ "learning_rate": 1.634430107526882e-05,
279
+ "loss": 1.2751,
280
+ "step": 17000
281
+ },
282
+ {
283
+ "epoch": 27.419354838709676,
284
+ "eval_loss": 1.6344752311706543,
285
+ "eval_model_preparation_time": 0.0012,
286
+ "eval_runtime": 0.3053,
287
+ "eval_samples_per_second": 1709.792,
288
+ "eval_steps_per_second": 108.09,
289
+ "step": 17000
290
+ },
291
+ {
292
+ "epoch": 29.032258064516128,
293
+ "grad_norm": 9.854013442993164,
294
+ "learning_rate": 1.612924731182796e-05,
295
+ "loss": 1.2377,
296
+ "step": 18000
297
+ },
298
+ {
299
+ "epoch": 29.032258064516128,
300
+ "eval_loss": 1.6381661891937256,
301
+ "eval_model_preparation_time": 0.0012,
302
+ "eval_runtime": 0.343,
303
+ "eval_samples_per_second": 1521.702,
304
+ "eval_steps_per_second": 96.2,
305
+ "step": 18000
306
+ },
307
+ {
308
+ "epoch": 30.64516129032258,
309
+ "grad_norm": 13.270648956298828,
310
+ "learning_rate": 1.59141935483871e-05,
311
+ "loss": 1.1772,
312
+ "step": 19000
313
+ },
314
+ {
315
+ "epoch": 30.64516129032258,
316
+ "eval_loss": 1.6601710319519043,
317
+ "eval_model_preparation_time": 0.0012,
318
+ "eval_runtime": 0.3037,
319
+ "eval_samples_per_second": 1718.661,
320
+ "eval_steps_per_second": 108.651,
321
+ "step": 19000
322
+ },
323
+ {
324
+ "epoch": 32.25806451612903,
325
+ "grad_norm": 20.389537811279297,
326
+ "learning_rate": 1.569913978494624e-05,
327
+ "loss": 1.176,
328
+ "step": 20000
329
+ },
330
+ {
331
+ "epoch": 32.25806451612903,
332
+ "eval_loss": 1.6632287502288818,
333
+ "eval_model_preparation_time": 0.0012,
334
+ "eval_runtime": 0.3045,
335
+ "eval_samples_per_second": 1714.128,
336
+ "eval_steps_per_second": 108.364,
337
+ "step": 20000
338
+ },
339
+ {
340
+ "epoch": 33.87096774193548,
341
+ "grad_norm": 11.229137420654297,
342
+ "learning_rate": 1.548408602150538e-05,
343
+ "loss": 1.1184,
344
+ "step": 21000
345
+ },
346
+ {
347
+ "epoch": 33.87096774193548,
348
+ "eval_loss": 1.7555991411209106,
349
+ "eval_model_preparation_time": 0.0012,
350
+ "eval_runtime": 0.3043,
351
+ "eval_samples_per_second": 1715.561,
352
+ "eval_steps_per_second": 108.455,
353
+ "step": 21000
354
+ },
355
+ {
356
+ "epoch": 35.483870967741936,
357
+ "grad_norm": 10.823155403137207,
358
+ "learning_rate": 1.5269032258064518e-05,
359
+ "loss": 1.0793,
360
+ "step": 22000
361
+ },
362
+ {
363
+ "epoch": 35.483870967741936,
364
+ "eval_loss": 1.6087384223937988,
365
+ "eval_model_preparation_time": 0.0012,
366
+ "eval_runtime": 0.3059,
367
+ "eval_samples_per_second": 1706.414,
368
+ "eval_steps_per_second": 107.877,
369
+ "step": 22000
370
+ },
371
+ {
372
+ "epoch": 37.096774193548384,
373
+ "grad_norm": 6.54916524887085,
374
+ "learning_rate": 1.5053978494623658e-05,
375
+ "loss": 1.0632,
376
+ "step": 23000
377
+ },
378
+ {
379
+ "epoch": 37.096774193548384,
380
+ "eval_loss": 1.6815119981765747,
381
+ "eval_model_preparation_time": 0.0012,
382
+ "eval_runtime": 0.3044,
383
+ "eval_samples_per_second": 1714.683,
384
+ "eval_steps_per_second": 108.399,
385
+ "step": 23000
386
+ },
387
+ {
388
+ "epoch": 38.70967741935484,
389
+ "grad_norm": 14.550293922424316,
390
+ "learning_rate": 1.4838924731182798e-05,
391
+ "loss": 1.0185,
392
+ "step": 24000
393
+ },
394
+ {
395
+ "epoch": 38.70967741935484,
396
+ "eval_loss": 1.6611889600753784,
397
+ "eval_model_preparation_time": 0.0012,
398
+ "eval_runtime": 0.3028,
399
+ "eval_samples_per_second": 1724.12,
400
+ "eval_steps_per_second": 108.996,
401
+ "step": 24000
402
+ },
403
+ {
404
+ "epoch": 40.32258064516129,
405
+ "grad_norm": 14.825828552246094,
406
+ "learning_rate": 1.4623870967741937e-05,
407
+ "loss": 1.0148,
408
+ "step": 25000
409
+ },
410
+ {
411
+ "epoch": 40.32258064516129,
412
+ "eval_loss": 1.5314302444458008,
413
+ "eval_model_preparation_time": 0.0012,
414
+ "eval_runtime": 0.3061,
415
+ "eval_samples_per_second": 1705.416,
416
+ "eval_steps_per_second": 107.814,
417
+ "step": 25000
418
+ },
419
+ {
420
+ "epoch": 41.935483870967744,
421
+ "grad_norm": 15.808582305908203,
422
+ "learning_rate": 1.4408817204301075e-05,
423
+ "loss": 0.9492,
424
+ "step": 26000
425
+ },
426
+ {
427
+ "epoch": 41.935483870967744,
428
+ "eval_loss": 1.717032790184021,
429
+ "eval_model_preparation_time": 0.0012,
430
+ "eval_runtime": 0.3131,
431
+ "eval_samples_per_second": 1667.386,
432
+ "eval_steps_per_second": 105.409,
433
+ "step": 26000
434
+ },
435
+ {
436
+ "epoch": 43.54838709677419,
437
+ "grad_norm": 13.56778621673584,
438
+ "learning_rate": 1.4193763440860215e-05,
439
+ "loss": 0.9352,
440
+ "step": 27000
441
+ },
442
+ {
443
+ "epoch": 43.54838709677419,
444
+ "eval_loss": 1.631635069847107,
445
+ "eval_model_preparation_time": 0.0012,
446
+ "eval_runtime": 0.3065,
447
+ "eval_samples_per_second": 1703.186,
448
+ "eval_steps_per_second": 107.673,
449
+ "step": 27000
450
+ },
451
+ {
452
+ "epoch": 45.16129032258065,
453
+ "grad_norm": 14.375411987304688,
454
+ "learning_rate": 1.3978709677419355e-05,
455
+ "loss": 0.9287,
456
+ "step": 28000
457
+ },
458
+ {
459
+ "epoch": 45.16129032258065,
460
+ "eval_loss": 1.643862247467041,
461
+ "eval_model_preparation_time": 0.0012,
462
+ "eval_runtime": 0.3075,
463
+ "eval_samples_per_second": 1697.381,
464
+ "eval_steps_per_second": 107.306,
465
+ "step": 28000
466
+ },
467
+ {
468
+ "epoch": 46.774193548387096,
469
+ "grad_norm": 12.451338768005371,
470
+ "learning_rate": 1.3763655913978495e-05,
471
+ "loss": 0.9052,
472
+ "step": 29000
473
+ },
474
+ {
475
+ "epoch": 46.774193548387096,
476
+ "eval_loss": 1.4976590871810913,
477
+ "eval_model_preparation_time": 0.0012,
478
+ "eval_runtime": 0.3151,
479
+ "eval_samples_per_second": 1656.651,
480
+ "eval_steps_per_second": 104.731,
481
+ "step": 29000
482
+ },
483
+ {
484
+ "epoch": 48.38709677419355,
485
+ "grad_norm": 15.790621757507324,
486
+ "learning_rate": 1.3548602150537636e-05,
487
+ "loss": 0.8897,
488
+ "step": 30000
489
+ },
490
+ {
491
+ "epoch": 48.38709677419355,
492
+ "eval_loss": 1.544758915901184,
493
+ "eval_model_preparation_time": 0.0012,
494
+ "eval_runtime": 0.3045,
495
+ "eval_samples_per_second": 1714.222,
496
+ "eval_steps_per_second": 108.37,
497
+ "step": 30000
498
+ },
499
+ {
500
+ "epoch": 50.0,
501
+ "grad_norm": 15.337139129638672,
502
+ "learning_rate": 1.3333548387096776e-05,
503
+ "loss": 0.9353,
504
+ "step": 31000
505
+ },
506
+ {
507
+ "epoch": 50.0,
508
+ "eval_loss": 1.7019206285476685,
509
+ "eval_model_preparation_time": 0.0012,
510
+ "eval_runtime": 0.3181,
511
+ "eval_samples_per_second": 1640.743,
512
+ "eval_steps_per_second": 103.725,
513
+ "step": 31000
514
+ },
515
+ {
516
+ "epoch": 51.61290322580645,
517
+ "grad_norm": 17.48087501525879,
518
+ "learning_rate": 1.3118494623655916e-05,
519
+ "loss": 0.8976,
520
+ "step": 32000
521
+ },
522
+ {
523
+ "epoch": 51.61290322580645,
524
+ "eval_loss": 1.6256884336471558,
525
+ "eval_model_preparation_time": 0.0012,
526
+ "eval_runtime": 0.3131,
527
+ "eval_samples_per_second": 1667.367,
528
+ "eval_steps_per_second": 105.408,
529
+ "step": 32000
530
+ },
531
+ {
532
+ "epoch": 53.225806451612904,
533
+ "grad_norm": 15.387638092041016,
534
+ "learning_rate": 1.2903440860215055e-05,
535
+ "loss": 0.8414,
536
+ "step": 33000
537
+ },
538
+ {
539
+ "epoch": 53.225806451612904,
540
+ "eval_loss": 1.5139249563217163,
541
+ "eval_model_preparation_time": 0.0012,
542
+ "eval_runtime": 0.3191,
543
+ "eval_samples_per_second": 1635.972,
544
+ "eval_steps_per_second": 103.424,
545
+ "step": 33000
546
+ },
547
+ {
548
+ "epoch": 54.83870967741935,
549
+ "grad_norm": 15.2994384765625,
550
+ "learning_rate": 1.2688387096774195e-05,
551
+ "loss": 0.7897,
552
+ "step": 34000
553
+ },
554
+ {
555
+ "epoch": 54.83870967741935,
556
+ "eval_loss": 1.7013849020004272,
557
+ "eval_model_preparation_time": 0.0012,
558
+ "eval_runtime": 0.3268,
559
+ "eval_samples_per_second": 1597.145,
560
+ "eval_steps_per_second": 100.969,
561
+ "step": 34000
562
+ },
563
+ {
564
+ "epoch": 56.45161290322581,
565
+ "grad_norm": 14.40909481048584,
566
+ "learning_rate": 1.2473333333333335e-05,
567
+ "loss": 0.8627,
568
+ "step": 35000
569
+ },
570
+ {
571
+ "epoch": 56.45161290322581,
572
+ "eval_loss": 1.7141073942184448,
573
+ "eval_model_preparation_time": 0.0012,
574
+ "eval_runtime": 0.3089,
575
+ "eval_samples_per_second": 1689.899,
576
+ "eval_steps_per_second": 106.833,
577
+ "step": 35000
578
+ },
579
+ {
580
+ "epoch": 58.064516129032256,
581
+ "grad_norm": 19.243818283081055,
582
+ "learning_rate": 1.2258279569892474e-05,
583
+ "loss": 0.9135,
584
+ "step": 36000
585
+ },
586
+ {
587
+ "epoch": 58.064516129032256,
588
+ "eval_loss": 1.678747296333313,
589
+ "eval_model_preparation_time": 0.0012,
590
+ "eval_runtime": 0.3278,
591
+ "eval_samples_per_second": 1592.41,
592
+ "eval_steps_per_second": 100.67,
593
+ "step": 36000
594
+ },
595
+ {
596
+ "epoch": 59.67741935483871,
597
+ "grad_norm": 14.35431957244873,
598
+ "learning_rate": 1.2043225806451614e-05,
599
+ "loss": 0.9226,
600
+ "step": 37000
601
+ },
602
+ {
603
+ "epoch": 59.67741935483871,
604
+ "eval_loss": 1.9941015243530273,
605
+ "eval_model_preparation_time": 0.0012,
606
+ "eval_runtime": 0.3066,
607
+ "eval_samples_per_second": 1702.667,
608
+ "eval_steps_per_second": 107.64,
609
+ "step": 37000
610
+ },
611
+ {
612
+ "epoch": 61.29032258064516,
613
+ "grad_norm": 16.02369499206543,
614
+ "learning_rate": 1.1828172043010752e-05,
615
+ "loss": 0.8849,
616
+ "step": 38000
617
+ },
618
+ {
619
+ "epoch": 61.29032258064516,
620
+ "eval_loss": 1.5911988019943237,
621
+ "eval_model_preparation_time": 0.0012,
622
+ "eval_runtime": 0.3059,
623
+ "eval_samples_per_second": 1706.321,
624
+ "eval_steps_per_second": 107.871,
625
+ "step": 38000
626
+ },
627
+ {
628
+ "epoch": 62.903225806451616,
629
+ "grad_norm": 24.164094924926758,
630
+ "learning_rate": 1.1613118279569892e-05,
631
+ "loss": 0.7974,
632
+ "step": 39000
633
+ },
634
+ {
635
+ "epoch": 62.903225806451616,
636
+ "eval_loss": 1.5700287818908691,
637
+ "eval_model_preparation_time": 0.0012,
638
+ "eval_runtime": 0.3059,
639
+ "eval_samples_per_second": 1706.437,
640
+ "eval_steps_per_second": 107.878,
641
+ "step": 39000
642
+ },
643
+ {
644
+ "epoch": 64.51612903225806,
645
+ "grad_norm": 10.7676420211792,
646
+ "learning_rate": 1.1398064516129033e-05,
647
+ "loss": 0.7892,
648
+ "step": 40000
649
+ },
650
+ {
651
+ "epoch": 64.51612903225806,
652
+ "eval_loss": 1.6208666563034058,
653
+ "eval_model_preparation_time": 0.0012,
654
+ "eval_runtime": 0.3135,
655
+ "eval_samples_per_second": 1665.325,
656
+ "eval_steps_per_second": 105.279,
657
+ "step": 40000
658
+ },
659
+ {
660
+ "epoch": 66.12903225806451,
661
+ "grad_norm": 8.90040111541748,
662
+ "learning_rate": 1.1183010752688173e-05,
663
+ "loss": 0.7728,
664
+ "step": 41000
665
+ },
666
+ {
667
+ "epoch": 66.12903225806451,
668
+ "eval_loss": 1.5275108814239502,
669
+ "eval_model_preparation_time": 0.0012,
670
+ "eval_runtime": 0.3064,
671
+ "eval_samples_per_second": 1703.395,
672
+ "eval_steps_per_second": 107.686,
673
+ "step": 41000
674
+ },
675
+ {
676
+ "epoch": 67.74193548387096,
677
+ "grad_norm": 16.836742401123047,
678
+ "learning_rate": 1.0967956989247313e-05,
679
+ "loss": 0.7309,
680
+ "step": 42000
681
+ },
682
+ {
683
+ "epoch": 67.74193548387096,
684
+ "eval_loss": 1.6568617820739746,
685
+ "eval_model_preparation_time": 0.0012,
686
+ "eval_runtime": 0.3052,
687
+ "eval_samples_per_second": 1710.328,
688
+ "eval_steps_per_second": 108.124,
689
+ "step": 42000
690
+ },
691
+ {
692
+ "epoch": 69.35483870967742,
693
+ "grad_norm": 16.19956398010254,
694
+ "learning_rate": 1.0752903225806453e-05,
695
+ "loss": 0.6891,
696
+ "step": 43000
697
+ },
698
+ {
699
+ "epoch": 69.35483870967742,
700
+ "eval_loss": 1.4376003742218018,
701
+ "eval_model_preparation_time": 0.0012,
702
+ "eval_runtime": 0.3272,
703
+ "eval_samples_per_second": 1595.464,
704
+ "eval_steps_per_second": 100.863,
705
+ "step": 43000
706
+ },
707
+ {
708
+ "epoch": 70.96774193548387,
709
+ "grad_norm": 19.571664810180664,
710
+ "learning_rate": 1.0537849462365592e-05,
711
+ "loss": 0.6732,
712
+ "step": 44000
713
+ },
714
+ {
715
+ "epoch": 70.96774193548387,
716
+ "eval_loss": 1.6094655990600586,
717
+ "eval_model_preparation_time": 0.0012,
718
+ "eval_runtime": 0.3144,
719
+ "eval_samples_per_second": 1660.491,
720
+ "eval_steps_per_second": 104.974,
721
+ "step": 44000
722
+ },
723
+ {
724
+ "epoch": 72.58064516129032,
725
+ "grad_norm": 11.60450267791748,
726
+ "learning_rate": 1.0322795698924732e-05,
727
+ "loss": 0.6475,
728
+ "step": 45000
729
+ },
730
+ {
731
+ "epoch": 72.58064516129032,
732
+ "eval_loss": 1.569161295890808,
733
+ "eval_model_preparation_time": 0.0012,
734
+ "eval_runtime": 0.3241,
735
+ "eval_samples_per_second": 1610.77,
736
+ "eval_steps_per_second": 101.83,
737
+ "step": 45000
738
+ },
739
+ {
740
+ "epoch": 74.19354838709677,
741
+ "grad_norm": 14.973388671875,
742
+ "learning_rate": 1.0107741935483872e-05,
743
+ "loss": 0.674,
744
+ "step": 46000
745
+ },
746
+ {
747
+ "epoch": 74.19354838709677,
748
+ "eval_loss": 1.4532381296157837,
749
+ "eval_model_preparation_time": 0.0012,
750
+ "eval_runtime": 0.3117,
751
+ "eval_samples_per_second": 1674.469,
752
+ "eval_steps_per_second": 105.857,
753
+ "step": 46000
754
+ },
755
+ {
756
+ "epoch": 75.80645161290323,
757
+ "grad_norm": 19.416486740112305,
758
+ "learning_rate": 9.892688172043012e-06,
759
+ "loss": 0.6339,
760
+ "step": 47000
761
+ },
762
+ {
763
+ "epoch": 75.80645161290323,
764
+ "eval_loss": 1.5601801872253418,
765
+ "eval_model_preparation_time": 0.0012,
766
+ "eval_runtime": 0.3055,
767
+ "eval_samples_per_second": 1708.783,
768
+ "eval_steps_per_second": 108.027,
769
+ "step": 47000
770
+ },
771
+ {
772
+ "epoch": 77.41935483870968,
773
+ "grad_norm": 12.237533569335938,
774
+ "learning_rate": 9.677634408602151e-06,
775
+ "loss": 0.628,
776
+ "step": 48000
777
+ },
778
+ {
779
+ "epoch": 77.41935483870968,
780
+ "eval_loss": 1.5352447032928467,
781
+ "eval_model_preparation_time": 0.0012,
782
+ "eval_runtime": 0.3243,
783
+ "eval_samples_per_second": 1609.448,
784
+ "eval_steps_per_second": 101.747,
785
+ "step": 48000
786
+ },
787
+ {
788
+ "epoch": 79.03225806451613,
789
+ "grad_norm": 8.90131664276123,
790
+ "learning_rate": 9.462580645161291e-06,
791
+ "loss": 0.6123,
792
+ "step": 49000
793
+ },
794
+ {
795
+ "epoch": 79.03225806451613,
796
+ "eval_loss": 1.6023005247116089,
797
+ "eval_model_preparation_time": 0.0012,
798
+ "eval_runtime": 0.3064,
799
+ "eval_samples_per_second": 1703.925,
800
+ "eval_steps_per_second": 107.719,
801
+ "step": 49000
802
+ },
803
+ {
804
+ "epoch": 80.64516129032258,
805
+ "grad_norm": 19.542125701904297,
806
+ "learning_rate": 9.24752688172043e-06,
807
+ "loss": 0.5913,
808
+ "step": 50000
809
+ },
810
+ {
811
+ "epoch": 80.64516129032258,
812
+ "eval_loss": 1.4985138177871704,
813
+ "eval_model_preparation_time": 0.0012,
814
+ "eval_runtime": 0.3143,
815
+ "eval_samples_per_second": 1660.843,
816
+ "eval_steps_per_second": 104.996,
817
+ "step": 50000
818
+ },
819
+ {
820
+ "epoch": 82.25806451612904,
821
+ "grad_norm": 15.9403715133667,
822
+ "learning_rate": 9.03247311827957e-06,
823
+ "loss": 0.5919,
824
+ "step": 51000
825
+ },
826
+ {
827
+ "epoch": 82.25806451612904,
828
+ "eval_loss": 1.557279109954834,
829
+ "eval_model_preparation_time": 0.0012,
830
+ "eval_runtime": 0.3138,
831
+ "eval_samples_per_second": 1663.684,
832
+ "eval_steps_per_second": 105.175,
833
+ "step": 51000
834
+ },
835
+ {
836
+ "epoch": 83.87096774193549,
837
+ "grad_norm": 16.341463088989258,
838
+ "learning_rate": 8.81741935483871e-06,
839
+ "loss": 0.5849,
840
+ "step": 52000
841
+ },
842
+ {
843
+ "epoch": 83.87096774193549,
844
+ "eval_loss": 1.744088888168335,
845
+ "eval_model_preparation_time": 0.0012,
846
+ "eval_runtime": 0.3074,
847
+ "eval_samples_per_second": 1698.241,
848
+ "eval_steps_per_second": 107.36,
849
+ "step": 52000
850
+ },
851
+ {
852
+ "epoch": 85.48387096774194,
853
+ "grad_norm": 17.496572494506836,
854
+ "learning_rate": 8.60236559139785e-06,
855
+ "loss": 0.5798,
856
+ "step": 53000
857
+ },
858
+ {
859
+ "epoch": 85.48387096774194,
860
+ "eval_loss": 1.5605759620666504,
861
+ "eval_model_preparation_time": 0.0012,
862
+ "eval_runtime": 0.3312,
863
+ "eval_samples_per_second": 1576.078,
864
+ "eval_steps_per_second": 99.637,
865
+ "step": 53000
866
+ },
867
+ {
868
+ "epoch": 87.09677419354838,
869
+ "grad_norm": 22.154132843017578,
870
+ "learning_rate": 8.38731182795699e-06,
871
+ "loss": 0.5627,
872
+ "step": 54000
873
+ },
874
+ {
875
+ "epoch": 87.09677419354838,
876
+ "eval_loss": 1.486401081085205,
877
+ "eval_model_preparation_time": 0.0012,
878
+ "eval_runtime": 0.3046,
879
+ "eval_samples_per_second": 1713.463,
880
+ "eval_steps_per_second": 108.322,
881
+ "step": 54000
882
+ },
883
+ {
884
+ "epoch": 88.70967741935483,
885
+ "grad_norm": 12.007641792297363,
886
+ "learning_rate": 8.17225806451613e-06,
887
+ "loss": 0.5926,
888
+ "step": 55000
889
+ },
890
+ {
891
+ "epoch": 88.70967741935483,
892
+ "eval_loss": 1.533622145652771,
893
+ "eval_model_preparation_time": 0.0012,
894
+ "eval_runtime": 0.3189,
895
+ "eval_samples_per_second": 1636.753,
896
+ "eval_steps_per_second": 103.473,
897
+ "step": 55000
898
+ },
899
+ {
900
+ "epoch": 90.3225806451613,
901
+ "grad_norm": 16.921255111694336,
902
+ "learning_rate": 7.957204301075269e-06,
903
+ "loss": 0.5737,
904
+ "step": 56000
905
+ },
906
+ {
907
+ "epoch": 90.3225806451613,
908
+ "eval_loss": 1.595588207244873,
909
+ "eval_model_preparation_time": 0.0012,
910
+ "eval_runtime": 0.3248,
911
+ "eval_samples_per_second": 1607.001,
912
+ "eval_steps_per_second": 101.592,
913
+ "step": 56000
914
+ },
915
+ {
916
+ "epoch": 91.93548387096774,
917
+ "grad_norm": 14.567840576171875,
918
+ "learning_rate": 7.74215053763441e-06,
919
+ "loss": 0.5521,
920
+ "step": 57000
921
+ },
922
+ {
923
+ "epoch": 91.93548387096774,
924
+ "eval_loss": 1.6286988258361816,
925
+ "eval_model_preparation_time": 0.0012,
926
+ "eval_runtime": 0.308,
927
+ "eval_samples_per_second": 1694.658,
928
+ "eval_steps_per_second": 107.134,
929
+ "step": 57000
930
+ },
931
+ {
932
+ "epoch": 93.54838709677419,
933
+ "grad_norm": 7.83158016204834,
934
+ "learning_rate": 7.5270967741935486e-06,
935
+ "loss": 0.5672,
936
+ "step": 58000
937
+ },
938
+ {
939
+ "epoch": 93.54838709677419,
940
+ "eval_loss": 1.6612709760665894,
941
+ "eval_model_preparation_time": 0.0012,
942
+ "eval_runtime": 0.3047,
943
+ "eval_samples_per_second": 1713.282,
944
+ "eval_steps_per_second": 108.311,
945
+ "step": 58000
946
+ },
947
+ {
948
+ "epoch": 95.16129032258064,
949
+ "grad_norm": 20.766202926635742,
950
+ "learning_rate": 7.312043010752688e-06,
951
+ "loss": 0.5685,
952
+ "step": 59000
953
+ },
954
+ {
955
+ "epoch": 95.16129032258064,
956
+ "eval_loss": 1.5319266319274902,
957
+ "eval_model_preparation_time": 0.0012,
958
+ "eval_runtime": 0.3061,
959
+ "eval_samples_per_second": 1705.367,
960
+ "eval_steps_per_second": 107.811,
961
+ "step": 59000
962
+ },
963
+ {
964
+ "epoch": 96.7741935483871,
965
+ "grad_norm": 13.834534645080566,
966
+ "learning_rate": 7.096989247311829e-06,
967
+ "loss": 0.5394,
968
+ "step": 60000
969
+ },
970
+ {
971
+ "epoch": 96.7741935483871,
972
+ "eval_loss": 1.5068557262420654,
973
+ "eval_model_preparation_time": 0.0012,
974
+ "eval_runtime": 0.3061,
975
+ "eval_samples_per_second": 1705.255,
976
+ "eval_steps_per_second": 107.803,
977
+ "step": 60000
978
+ },
979
+ {
980
+ "epoch": 98.38709677419355,
981
+ "grad_norm": 9.130626678466797,
982
+ "learning_rate": 6.881935483870969e-06,
983
+ "loss": 0.5095,
984
+ "step": 61000
985
+ },
986
+ {
987
+ "epoch": 98.38709677419355,
988
+ "eval_loss": 1.4926313161849976,
989
+ "eval_model_preparation_time": 0.0012,
990
+ "eval_runtime": 0.3074,
991
+ "eval_samples_per_second": 1698.19,
992
+ "eval_steps_per_second": 107.357,
993
+ "step": 61000
994
+ },
995
+ {
996
+ "epoch": 100.0,
997
+ "grad_norm": 18.79903793334961,
998
+ "learning_rate": 6.666881720430108e-06,
999
+ "loss": 0.5327,
1000
+ "step": 62000
1001
+ },
1002
+ {
1003
+ "epoch": 100.0,
1004
+ "eval_loss": 1.4378135204315186,
1005
+ "eval_model_preparation_time": 0.0012,
1006
+ "eval_runtime": 0.3146,
1007
+ "eval_samples_per_second": 1659.02,
1008
+ "eval_steps_per_second": 104.881,
1009
+ "step": 62000
1010
+ },
1011
+ {
1012
+ "epoch": 101.61290322580645,
1013
+ "grad_norm": 17.528038024902344,
1014
+ "learning_rate": 6.451827956989248e-06,
1015
+ "loss": 0.5108,
1016
+ "step": 63000
1017
+ },
1018
+ {
1019
+ "epoch": 101.61290322580645,
1020
+ "eval_loss": 1.4716895818710327,
1021
+ "eval_model_preparation_time": 0.0012,
1022
+ "eval_runtime": 0.3118,
1023
+ "eval_samples_per_second": 1673.899,
1024
+ "eval_steps_per_second": 105.821,
1025
+ "step": 63000
1026
+ },
1027
+ {
1028
+ "epoch": 103.2258064516129,
1029
+ "grad_norm": 9.862174034118652,
1030
+ "learning_rate": 6.236774193548387e-06,
1031
+ "loss": 0.4874,
1032
+ "step": 64000
1033
+ },
1034
+ {
1035
+ "epoch": 103.2258064516129,
1036
+ "eval_loss": 1.519917368888855,
1037
+ "eval_model_preparation_time": 0.0012,
1038
+ "eval_runtime": 0.3105,
1039
+ "eval_samples_per_second": 1681.31,
1040
+ "eval_steps_per_second": 106.29,
1041
+ "step": 64000
1042
+ },
1043
+ {
1044
+ "epoch": 104.83870967741936,
1045
+ "grad_norm": 11.85350513458252,
1046
+ "learning_rate": 6.0217204301075275e-06,
1047
+ "loss": 0.4856,
1048
+ "step": 65000
1049
+ },
1050
+ {
1051
+ "epoch": 104.83870967741936,
1052
+ "eval_loss": 1.5175796747207642,
1053
+ "eval_model_preparation_time": 0.0012,
1054
+ "eval_runtime": 0.3143,
1055
+ "eval_samples_per_second": 1661.035,
1056
+ "eval_steps_per_second": 105.008,
1057
+ "step": 65000
1058
+ },
1059
+ {
1060
+ "epoch": 106.45161290322581,
1061
+ "grad_norm": 21.145742416381836,
1062
+ "learning_rate": 5.806666666666667e-06,
1063
+ "loss": 0.4665,
1064
+ "step": 66000
1065
+ },
1066
+ {
1067
+ "epoch": 106.45161290322581,
1068
+ "eval_loss": 1.5837030410766602,
1069
+ "eval_model_preparation_time": 0.0012,
1070
+ "eval_runtime": 0.3108,
1071
+ "eval_samples_per_second": 1679.611,
1072
+ "eval_steps_per_second": 106.182,
1073
+ "step": 66000
1074
+ },
1075
+ {
1076
+ "epoch": 108.06451612903226,
1077
+ "grad_norm": 8.358002662658691,
1078
+ "learning_rate": 5.591612903225807e-06,
1079
+ "loss": 0.4846,
1080
+ "step": 67000
1081
+ },
1082
+ {
1083
+ "epoch": 108.06451612903226,
1084
+ "eval_loss": 1.3910651206970215,
1085
+ "eval_model_preparation_time": 0.0012,
1086
+ "eval_runtime": 0.3115,
1087
+ "eval_samples_per_second": 1676.029,
1088
+ "eval_steps_per_second": 105.956,
1089
+ "step": 67000
1090
+ }
1091
+ ],
1092
+ "logging_steps": 1000,
1093
+ "max_steps": 93000,
1094
+ "num_input_tokens_seen": 0,
1095
+ "num_train_epochs": 150,
1096
+ "save_steps": 1000,
1097
+ "stateful_callbacks": {
1098
+ "EarlyStoppingCallback": {
1099
+ "args": {
1100
+ "early_stopping_patience": 80,
1101
+ "early_stopping_threshold": 0.0
1102
+ },
1103
+ "attributes": {
1104
+ "early_stopping_patience_counter": 0
1105
+ }
1106
+ },
1107
+ "TrainerControl": {
1108
+ "args": {
1109
+ "should_epoch_stop": false,
1110
+ "should_evaluate": false,
1111
+ "should_log": false,
1112
+ "should_save": true,
1113
+ "should_training_stop": false
1114
+ },
1115
+ "attributes": {}
1116
+ }
1117
+ },
1118
+ "total_flos": 1.1978332035428352e+16,
1119
+ "train_batch_size": 16,
1120
+ "trial_name": null,
1121
+ "trial_params": null
1122
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:015f7697b0ed907e67af5984ad66287dbd7e699787c9cb3398b8db4208a469cb
3
+ size 5905