n0w0f committed on
Commit
7d4c272
·
verified ·
1 Parent(s): 5cd47d5

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 512,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 1024,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 4,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.57.6",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b14cc370cfe695d3c5eaf07a244b371faee412b2bbcffdf5bad4a2b9db7f6eed
3
+ size 133031496
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbfa41a72074a592c9293db69bc21282559d5478f88f8cb8f82c13810ca83ff1
3
+ size 266109515
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:370c11d5210c48ba734effb72c47adabe71eeaabda234ab557f4180bf0244198
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdc18a37e9464c7595b7a4474c47e6cce72b32224386ab16ff5fdc19d7020ebf
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[EOS]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "30001": {
44
+ "content": "[EOS]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "30002": {
52
+ "content": "[BOS]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "bos_token": "[BOS]",
61
+ "clean_up_tokenization_spaces": false,
62
+ "cls_token": "[CLS]",
63
+ "eos_token": "[EOS]",
64
+ "extra_special_tokens": {},
65
+ "mask_token": "[MASK]",
66
+ "model_max_length": 1000000000000000019884624838656,
67
+ "pad_token": "[PAD]",
68
+ "sep_token": "[SEP]",
69
+ "tokenizer_class": "PreTrainedTokenizerFast",
70
+ "unk_token": "[UNK]"
71
+ }
trainer_state.json ADDED
@@ -0,0 +1,1843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 6000,
3
+ "best_metric": 0.15993832051753998,
4
+ "best_model_checkpoint": "/data/alamparan/mattext_ckpt/results/2026-02-05/13-04-49/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-6000",
5
+ "epoch": 1.938610662358643,
6
+ "eval_steps": 50,
7
+ "global_step": 6000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.01615508885298869,
14
+ "grad_norm": 1.0515121221542358,
15
+ "learning_rate": 0.00019993667205169628,
16
+ "loss": 6.0534,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.01615508885298869,
21
+ "eval_loss": 4.074550151824951,
22
+ "eval_runtime": 174.8477,
23
+ "eval_samples_per_second": 108.689,
24
+ "eval_steps_per_second": 2.265,
25
+ "step": 50
26
+ },
27
+ {
28
+ "epoch": 0.03231017770597738,
29
+ "grad_norm": 1.2968676090240479,
30
+ "learning_rate": 0.00019987205169628432,
31
+ "loss": 3.8424,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.03231017770597738,
36
+ "eval_loss": 3.6214287281036377,
37
+ "eval_runtime": 175.7646,
38
+ "eval_samples_per_second": 108.122,
39
+ "eval_steps_per_second": 2.253,
40
+ "step": 100
41
+ },
42
+ {
43
+ "epoch": 0.048465266558966075,
44
+ "grad_norm": 1.0487699508666992,
45
+ "learning_rate": 0.0001998074313408724,
46
+ "loss": 3.566,
47
+ "step": 150
48
+ },
49
+ {
50
+ "epoch": 0.048465266558966075,
51
+ "eval_loss": 3.430216073989868,
52
+ "eval_runtime": 174.1519,
53
+ "eval_samples_per_second": 109.123,
54
+ "eval_steps_per_second": 2.274,
55
+ "step": 150
56
+ },
57
+ {
58
+ "epoch": 0.06462035541195477,
59
+ "grad_norm": 1.1100188493728638,
60
+ "learning_rate": 0.00019974281098546044,
61
+ "loss": 3.3895,
62
+ "step": 200
63
+ },
64
+ {
65
+ "epoch": 0.06462035541195477,
66
+ "eval_loss": 3.2904512882232666,
67
+ "eval_runtime": 174.3178,
68
+ "eval_samples_per_second": 109.019,
69
+ "eval_steps_per_second": 2.272,
70
+ "step": 200
71
+ },
72
+ {
73
+ "epoch": 0.08077544426494346,
74
+ "grad_norm": 1.1103720664978027,
75
+ "learning_rate": 0.00019967819063004846,
76
+ "loss": 3.2487,
77
+ "step": 250
78
+ },
79
+ {
80
+ "epoch": 0.08077544426494346,
81
+ "eval_loss": 3.155512571334839,
82
+ "eval_runtime": 175.7563,
83
+ "eval_samples_per_second": 108.127,
84
+ "eval_steps_per_second": 2.253,
85
+ "step": 250
86
+ },
87
+ {
88
+ "epoch": 0.09693053311793215,
89
+ "grad_norm": 0.9341705441474915,
90
+ "learning_rate": 0.00019961357027463653,
91
+ "loss": 3.1506,
92
+ "step": 300
93
+ },
94
+ {
95
+ "epoch": 0.09693053311793215,
96
+ "eval_loss": 3.034304618835449,
97
+ "eval_runtime": 173.9011,
98
+ "eval_samples_per_second": 109.28,
99
+ "eval_steps_per_second": 2.277,
100
+ "step": 300
101
+ },
102
+ {
103
+ "epoch": 0.11308562197092084,
104
+ "grad_norm": 1.2189605236053467,
105
+ "learning_rate": 0.00019954894991922457,
106
+ "loss": 3.0199,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.11308562197092084,
111
+ "eval_loss": 2.9081249237060547,
112
+ "eval_runtime": 174.6288,
113
+ "eval_samples_per_second": 108.825,
114
+ "eval_steps_per_second": 2.268,
115
+ "step": 350
116
+ },
117
+ {
118
+ "epoch": 0.12924071082390953,
119
+ "grad_norm": 1.398779273033142,
120
+ "learning_rate": 0.0001994843295638126,
121
+ "loss": 2.9081,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.12924071082390953,
126
+ "eval_loss": 2.8029439449310303,
127
+ "eval_runtime": 175.8793,
128
+ "eval_samples_per_second": 108.051,
129
+ "eval_steps_per_second": 2.252,
130
+ "step": 400
131
+ },
132
+ {
133
+ "epoch": 0.14539579967689822,
134
+ "grad_norm": 1.3859950304031372,
135
+ "learning_rate": 0.00019941970920840066,
136
+ "loss": 2.7946,
137
+ "step": 450
138
+ },
139
+ {
140
+ "epoch": 0.14539579967689822,
141
+ "eval_loss": 2.647810935974121,
142
+ "eval_runtime": 173.9196,
143
+ "eval_samples_per_second": 109.269,
144
+ "eval_steps_per_second": 2.277,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.16155088852988692,
149
+ "grad_norm": 1.5629290342330933,
150
+ "learning_rate": 0.0001993550888529887,
151
+ "loss": 2.6013,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 0.16155088852988692,
156
+ "eval_loss": 2.3850767612457275,
157
+ "eval_runtime": 175.9348,
158
+ "eval_samples_per_second": 108.017,
159
+ "eval_steps_per_second": 2.251,
160
+ "step": 500
161
+ },
162
+ {
163
+ "epoch": 0.1777059773828756,
164
+ "grad_norm": 2.2329182624816895,
165
+ "learning_rate": 0.00019929046849757675,
166
+ "loss": 2.3045,
167
+ "step": 550
168
+ },
169
+ {
170
+ "epoch": 0.1777059773828756,
171
+ "eval_loss": 1.9701340198516846,
172
+ "eval_runtime": 174.0917,
173
+ "eval_samples_per_second": 109.161,
174
+ "eval_steps_per_second": 2.275,
175
+ "step": 550
176
+ },
177
+ {
178
+ "epoch": 0.1938610662358643,
179
+ "grad_norm": 2.0642428398132324,
180
+ "learning_rate": 0.0001992258481421648,
181
+ "loss": 1.8478,
182
+ "step": 600
183
+ },
184
+ {
185
+ "epoch": 0.1938610662358643,
186
+ "eval_loss": 1.3872605562210083,
187
+ "eval_runtime": 182.0251,
188
+ "eval_samples_per_second": 104.403,
189
+ "eval_steps_per_second": 2.176,
190
+ "step": 600
191
+ },
192
+ {
193
+ "epoch": 0.210016155088853,
194
+ "grad_norm": 1.663743495941162,
195
+ "learning_rate": 0.00019916122778675284,
196
+ "loss": 1.4464,
197
+ "step": 650
198
+ },
199
+ {
200
+ "epoch": 0.210016155088853,
201
+ "eval_loss": 1.0700706243515015,
202
+ "eval_runtime": 184.3893,
203
+ "eval_samples_per_second": 103.065,
204
+ "eval_steps_per_second": 2.148,
205
+ "step": 650
206
+ },
207
+ {
208
+ "epoch": 0.22617124394184168,
209
+ "grad_norm": 1.1930618286132812,
210
+ "learning_rate": 0.00019909660743134088,
211
+ "loss": 1.1074,
212
+ "step": 700
213
+ },
214
+ {
215
+ "epoch": 0.22617124394184168,
216
+ "eval_loss": 0.8680539727210999,
217
+ "eval_runtime": 178.1716,
218
+ "eval_samples_per_second": 106.661,
219
+ "eval_steps_per_second": 2.223,
220
+ "step": 700
221
+ },
222
+ {
223
+ "epoch": 0.24232633279483037,
224
+ "grad_norm": 1.1257418394088745,
225
+ "learning_rate": 0.00019903198707592893,
226
+ "loss": 0.9421,
227
+ "step": 750
228
+ },
229
+ {
230
+ "epoch": 0.24232633279483037,
231
+ "eval_loss": 0.7728319764137268,
232
+ "eval_runtime": 176.3872,
233
+ "eval_samples_per_second": 107.74,
234
+ "eval_steps_per_second": 2.245,
235
+ "step": 750
236
+ },
237
+ {
238
+ "epoch": 0.25848142164781907,
239
+ "grad_norm": 0.774366021156311,
240
+ "learning_rate": 0.00019896736672051697,
241
+ "loss": 0.815,
242
+ "step": 800
243
+ },
244
+ {
245
+ "epoch": 0.25848142164781907,
246
+ "eval_loss": 0.7130251526832581,
247
+ "eval_runtime": 177.1295,
248
+ "eval_samples_per_second": 107.289,
249
+ "eval_steps_per_second": 2.236,
250
+ "step": 800
251
+ },
252
+ {
253
+ "epoch": 0.27463651050080773,
254
+ "grad_norm": 0.8781099915504456,
255
+ "learning_rate": 0.00019890274636510502,
256
+ "loss": 0.7576,
257
+ "step": 850
258
+ },
259
+ {
260
+ "epoch": 0.27463651050080773,
261
+ "eval_loss": 0.6742915511131287,
262
+ "eval_runtime": 175.5733,
263
+ "eval_samples_per_second": 108.24,
264
+ "eval_steps_per_second": 2.255,
265
+ "step": 850
266
+ },
267
+ {
268
+ "epoch": 0.29079159935379645,
269
+ "grad_norm": 0.7646985054016113,
270
+ "learning_rate": 0.00019883812600969306,
271
+ "loss": 0.7248,
272
+ "step": 900
273
+ },
274
+ {
275
+ "epoch": 0.29079159935379645,
276
+ "eval_loss": 0.6413119435310364,
277
+ "eval_runtime": 182.9224,
278
+ "eval_samples_per_second": 103.891,
279
+ "eval_steps_per_second": 2.165,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 0.3069466882067851,
284
+ "grad_norm": 0.7410991191864014,
285
+ "learning_rate": 0.0001987735056542811,
286
+ "loss": 0.706,
287
+ "step": 950
288
+ },
289
+ {
290
+ "epoch": 0.3069466882067851,
291
+ "eval_loss": 0.6033533215522766,
292
+ "eval_runtime": 179.7077,
293
+ "eval_samples_per_second": 105.75,
294
+ "eval_steps_per_second": 2.204,
295
+ "step": 950
296
+ },
297
+ {
298
+ "epoch": 0.32310177705977383,
299
+ "grad_norm": 0.7484062910079956,
300
+ "learning_rate": 0.00019870888529886915,
301
+ "loss": 0.6663,
302
+ "step": 1000
303
+ },
304
+ {
305
+ "epoch": 0.32310177705977383,
306
+ "eval_loss": 0.5805819034576416,
307
+ "eval_runtime": 178.089,
308
+ "eval_samples_per_second": 106.711,
309
+ "eval_steps_per_second": 2.224,
310
+ "step": 1000
311
+ },
312
+ {
313
+ "epoch": 0.3392568659127625,
314
+ "grad_norm": 0.749854326248169,
315
+ "learning_rate": 0.0001986442649434572,
316
+ "loss": 0.6597,
317
+ "step": 1050
318
+ },
319
+ {
320
+ "epoch": 0.3392568659127625,
321
+ "eval_loss": 0.5619820356369019,
322
+ "eval_runtime": 177.4502,
323
+ "eval_samples_per_second": 107.095,
324
+ "eval_steps_per_second": 2.232,
325
+ "step": 1050
326
+ },
327
+ {
328
+ "epoch": 0.3554119547657512,
329
+ "grad_norm": 0.7001623511314392,
330
+ "learning_rate": 0.00019857964458804524,
331
+ "loss": 0.6189,
332
+ "step": 1100
333
+ },
334
+ {
335
+ "epoch": 0.3554119547657512,
336
+ "eval_loss": 0.5405182242393494,
337
+ "eval_runtime": 177.9338,
338
+ "eval_samples_per_second": 106.804,
339
+ "eval_steps_per_second": 2.226,
340
+ "step": 1100
341
+ },
342
+ {
343
+ "epoch": 0.3715670436187399,
344
+ "grad_norm": 0.6307182312011719,
345
+ "learning_rate": 0.00019851502423263328,
346
+ "loss": 0.5879,
347
+ "step": 1150
348
+ },
349
+ {
350
+ "epoch": 0.3715670436187399,
351
+ "eval_loss": 0.5224936604499817,
352
+ "eval_runtime": 179.0782,
353
+ "eval_samples_per_second": 106.121,
354
+ "eval_steps_per_second": 2.211,
355
+ "step": 1150
356
+ },
357
+ {
358
+ "epoch": 0.3877221324717286,
359
+ "grad_norm": 0.755315899848938,
360
+ "learning_rate": 0.00019845040387722132,
361
+ "loss": 0.5659,
362
+ "step": 1200
363
+ },
364
+ {
365
+ "epoch": 0.3877221324717286,
366
+ "eval_loss": 0.4987003803253174,
367
+ "eval_runtime": 180.4854,
368
+ "eval_samples_per_second": 105.294,
369
+ "eval_steps_per_second": 2.194,
370
+ "step": 1200
371
+ },
372
+ {
373
+ "epoch": 0.40387722132471726,
374
+ "grad_norm": 0.6601123213768005,
375
+ "learning_rate": 0.0001983857835218094,
376
+ "loss": 0.5484,
377
+ "step": 1250
378
+ },
379
+ {
380
+ "epoch": 0.40387722132471726,
381
+ "eval_loss": 0.48691466450691223,
382
+ "eval_runtime": 185.1754,
383
+ "eval_samples_per_second": 102.627,
384
+ "eval_steps_per_second": 2.139,
385
+ "step": 1250
386
+ },
387
+ {
388
+ "epoch": 0.420032310177706,
389
+ "grad_norm": 0.6862411499023438,
390
+ "learning_rate": 0.0001983211631663974,
391
+ "loss": 0.5404,
392
+ "step": 1300
393
+ },
394
+ {
395
+ "epoch": 0.420032310177706,
396
+ "eval_loss": 0.4613674581050873,
397
+ "eval_runtime": 184.3912,
398
+ "eval_samples_per_second": 103.063,
399
+ "eval_steps_per_second": 2.148,
400
+ "step": 1300
401
+ },
402
+ {
403
+ "epoch": 0.43618739903069464,
404
+ "grad_norm": 0.705555260181427,
405
+ "learning_rate": 0.00019825654281098546,
406
+ "loss": 0.5047,
407
+ "step": 1350
408
+ },
409
+ {
410
+ "epoch": 0.43618739903069464,
411
+ "eval_loss": 0.43498361110687256,
412
+ "eval_runtime": 183.8266,
413
+ "eval_samples_per_second": 103.38,
414
+ "eval_steps_per_second": 2.154,
415
+ "step": 1350
416
+ },
417
+ {
418
+ "epoch": 0.45234248788368336,
419
+ "grad_norm": 0.6185953617095947,
420
+ "learning_rate": 0.00019819192245557353,
421
+ "loss": 0.4912,
422
+ "step": 1400
423
+ },
424
+ {
425
+ "epoch": 0.45234248788368336,
426
+ "eval_loss": 0.411850243806839,
427
+ "eval_runtime": 174.5999,
428
+ "eval_samples_per_second": 108.843,
429
+ "eval_steps_per_second": 2.268,
430
+ "step": 1400
431
+ },
432
+ {
433
+ "epoch": 0.46849757673667203,
434
+ "grad_norm": 0.6479601860046387,
435
+ "learning_rate": 0.00019812730210016157,
436
+ "loss": 0.4377,
437
+ "step": 1450
438
+ },
439
+ {
440
+ "epoch": 0.46849757673667203,
441
+ "eval_loss": 0.4002995789051056,
442
+ "eval_runtime": 174.5791,
443
+ "eval_samples_per_second": 108.856,
444
+ "eval_steps_per_second": 2.268,
445
+ "step": 1450
446
+ },
447
+ {
448
+ "epoch": 0.48465266558966075,
449
+ "grad_norm": 0.7221212983131409,
450
+ "learning_rate": 0.0001980626817447496,
451
+ "loss": 0.4106,
452
+ "step": 1500
453
+ },
454
+ {
455
+ "epoch": 0.48465266558966075,
456
+ "eval_loss": 0.38361039757728577,
457
+ "eval_runtime": 176.2527,
458
+ "eval_samples_per_second": 107.822,
459
+ "eval_steps_per_second": 2.247,
460
+ "step": 1500
461
+ },
462
+ {
463
+ "epoch": 0.5008077544426495,
464
+ "grad_norm": 0.6264716386795044,
465
+ "learning_rate": 0.00019799806138933766,
466
+ "loss": 0.416,
467
+ "step": 1550
468
+ },
469
+ {
470
+ "epoch": 0.5008077544426495,
471
+ "eval_loss": 0.3711036145687103,
472
+ "eval_runtime": 175.7893,
473
+ "eval_samples_per_second": 108.107,
474
+ "eval_steps_per_second": 2.253,
475
+ "step": 1550
476
+ },
477
+ {
478
+ "epoch": 0.5169628432956381,
479
+ "grad_norm": 0.6347914934158325,
480
+ "learning_rate": 0.0001979334410339257,
481
+ "loss": 0.4034,
482
+ "step": 1600
483
+ },
484
+ {
485
+ "epoch": 0.5169628432956381,
486
+ "eval_loss": 0.3516336679458618,
487
+ "eval_runtime": 174.6119,
488
+ "eval_samples_per_second": 108.836,
489
+ "eval_steps_per_second": 2.268,
490
+ "step": 1600
491
+ },
492
+ {
493
+ "epoch": 0.5331179321486268,
494
+ "grad_norm": 0.6225407123565674,
495
+ "learning_rate": 0.00019786882067851372,
496
+ "loss": 0.3762,
497
+ "step": 1650
498
+ },
499
+ {
500
+ "epoch": 0.5331179321486268,
501
+ "eval_loss": 0.3361986577510834,
502
+ "eval_runtime": 175.0444,
503
+ "eval_samples_per_second": 108.567,
504
+ "eval_steps_per_second": 2.262,
505
+ "step": 1650
506
+ },
507
+ {
508
+ "epoch": 0.5492730210016155,
509
+ "grad_norm": 0.7712708711624146,
510
+ "learning_rate": 0.0001978042003231018,
511
+ "loss": 0.3669,
512
+ "step": 1700
513
+ },
514
+ {
515
+ "epoch": 0.5492730210016155,
516
+ "eval_loss": 0.31636813282966614,
517
+ "eval_runtime": 175.3287,
518
+ "eval_samples_per_second": 108.391,
519
+ "eval_steps_per_second": 2.259,
520
+ "step": 1700
521
+ },
522
+ {
523
+ "epoch": 0.5654281098546042,
524
+ "grad_norm": 0.6715072989463806,
525
+ "learning_rate": 0.00019773957996768984,
526
+ "loss": 0.3441,
527
+ "step": 1750
528
+ },
529
+ {
530
+ "epoch": 0.5654281098546042,
531
+ "eval_loss": 0.30088016390800476,
532
+ "eval_runtime": 165.2219,
533
+ "eval_samples_per_second": 115.021,
534
+ "eval_steps_per_second": 2.397,
535
+ "step": 1750
536
+ },
537
+ {
538
+ "epoch": 0.5815831987075929,
539
+ "grad_norm": 0.585584282875061,
540
+ "learning_rate": 0.00019767495961227788,
541
+ "loss": 0.3267,
542
+ "step": 1800
543
+ },
544
+ {
545
+ "epoch": 0.5815831987075929,
546
+ "eval_loss": 0.28820616006851196,
547
+ "eval_runtime": 163.4002,
548
+ "eval_samples_per_second": 116.303,
549
+ "eval_steps_per_second": 2.423,
550
+ "step": 1800
551
+ },
552
+ {
553
+ "epoch": 0.5977382875605816,
554
+ "grad_norm": 0.7109031677246094,
555
+ "learning_rate": 0.00019761033925686593,
556
+ "loss": 0.3056,
557
+ "step": 1850
558
+ },
559
+ {
560
+ "epoch": 0.5977382875605816,
561
+ "eval_loss": 0.2841907739639282,
562
+ "eval_runtime": 163.4248,
563
+ "eval_samples_per_second": 116.286,
564
+ "eval_steps_per_second": 2.423,
565
+ "step": 1850
566
+ },
567
+ {
568
+ "epoch": 0.6138933764135702,
569
+ "grad_norm": 0.5637331008911133,
570
+ "learning_rate": 0.00019754571890145397,
571
+ "loss": 0.3018,
572
+ "step": 1900
573
+ },
574
+ {
575
+ "epoch": 0.6138933764135702,
576
+ "eval_loss": 0.27677446603775024,
577
+ "eval_runtime": 168.0175,
578
+ "eval_samples_per_second": 113.107,
579
+ "eval_steps_per_second": 2.357,
580
+ "step": 1900
581
+ },
582
+ {
583
+ "epoch": 0.630048465266559,
584
+ "grad_norm": 0.6805331110954285,
585
+ "learning_rate": 0.00019748109854604202,
586
+ "loss": 0.2974,
587
+ "step": 1950
588
+ },
589
+ {
590
+ "epoch": 0.630048465266559,
591
+ "eval_loss": 0.2690126597881317,
592
+ "eval_runtime": 169.2939,
593
+ "eval_samples_per_second": 112.254,
594
+ "eval_steps_per_second": 2.339,
595
+ "step": 1950
596
+ },
597
+ {
598
+ "epoch": 0.6462035541195477,
599
+ "grad_norm": 0.5953163504600525,
600
+ "learning_rate": 0.00019741647819063006,
601
+ "loss": 0.2824,
602
+ "step": 2000
603
+ },
604
+ {
605
+ "epoch": 0.6462035541195477,
606
+ "eval_loss": 0.2670688331127167,
607
+ "eval_runtime": 165.9823,
608
+ "eval_samples_per_second": 114.494,
609
+ "eval_steps_per_second": 2.386,
610
+ "step": 2000
611
+ },
612
+ {
613
+ "epoch": 0.6623586429725363,
614
+ "grad_norm": 0.6007310152053833,
615
+ "learning_rate": 0.0001973518578352181,
616
+ "loss": 0.2772,
617
+ "step": 2050
618
+ },
619
+ {
620
+ "epoch": 0.6623586429725363,
621
+ "eval_loss": 0.26082342863082886,
622
+ "eval_runtime": 171.6359,
623
+ "eval_samples_per_second": 110.723,
624
+ "eval_steps_per_second": 2.307,
625
+ "step": 2050
626
+ },
627
+ {
628
+ "epoch": 0.678513731825525,
629
+ "grad_norm": 0.4502333700656891,
630
+ "learning_rate": 0.00019728723747980615,
631
+ "loss": 0.2689,
632
+ "step": 2100
633
+ },
634
+ {
635
+ "epoch": 0.678513731825525,
636
+ "eval_loss": 0.2507447898387909,
637
+ "eval_runtime": 165.8831,
638
+ "eval_samples_per_second": 114.563,
639
+ "eval_steps_per_second": 2.387,
640
+ "step": 2100
641
+ },
642
+ {
643
+ "epoch": 0.6946688206785138,
644
+ "grad_norm": 0.5430248975753784,
645
+ "learning_rate": 0.0001972226171243942,
646
+ "loss": 0.2599,
647
+ "step": 2150
648
+ },
649
+ {
650
+ "epoch": 0.6946688206785138,
651
+ "eval_loss": 0.247538760304451,
652
+ "eval_runtime": 163.9496,
653
+ "eval_samples_per_second": 115.914,
654
+ "eval_steps_per_second": 2.415,
655
+ "step": 2150
656
+ },
657
+ {
658
+ "epoch": 0.7108239095315024,
659
+ "grad_norm": 0.6169001460075378,
660
+ "learning_rate": 0.00019715799676898224,
661
+ "loss": 0.2568,
662
+ "step": 2200
663
+ },
664
+ {
665
+ "epoch": 0.7108239095315024,
666
+ "eval_loss": 0.24606221914291382,
667
+ "eval_runtime": 169.0704,
668
+ "eval_samples_per_second": 112.403,
669
+ "eval_steps_per_second": 2.342,
670
+ "step": 2200
671
+ },
672
+ {
673
+ "epoch": 0.7269789983844911,
674
+ "grad_norm": 0.48222729563713074,
675
+ "learning_rate": 0.00019709337641357028,
676
+ "loss": 0.258,
677
+ "step": 2250
678
+ },
679
+ {
680
+ "epoch": 0.7269789983844911,
681
+ "eval_loss": 0.24414058029651642,
682
+ "eval_runtime": 165.7073,
683
+ "eval_samples_per_second": 114.684,
684
+ "eval_steps_per_second": 2.39,
685
+ "step": 2250
686
+ },
687
+ {
688
+ "epoch": 0.7431340872374798,
689
+ "grad_norm": 0.5104192495346069,
690
+ "learning_rate": 0.00019702875605815833,
691
+ "loss": 0.2428,
692
+ "step": 2300
693
+ },
694
+ {
695
+ "epoch": 0.7431340872374798,
696
+ "eval_loss": 0.2404615581035614,
697
+ "eval_runtime": 168.1658,
698
+ "eval_samples_per_second": 113.008,
699
+ "eval_steps_per_second": 2.355,
700
+ "step": 2300
701
+ },
702
+ {
703
+ "epoch": 0.7592891760904685,
704
+ "grad_norm": 0.4499678909778595,
705
+ "learning_rate": 0.00019696413570274637,
706
+ "loss": 0.2503,
707
+ "step": 2350
708
+ },
709
+ {
710
+ "epoch": 0.7592891760904685,
711
+ "eval_loss": 0.23556502163410187,
712
+ "eval_runtime": 166.7646,
713
+ "eval_samples_per_second": 113.957,
714
+ "eval_steps_per_second": 2.375,
715
+ "step": 2350
716
+ },
717
+ {
718
+ "epoch": 0.7754442649434572,
719
+ "grad_norm": 0.5658778548240662,
720
+ "learning_rate": 0.00019689951534733441,
721
+ "loss": 0.2501,
722
+ "step": 2400
723
+ },
724
+ {
725
+ "epoch": 0.7754442649434572,
726
+ "eval_loss": 0.2322016805410385,
727
+ "eval_runtime": 165.6498,
728
+ "eval_samples_per_second": 114.724,
729
+ "eval_steps_per_second": 2.391,
730
+ "step": 2400
731
+ },
732
+ {
733
+ "epoch": 0.7915993537964459,
734
+ "grad_norm": 0.5279048681259155,
735
+ "learning_rate": 0.00019683489499192246,
736
+ "loss": 0.2439,
737
+ "step": 2450
738
+ },
739
+ {
740
+ "epoch": 0.7915993537964459,
741
+ "eval_loss": 0.22799938917160034,
742
+ "eval_runtime": 172.0629,
743
+ "eval_samples_per_second": 110.448,
744
+ "eval_steps_per_second": 2.301,
745
+ "step": 2450
746
+ },
747
+ {
748
+ "epoch": 0.8077544426494345,
749
+ "grad_norm": 0.5539276003837585,
750
+ "learning_rate": 0.00019677027463651053,
751
+ "loss": 0.2371,
752
+ "step": 2500
753
+ },
754
+ {
755
+ "epoch": 0.8077544426494345,
756
+ "eval_loss": 0.22852076590061188,
757
+ "eval_runtime": 164.2885,
758
+ "eval_samples_per_second": 115.675,
759
+ "eval_steps_per_second": 2.41,
760
+ "step": 2500
761
+ },
762
+ {
763
+ "epoch": 0.8239095315024233,
764
+ "grad_norm": 0.46706199645996094,
765
+ "learning_rate": 0.00019670565428109855,
766
+ "loss": 0.2265,
767
+ "step": 2550
768
+ },
769
+ {
770
+ "epoch": 0.8239095315024233,
771
+ "eval_loss": 0.22647793591022491,
772
+ "eval_runtime": 166.7134,
773
+ "eval_samples_per_second": 113.992,
774
+ "eval_steps_per_second": 2.375,
775
+ "step": 2550
776
+ },
777
+ {
778
+ "epoch": 0.840064620355412,
779
+ "grad_norm": 0.47346100211143494,
780
+ "learning_rate": 0.0001966410339256866,
781
+ "loss": 0.2389,
782
+ "step": 2600
783
+ },
784
+ {
785
+ "epoch": 0.840064620355412,
786
+ "eval_loss": 0.22408178448677063,
787
+ "eval_runtime": 163.986,
788
+ "eval_samples_per_second": 115.888,
789
+ "eval_steps_per_second": 2.415,
790
+ "step": 2600
791
+ },
792
+ {
793
+ "epoch": 0.8562197092084006,
794
+ "grad_norm": 0.4981846809387207,
795
+ "learning_rate": 0.00019657641357027466,
796
+ "loss": 0.233,
797
+ "step": 2650
798
+ },
799
+ {
800
+ "epoch": 0.8562197092084006,
801
+ "eval_loss": 0.21772241592407227,
802
+ "eval_runtime": 164.9909,
803
+ "eval_samples_per_second": 115.182,
804
+ "eval_steps_per_second": 2.4,
805
+ "step": 2650
806
+ },
807
+ {
808
+ "epoch": 0.8723747980613893,
809
+ "grad_norm": 0.39221659302711487,
810
+ "learning_rate": 0.00019651179321486268,
811
+ "loss": 0.2274,
812
+ "step": 2700
813
+ },
814
+ {
815
+ "epoch": 0.8723747980613893,
816
+ "eval_loss": 0.21865881979465485,
817
+ "eval_runtime": 164.8791,
818
+ "eval_samples_per_second": 115.26,
819
+ "eval_steps_per_second": 2.402,
820
+ "step": 2700
821
+ },
822
+ {
823
+ "epoch": 0.8885298869143781,
824
+ "grad_norm": 0.3755420446395874,
825
+ "learning_rate": 0.00019644717285945072,
826
+ "loss": 0.2212,
827
+ "step": 2750
828
+ },
829
+ {
830
+ "epoch": 0.8885298869143781,
831
+ "eval_loss": 0.2188514620065689,
832
+ "eval_runtime": 163.9572,
833
+ "eval_samples_per_second": 115.908,
834
+ "eval_steps_per_second": 2.415,
835
+ "step": 2750
836
+ },
837
+ {
838
+ "epoch": 0.9046849757673667,
839
+ "grad_norm": 0.4831067621707916,
840
+ "learning_rate": 0.0001963825525040388,
841
+ "loss": 0.2174,
842
+ "step": 2800
843
+ },
844
+ {
845
+ "epoch": 0.9046849757673667,
846
+ "eval_loss": 0.2164352685213089,
847
+ "eval_runtime": 164.722,
848
+ "eval_samples_per_second": 115.37,
849
+ "eval_steps_per_second": 2.404,
850
+ "step": 2800
851
+ },
852
+ {
853
+ "epoch": 0.9208400646203554,
854
+ "grad_norm": 0.4211012125015259,
855
+ "learning_rate": 0.00019631793214862684,
856
+ "loss": 0.2193,
857
+ "step": 2850
858
+ },
859
+ {
860
+ "epoch": 0.9208400646203554,
861
+ "eval_loss": 0.21268193423748016,
862
+ "eval_runtime": 173.6453,
863
+ "eval_samples_per_second": 109.442,
864
+ "eval_steps_per_second": 2.281,
865
+ "step": 2850
866
+ },
867
+ {
868
+ "epoch": 0.9369951534733441,
869
+ "grad_norm": 0.41934114694595337,
870
+ "learning_rate": 0.00019625331179321486,
871
+ "loss": 0.2146,
872
+ "step": 2900
873
+ },
874
+ {
875
+ "epoch": 0.9369951534733441,
876
+ "eval_loss": 0.21176370978355408,
877
+ "eval_runtime": 167.1164,
878
+ "eval_samples_per_second": 113.717,
879
+ "eval_steps_per_second": 2.37,
880
+ "step": 2900
881
+ },
882
+ {
883
+ "epoch": 0.9531502423263328,
884
+ "grad_norm": 0.4452735483646393,
885
+ "learning_rate": 0.00019618869143780293,
886
+ "loss": 0.2073,
887
+ "step": 2950
888
+ },
889
+ {
890
+ "epoch": 0.9531502423263328,
891
+ "eval_loss": 0.20772910118103027,
892
+ "eval_runtime": 126.5423,
893
+ "eval_samples_per_second": 150.179,
894
+ "eval_steps_per_second": 3.129,
895
+ "step": 2950
896
+ },
897
+ {
898
+ "epoch": 0.9693053311793215,
899
+ "grad_norm": 0.46377331018447876,
900
+ "learning_rate": 0.00019612407108239097,
901
+ "loss": 0.2093,
902
+ "step": 3000
903
+ },
904
+ {
905
+ "epoch": 0.9693053311793215,
906
+ "eval_loss": 0.2042500376701355,
907
+ "eval_runtime": 109.4901,
908
+ "eval_samples_per_second": 173.568,
909
+ "eval_steps_per_second": 3.617,
910
+ "step": 3000
911
+ },
912
+ {
913
+ "epoch": 0.9854604200323102,
914
+ "grad_norm": 0.5356667041778564,
915
+ "learning_rate": 0.000196059450726979,
916
+ "loss": 0.2082,
917
+ "step": 3050
918
+ },
919
+ {
920
+ "epoch": 0.9854604200323102,
921
+ "eval_loss": 0.20467719435691833,
922
+ "eval_runtime": 108.3027,
923
+ "eval_samples_per_second": 175.471,
924
+ "eval_steps_per_second": 3.656,
925
+ "step": 3050
926
+ },
927
+ {
928
+ "epoch": 1.001615508885299,
929
+ "grad_norm": 0.5465859770774841,
930
+ "learning_rate": 0.00019599483037156706,
931
+ "loss": 0.2021,
932
+ "step": 3100
933
+ },
934
+ {
935
+ "epoch": 1.001615508885299,
936
+ "eval_loss": 0.2056320309638977,
937
+ "eval_runtime": 112.7152,
938
+ "eval_samples_per_second": 168.602,
939
+ "eval_steps_per_second": 3.513,
940
+ "step": 3100
941
+ },
942
+ {
943
+ "epoch": 1.0177705977382876,
944
+ "grad_norm": 0.5674402713775635,
945
+ "learning_rate": 0.0001959302100161551,
946
+ "loss": 0.2052,
947
+ "step": 3150
948
+ },
949
+ {
950
+ "epoch": 1.0177705977382876,
951
+ "eval_loss": 0.20261099934577942,
952
+ "eval_runtime": 114.1889,
953
+ "eval_samples_per_second": 166.426,
954
+ "eval_steps_per_second": 3.468,
955
+ "step": 3150
956
+ },
957
+ {
958
+ "epoch": 1.0339256865912763,
959
+ "grad_norm": 0.44871240854263306,
960
+ "learning_rate": 0.00019586558966074315,
961
+ "loss": 0.2091,
962
+ "step": 3200
963
+ },
964
+ {
965
+ "epoch": 1.0339256865912763,
966
+ "eval_loss": 0.19862857460975647,
967
+ "eval_runtime": 111.4553,
968
+ "eval_samples_per_second": 170.508,
969
+ "eval_steps_per_second": 3.553,
970
+ "step": 3200
971
+ },
972
+ {
973
+ "epoch": 1.050080775444265,
974
+ "grad_norm": 0.3680683374404907,
975
+ "learning_rate": 0.0001958009693053312,
976
+ "loss": 0.1998,
977
+ "step": 3250
978
+ },
979
+ {
980
+ "epoch": 1.050080775444265,
981
+ "eval_loss": 0.1985771656036377,
982
+ "eval_runtime": 108.0946,
983
+ "eval_samples_per_second": 175.809,
984
+ "eval_steps_per_second": 3.663,
985
+ "step": 3250
986
+ },
987
+ {
988
+ "epoch": 1.0662358642972536,
989
+ "grad_norm": 0.4568157196044922,
990
+ "learning_rate": 0.00019573634894991924,
991
+ "loss": 0.19,
992
+ "step": 3300
993
+ },
994
+ {
995
+ "epoch": 1.0662358642972536,
996
+ "eval_loss": 0.19872696697711945,
997
+ "eval_runtime": 109.2206,
998
+ "eval_samples_per_second": 173.997,
999
+ "eval_steps_per_second": 3.626,
1000
+ "step": 3300
1001
+ },
1002
+ {
1003
+ "epoch": 1.0823909531502423,
1004
+ "grad_norm": 0.4335425794124603,
1005
+ "learning_rate": 0.00019567172859450728,
1006
+ "loss": 0.2,
1007
+ "step": 3350
1008
+ },
1009
+ {
1010
+ "epoch": 1.0823909531502423,
1011
+ "eval_loss": 0.19521376490592957,
1012
+ "eval_runtime": 109.0433,
1013
+ "eval_samples_per_second": 174.279,
1014
+ "eval_steps_per_second": 3.632,
1015
+ "step": 3350
1016
+ },
1017
+ {
1018
+ "epoch": 1.098546042003231,
1019
+ "grad_norm": 0.3882080316543579,
1020
+ "learning_rate": 0.00019560710823909533,
1021
+ "loss": 0.1937,
1022
+ "step": 3400
1023
+ },
1024
+ {
1025
+ "epoch": 1.098546042003231,
1026
+ "eval_loss": 0.19395685195922852,
1027
+ "eval_runtime": 111.4962,
1028
+ "eval_samples_per_second": 170.445,
1029
+ "eval_steps_per_second": 3.552,
1030
+ "step": 3400
1031
+ },
1032
+ {
1033
+ "epoch": 1.1147011308562198,
1034
+ "grad_norm": 0.4501712918281555,
1035
+ "learning_rate": 0.00019554248788368337,
1036
+ "loss": 0.2047,
1037
+ "step": 3450
1038
+ },
1039
+ {
1040
+ "epoch": 1.1147011308562198,
1041
+ "eval_loss": 0.1951920986175537,
1042
+ "eval_runtime": 108.988,
1043
+ "eval_samples_per_second": 174.368,
1044
+ "eval_steps_per_second": 3.633,
1045
+ "step": 3450
1046
+ },
1047
+ {
1048
+ "epoch": 1.1308562197092085,
1049
+ "grad_norm": 0.37776461243629456,
1050
+ "learning_rate": 0.00019547786752827141,
1051
+ "loss": 0.1935,
1052
+ "step": 3500
1053
+ },
1054
+ {
1055
+ "epoch": 1.1308562197092085,
1056
+ "eval_loss": 0.19314581155776978,
1057
+ "eval_runtime": 106.1233,
1058
+ "eval_samples_per_second": 179.075,
1059
+ "eval_steps_per_second": 3.732,
1060
+ "step": 3500
1061
+ },
1062
+ {
1063
+ "epoch": 1.1470113085621971,
1064
+ "grad_norm": 0.43368467688560486,
1065
+ "learning_rate": 0.00019541324717285946,
1066
+ "loss": 0.1915,
1067
+ "step": 3550
1068
+ },
1069
+ {
1070
+ "epoch": 1.1470113085621971,
1071
+ "eval_loss": 0.1909024566411972,
1072
+ "eval_runtime": 114.5384,
1073
+ "eval_samples_per_second": 165.918,
1074
+ "eval_steps_per_second": 3.457,
1075
+ "step": 3550
1076
+ },
1077
+ {
1078
+ "epoch": 1.1631663974151858,
1079
+ "grad_norm": 0.4507925510406494,
1080
+ "learning_rate": 0.0001953486268174475,
1081
+ "loss": 0.1893,
1082
+ "step": 3600
1083
+ },
1084
+ {
1085
+ "epoch": 1.1631663974151858,
1086
+ "eval_loss": 0.1882883608341217,
1087
+ "eval_runtime": 113.0984,
1088
+ "eval_samples_per_second": 168.031,
1089
+ "eval_steps_per_second": 3.501,
1090
+ "step": 3600
1091
+ },
1092
+ {
1093
+ "epoch": 1.1793214862681745,
1094
+ "grad_norm": 0.4633695185184479,
1095
+ "learning_rate": 0.00019528400646203555,
1096
+ "loss": 0.1877,
1097
+ "step": 3650
1098
+ },
1099
+ {
1100
+ "epoch": 1.1793214862681745,
1101
+ "eval_loss": 0.19144870340824127,
1102
+ "eval_runtime": 110.1685,
1103
+ "eval_samples_per_second": 172.499,
1104
+ "eval_steps_per_second": 3.594,
1105
+ "step": 3650
1106
+ },
1107
+ {
1108
+ "epoch": 1.1954765751211631,
1109
+ "grad_norm": 0.37872394919395447,
1110
+ "learning_rate": 0.0001952193861066236,
1111
+ "loss": 0.1899,
1112
+ "step": 3700
1113
+ },
1114
+ {
1115
+ "epoch": 1.1954765751211631,
1116
+ "eval_loss": 0.18969100713729858,
1117
+ "eval_runtime": 114.4541,
1118
+ "eval_samples_per_second": 166.04,
1119
+ "eval_steps_per_second": 3.46,
1120
+ "step": 3700
1121
+ },
1122
+ {
1123
+ "epoch": 1.2116316639741518,
1124
+ "grad_norm": 0.4261837899684906,
1125
+ "learning_rate": 0.00019515476575121164,
1126
+ "loss": 0.189,
1127
+ "step": 3750
1128
+ },
1129
+ {
1130
+ "epoch": 1.2116316639741518,
1131
+ "eval_loss": 0.18924662470817566,
1132
+ "eval_runtime": 111.8806,
1133
+ "eval_samples_per_second": 169.86,
1134
+ "eval_steps_per_second": 3.539,
1135
+ "step": 3750
1136
+ },
1137
+ {
1138
+ "epoch": 1.2277867528271407,
1139
+ "grad_norm": 0.4276933968067169,
1140
+ "learning_rate": 0.00019509014539579968,
1141
+ "loss": 0.1882,
1142
+ "step": 3800
1143
+ },
1144
+ {
1145
+ "epoch": 1.2277867528271407,
1146
+ "eval_loss": 0.18727388978004456,
1147
+ "eval_runtime": 114.5697,
1148
+ "eval_samples_per_second": 165.873,
1149
+ "eval_steps_per_second": 3.456,
1150
+ "step": 3800
1151
+ },
1152
+ {
1153
+ "epoch": 1.2439418416801293,
1154
+ "grad_norm": 0.451194167137146,
1155
+ "learning_rate": 0.00019502552504038772,
1156
+ "loss": 0.1876,
1157
+ "step": 3850
1158
+ },
1159
+ {
1160
+ "epoch": 1.2439418416801293,
1161
+ "eval_loss": 0.18400390446186066,
1162
+ "eval_runtime": 110.0659,
1163
+ "eval_samples_per_second": 172.66,
1164
+ "eval_steps_per_second": 3.598,
1165
+ "step": 3850
1166
+ },
1167
+ {
1168
+ "epoch": 1.260096930533118,
1169
+ "grad_norm": 0.42289528250694275,
1170
+ "learning_rate": 0.0001949609046849758,
1171
+ "loss": 0.1764,
1172
+ "step": 3900
1173
+ },
1174
+ {
1175
+ "epoch": 1.260096930533118,
1176
+ "eval_loss": 0.1845184564590454,
1177
+ "eval_runtime": 109.3321,
1178
+ "eval_samples_per_second": 173.819,
1179
+ "eval_steps_per_second": 3.622,
1180
+ "step": 3900
1181
+ },
1182
+ {
1183
+ "epoch": 1.2762520193861067,
1184
+ "grad_norm": 0.4245447814464569,
1185
+ "learning_rate": 0.0001948962843295638,
1186
+ "loss": 0.1843,
1187
+ "step": 3950
1188
+ },
1189
+ {
1190
+ "epoch": 1.2762520193861067,
1191
+ "eval_loss": 0.18434682488441467,
1192
+ "eval_runtime": 110.8391,
1193
+ "eval_samples_per_second": 171.456,
1194
+ "eval_steps_per_second": 3.573,
1195
+ "step": 3950
1196
+ },
1197
+ {
1198
+ "epoch": 1.2924071082390953,
1199
+ "grad_norm": 0.4191521406173706,
1200
+ "learning_rate": 0.00019483166397415186,
1201
+ "loss": 0.1851,
1202
+ "step": 4000
1203
+ },
1204
+ {
1205
+ "epoch": 1.2924071082390953,
1206
+ "eval_loss": 0.18124856054782867,
1207
+ "eval_runtime": 107.9981,
1208
+ "eval_samples_per_second": 175.966,
1209
+ "eval_steps_per_second": 3.667,
1210
+ "step": 4000
1211
+ },
1212
+ {
1213
+ "epoch": 1.308562197092084,
1214
+ "grad_norm": 0.3390869200229645,
1215
+ "learning_rate": 0.00019476704361873993,
1216
+ "loss": 0.1843,
1217
+ "step": 4050
1218
+ },
1219
+ {
1220
+ "epoch": 1.308562197092084,
1221
+ "eval_loss": 0.18725259602069855,
1222
+ "eval_runtime": 106.2756,
1223
+ "eval_samples_per_second": 178.818,
1224
+ "eval_steps_per_second": 3.726,
1225
+ "step": 4050
1226
+ },
1227
+ {
1228
+ "epoch": 1.3247172859450727,
1229
+ "grad_norm": 0.444640189409256,
1230
+ "learning_rate": 0.00019470242326332794,
1231
+ "loss": 0.1827,
1232
+ "step": 4100
1233
+ },
1234
+ {
1235
+ "epoch": 1.3247172859450727,
1236
+ "eval_loss": 0.1816088706254959,
1237
+ "eval_runtime": 110.0768,
1238
+ "eval_samples_per_second": 172.643,
1239
+ "eval_steps_per_second": 3.597,
1240
+ "step": 4100
1241
+ },
1242
+ {
1243
+ "epoch": 1.3408723747980613,
1244
+ "grad_norm": 0.42469364404678345,
1245
+ "learning_rate": 0.000194637802907916,
1246
+ "loss": 0.1839,
1247
+ "step": 4150
1248
+ },
1249
+ {
1250
+ "epoch": 1.3408723747980613,
1251
+ "eval_loss": 0.18320384621620178,
1252
+ "eval_runtime": 110.8523,
1253
+ "eval_samples_per_second": 171.435,
1254
+ "eval_steps_per_second": 3.572,
1255
+ "step": 4150
1256
+ },
1257
+ {
1258
+ "epoch": 1.35702746365105,
1259
+ "grad_norm": 0.48619943857192993,
1260
+ "learning_rate": 0.00019457318255250406,
1261
+ "loss": 0.1792,
1262
+ "step": 4200
1263
+ },
1264
+ {
1265
+ "epoch": 1.35702746365105,
1266
+ "eval_loss": 0.17806969583034515,
1267
+ "eval_runtime": 112.0106,
1268
+ "eval_samples_per_second": 169.663,
1269
+ "eval_steps_per_second": 3.535,
1270
+ "step": 4200
1271
+ },
1272
+ {
1273
+ "epoch": 1.3731825525040389,
1274
+ "grad_norm": 0.4447220265865326,
1275
+ "learning_rate": 0.0001945085621970921,
1276
+ "loss": 0.1772,
1277
+ "step": 4250
1278
+ },
1279
+ {
1280
+ "epoch": 1.3731825525040389,
1281
+ "eval_loss": 0.18053770065307617,
1282
+ "eval_runtime": 109.3432,
1283
+ "eval_samples_per_second": 173.801,
1284
+ "eval_steps_per_second": 3.622,
1285
+ "step": 4250
1286
+ },
1287
+ {
1288
+ "epoch": 1.3893376413570275,
1289
+ "grad_norm": 0.418562114238739,
1290
+ "learning_rate": 0.00019444394184168012,
1291
+ "loss": 0.1805,
1292
+ "step": 4300
1293
+ },
1294
+ {
1295
+ "epoch": 1.3893376413570275,
1296
+ "eval_loss": 0.17956310510635376,
1297
+ "eval_runtime": 109.4642,
1298
+ "eval_samples_per_second": 173.609,
1299
+ "eval_steps_per_second": 3.618,
1300
+ "step": 4300
1301
+ },
1302
+ {
1303
+ "epoch": 1.4054927302100162,
1304
+ "grad_norm": 0.4397905170917511,
1305
+ "learning_rate": 0.0001943793214862682,
1306
+ "loss": 0.176,
1307
+ "step": 4350
1308
+ },
1309
+ {
1310
+ "epoch": 1.4054927302100162,
1311
+ "eval_loss": 0.1766211986541748,
1312
+ "eval_runtime": 118.1975,
1313
+ "eval_samples_per_second": 160.782,
1314
+ "eval_steps_per_second": 3.35,
1315
+ "step": 4350
1316
+ },
1317
+ {
1318
+ "epoch": 1.4216478190630049,
1319
+ "grad_norm": 0.3560314476490021,
1320
+ "learning_rate": 0.00019431470113085624,
1321
+ "loss": 0.1719,
1322
+ "step": 4400
1323
+ },
1324
+ {
1325
+ "epoch": 1.4216478190630049,
1326
+ "eval_loss": 0.17780330777168274,
1327
+ "eval_runtime": 109.6268,
1328
+ "eval_samples_per_second": 173.352,
1329
+ "eval_steps_per_second": 3.612,
1330
+ "step": 4400
1331
+ },
1332
+ {
1333
+ "epoch": 1.4378029079159935,
1334
+ "grad_norm": 0.3879343271255493,
1335
+ "learning_rate": 0.00019425008077544425,
1336
+ "loss": 0.1733,
1337
+ "step": 4450
1338
+ },
1339
+ {
1340
+ "epoch": 1.4378029079159935,
1341
+ "eval_loss": 0.17606207728385925,
1342
+ "eval_runtime": 109.7272,
1343
+ "eval_samples_per_second": 173.193,
1344
+ "eval_steps_per_second": 3.609,
1345
+ "step": 4450
1346
+ },
1347
+ {
1348
+ "epoch": 1.4539579967689822,
1349
+ "grad_norm": 0.4320586919784546,
1350
+ "learning_rate": 0.00019418546042003233,
1351
+ "loss": 0.1721,
1352
+ "step": 4500
1353
+ },
1354
+ {
1355
+ "epoch": 1.4539579967689822,
1356
+ "eval_loss": 0.17723843455314636,
1357
+ "eval_runtime": 107.8177,
1358
+ "eval_samples_per_second": 176.261,
1359
+ "eval_steps_per_second": 3.673,
1360
+ "step": 4500
1361
+ },
1362
+ {
1363
+ "epoch": 1.4701130856219708,
1364
+ "grad_norm": 0.31035879254341125,
1365
+ "learning_rate": 0.00019412084006462037,
1366
+ "loss": 0.1689,
1367
+ "step": 4550
1368
+ },
1369
+ {
1370
+ "epoch": 1.4701130856219708,
1371
+ "eval_loss": 0.17564034461975098,
1372
+ "eval_runtime": 109.0906,
1373
+ "eval_samples_per_second": 174.204,
1374
+ "eval_steps_per_second": 3.63,
1375
+ "step": 4550
1376
+ },
1377
+ {
1378
+ "epoch": 1.4862681744749597,
1379
+ "grad_norm": 0.37017735838890076,
1380
+ "learning_rate": 0.00019405621970920841,
1381
+ "loss": 0.1771,
1382
+ "step": 4600
1383
+ },
1384
+ {
1385
+ "epoch": 1.4862681744749597,
1386
+ "eval_loss": 0.1736358106136322,
1387
+ "eval_runtime": 110.4481,
1388
+ "eval_samples_per_second": 172.063,
1389
+ "eval_steps_per_second": 3.585,
1390
+ "step": 4600
1391
+ },
1392
+ {
1393
+ "epoch": 1.5024232633279482,
1394
+ "grad_norm": 0.5021731853485107,
1395
+ "learning_rate": 0.00019399159935379646,
1396
+ "loss": 0.1717,
1397
+ "step": 4650
1398
+ },
1399
+ {
1400
+ "epoch": 1.5024232633279482,
1401
+ "eval_loss": 0.17395810782909393,
1402
+ "eval_runtime": 110.9158,
1403
+ "eval_samples_per_second": 171.337,
1404
+ "eval_steps_per_second": 3.57,
1405
+ "step": 4650
1406
+ },
1407
+ {
1408
+ "epoch": 1.518578352180937,
1409
+ "grad_norm": 0.3692797124385834,
1410
+ "learning_rate": 0.0001939269789983845,
1411
+ "loss": 0.17,
1412
+ "step": 4700
1413
+ },
1414
+ {
1415
+ "epoch": 1.518578352180937,
1416
+ "eval_loss": 0.17615529894828796,
1417
+ "eval_runtime": 111.7907,
1418
+ "eval_samples_per_second": 169.996,
1419
+ "eval_steps_per_second": 3.542,
1420
+ "step": 4700
1421
+ },
1422
+ {
1423
+ "epoch": 1.5347334410339257,
1424
+ "grad_norm": 0.44357678294181824,
1425
+ "learning_rate": 0.00019386235864297255,
1426
+ "loss": 0.1635,
1427
+ "step": 4750
1428
+ },
1429
+ {
1430
+ "epoch": 1.5347334410339257,
1431
+ "eval_loss": 0.17349159717559814,
1432
+ "eval_runtime": 108.9427,
1433
+ "eval_samples_per_second": 174.44,
1434
+ "eval_steps_per_second": 3.635,
1435
+ "step": 4750
1436
+ },
1437
+ {
1438
+ "epoch": 1.5508885298869144,
1439
+ "grad_norm": 0.4116641581058502,
1440
+ "learning_rate": 0.0001937977382875606,
1441
+ "loss": 0.166,
1442
+ "step": 4800
1443
+ },
1444
+ {
1445
+ "epoch": 1.5508885298869144,
1446
+ "eval_loss": 0.17150762677192688,
1447
+ "eval_runtime": 112.722,
1448
+ "eval_samples_per_second": 168.592,
1449
+ "eval_steps_per_second": 3.513,
1450
+ "step": 4800
1451
+ },
1452
+ {
1453
+ "epoch": 1.567043618739903,
1454
+ "grad_norm": 0.4795362651348114,
1455
+ "learning_rate": 0.00019373311793214864,
1456
+ "loss": 0.1635,
1457
+ "step": 4850
1458
+ },
1459
+ {
1460
+ "epoch": 1.567043618739903,
1461
+ "eval_loss": 0.17290830612182617,
1462
+ "eval_runtime": 109.9865,
1463
+ "eval_samples_per_second": 172.785,
1464
+ "eval_steps_per_second": 3.6,
1465
+ "step": 4850
1466
+ },
1467
+ {
1468
+ "epoch": 1.5831987075928917,
1469
+ "grad_norm": 0.4252488315105438,
1470
+ "learning_rate": 0.00019366849757673668,
1471
+ "loss": 0.1657,
1472
+ "step": 4900
1473
+ },
1474
+ {
1475
+ "epoch": 1.5831987075928917,
1476
+ "eval_loss": 0.1683739274740219,
1477
+ "eval_runtime": 110.2434,
1478
+ "eval_samples_per_second": 172.382,
1479
+ "eval_steps_per_second": 3.592,
1480
+ "step": 4900
1481
+ },
1482
+ {
1483
+ "epoch": 1.5993537964458806,
1484
+ "grad_norm": 0.3322688639163971,
1485
+ "learning_rate": 0.00019360387722132472,
1486
+ "loss": 0.1686,
1487
+ "step": 4950
1488
+ },
1489
+ {
1490
+ "epoch": 1.5993537964458806,
1491
+ "eval_loss": 0.16992688179016113,
1492
+ "eval_runtime": 109.0229,
1493
+ "eval_samples_per_second": 174.312,
1494
+ "eval_steps_per_second": 3.632,
1495
+ "step": 4950
1496
+ },
1497
+ {
1498
+ "epoch": 1.615508885298869,
1499
+ "grad_norm": 0.3991793394088745,
1500
+ "learning_rate": 0.00019353925686591277,
1501
+ "loss": 0.163,
1502
+ "step": 5000
1503
+ },
1504
+ {
1505
+ "epoch": 1.615508885298869,
1506
+ "eval_loss": 0.16902555525302887,
1507
+ "eval_runtime": 120.177,
1508
+ "eval_samples_per_second": 158.133,
1509
+ "eval_steps_per_second": 3.295,
1510
+ "step": 5000
1511
+ },
1512
+ {
1513
+ "epoch": 1.631663974151858,
1514
+ "grad_norm": 0.2803505063056946,
1515
+ "learning_rate": 0.0001934746365105008,
1516
+ "loss": 0.1574,
1517
+ "step": 5050
1518
+ },
1519
+ {
1520
+ "epoch": 1.631663974151858,
1521
+ "eval_loss": 0.17043916881084442,
1522
+ "eval_runtime": 108.4069,
1523
+ "eval_samples_per_second": 175.303,
1524
+ "eval_steps_per_second": 3.653,
1525
+ "step": 5050
1526
+ },
1527
+ {
1528
+ "epoch": 1.6478190630048464,
1529
+ "grad_norm": 0.39425361156463623,
1530
+ "learning_rate": 0.00019341001615508886,
1531
+ "loss": 0.1575,
1532
+ "step": 5100
1533
+ },
1534
+ {
1535
+ "epoch": 1.6478190630048464,
1536
+ "eval_loss": 0.17015740275382996,
1537
+ "eval_runtime": 110.4234,
1538
+ "eval_samples_per_second": 172.101,
1539
+ "eval_steps_per_second": 3.586,
1540
+ "step": 5100
1541
+ },
1542
+ {
1543
+ "epoch": 1.6639741518578353,
1544
+ "grad_norm": 0.3668546676635742,
1545
+ "learning_rate": 0.0001933453957996769,
1546
+ "loss": 0.1697,
1547
+ "step": 5150
1548
+ },
1549
+ {
1550
+ "epoch": 1.6639741518578353,
1551
+ "eval_loss": 0.17117071151733398,
1552
+ "eval_runtime": 131.1419,
1553
+ "eval_samples_per_second": 144.912,
1554
+ "eval_steps_per_second": 3.02,
1555
+ "step": 5150
1556
+ },
1557
+ {
1558
+ "epoch": 1.680129240710824,
1559
+ "grad_norm": 0.4108649790287018,
1560
+ "learning_rate": 0.00019328077544426494,
1561
+ "loss": 0.1637,
1562
+ "step": 5200
1563
+ },
1564
+ {
1565
+ "epoch": 1.680129240710824,
1566
+ "eval_loss": 0.1672579050064087,
1567
+ "eval_runtime": 116.3456,
1568
+ "eval_samples_per_second": 163.341,
1569
+ "eval_steps_per_second": 3.404,
1570
+ "step": 5200
1571
+ },
1572
+ {
1573
+ "epoch": 1.6962843295638126,
1574
+ "grad_norm": 0.3385171592235565,
1575
+ "learning_rate": 0.000193216155088853,
1576
+ "loss": 0.1645,
1577
+ "step": 5250
1578
+ },
1579
+ {
1580
+ "epoch": 1.6962843295638126,
1581
+ "eval_loss": 0.1657264679670334,
1582
+ "eval_runtime": 113.7291,
1583
+ "eval_samples_per_second": 167.099,
1584
+ "eval_steps_per_second": 3.482,
1585
+ "step": 5250
1586
+ },
1587
+ {
1588
+ "epoch": 1.7124394184168013,
1589
+ "grad_norm": 0.3096817135810852,
1590
+ "learning_rate": 0.00019315153473344106,
1591
+ "loss": 0.1637,
1592
+ "step": 5300
1593
+ },
1594
+ {
1595
+ "epoch": 1.7124394184168013,
1596
+ "eval_loss": 0.1663082391023636,
1597
+ "eval_runtime": 112.0716,
1598
+ "eval_samples_per_second": 169.57,
1599
+ "eval_steps_per_second": 3.533,
1600
+ "step": 5300
1601
+ },
1602
+ {
1603
+ "epoch": 1.72859450726979,
1604
+ "grad_norm": 0.34010905027389526,
1605
+ "learning_rate": 0.00019308691437802908,
1606
+ "loss": 0.1625,
1607
+ "step": 5350
1608
+ },
1609
+ {
1610
+ "epoch": 1.72859450726979,
1611
+ "eval_loss": 0.16775698959827423,
1612
+ "eval_runtime": 107.0653,
1613
+ "eval_samples_per_second": 177.499,
1614
+ "eval_steps_per_second": 3.699,
1615
+ "step": 5350
1616
+ },
1617
+ {
1618
+ "epoch": 1.7447495961227788,
1619
+ "grad_norm": 0.34848374128341675,
1620
+ "learning_rate": 0.00019302229402261712,
1621
+ "loss": 0.1611,
1622
+ "step": 5400
1623
+ },
1624
+ {
1625
+ "epoch": 1.7447495961227788,
1626
+ "eval_loss": 0.1658620685338974,
1627
+ "eval_runtime": 114.0524,
1628
+ "eval_samples_per_second": 166.625,
1629
+ "eval_steps_per_second": 3.472,
1630
+ "step": 5400
1631
+ },
1632
+ {
1633
+ "epoch": 1.7609046849757672,
1634
+ "grad_norm": 0.3644295334815979,
1635
+ "learning_rate": 0.0001929576736672052,
1636
+ "loss": 0.1611,
1637
+ "step": 5450
1638
+ },
1639
+ {
1640
+ "epoch": 1.7609046849757672,
1641
+ "eval_loss": 0.1643301099538803,
1642
+ "eval_runtime": 109.7052,
1643
+ "eval_samples_per_second": 173.228,
1644
+ "eval_steps_per_second": 3.61,
1645
+ "step": 5450
1646
+ },
1647
+ {
1648
+ "epoch": 1.7770597738287561,
1649
+ "grad_norm": 0.4286295473575592,
1650
+ "learning_rate": 0.0001928930533117932,
1651
+ "loss": 0.1572,
1652
+ "step": 5500
1653
+ },
1654
+ {
1655
+ "epoch": 1.7770597738287561,
1656
+ "eval_loss": 0.16254638135433197,
1657
+ "eval_runtime": 111.7116,
1658
+ "eval_samples_per_second": 170.117,
1659
+ "eval_steps_per_second": 3.545,
1660
+ "step": 5500
1661
+ },
1662
+ {
1663
+ "epoch": 1.7932148626817448,
1664
+ "grad_norm": 0.3320305645465851,
1665
+ "learning_rate": 0.00019282843295638125,
1666
+ "loss": 0.1588,
1667
+ "step": 5550
1668
+ },
1669
+ {
1670
+ "epoch": 1.7932148626817448,
1671
+ "eval_loss": 0.16255541145801544,
1672
+ "eval_runtime": 113.7525,
1673
+ "eval_samples_per_second": 167.064,
1674
+ "eval_steps_per_second": 3.481,
1675
+ "step": 5550
1676
+ },
1677
+ {
1678
+ "epoch": 1.8093699515347335,
1679
+ "grad_norm": 0.3314014673233032,
1680
+ "learning_rate": 0.00019276381260096933,
1681
+ "loss": 0.1571,
1682
+ "step": 5600
1683
+ },
1684
+ {
1685
+ "epoch": 1.8093699515347335,
1686
+ "eval_loss": 0.16239432990550995,
1687
+ "eval_runtime": 113.2812,
1688
+ "eval_samples_per_second": 167.76,
1689
+ "eval_steps_per_second": 3.496,
1690
+ "step": 5600
1691
+ },
1692
+ {
1693
+ "epoch": 1.8255250403877221,
1694
+ "grad_norm": 0.3537631034851074,
1695
+ "learning_rate": 0.00019269919224555737,
1696
+ "loss": 0.1551,
1697
+ "step": 5650
1698
+ },
1699
+ {
1700
+ "epoch": 1.8255250403877221,
1701
+ "eval_loss": 0.16390350461006165,
1702
+ "eval_runtime": 116.9761,
1703
+ "eval_samples_per_second": 162.461,
1704
+ "eval_steps_per_second": 3.385,
1705
+ "step": 5650
1706
+ },
1707
+ {
1708
+ "epoch": 1.8416801292407108,
1709
+ "grad_norm": 0.38042768836021423,
1710
+ "learning_rate": 0.0001926345718901454,
1711
+ "loss": 0.1521,
1712
+ "step": 5700
1713
+ },
1714
+ {
1715
+ "epoch": 1.8416801292407108,
1716
+ "eval_loss": 0.16476310789585114,
1717
+ "eval_runtime": 110.127,
1718
+ "eval_samples_per_second": 172.564,
1719
+ "eval_steps_per_second": 3.596,
1720
+ "step": 5700
1721
+ },
1722
+ {
1723
+ "epoch": 1.8578352180936997,
1724
+ "grad_norm": 0.29028207063674927,
1725
+ "learning_rate": 0.00019256995153473346,
1726
+ "loss": 0.1583,
1727
+ "step": 5750
1728
+ },
1729
+ {
1730
+ "epoch": 1.8578352180936997,
1731
+ "eval_loss": 0.16223183274269104,
1732
+ "eval_runtime": 108.7981,
1733
+ "eval_samples_per_second": 174.672,
1734
+ "eval_steps_per_second": 3.64,
1735
+ "step": 5750
1736
+ },
1737
+ {
1738
+ "epoch": 1.8739903069466881,
1739
+ "grad_norm": 0.28699570894241333,
1740
+ "learning_rate": 0.0001925053311793215,
1741
+ "loss": 0.1552,
1742
+ "step": 5800
1743
+ },
1744
+ {
1745
+ "epoch": 1.8739903069466881,
1746
+ "eval_loss": 0.16354109346866608,
1747
+ "eval_runtime": 108.9341,
1748
+ "eval_samples_per_second": 174.454,
1749
+ "eval_steps_per_second": 3.635,
1750
+ "step": 5800
1751
+ },
1752
+ {
1753
+ "epoch": 1.890145395799677,
1754
+ "grad_norm": 0.34706467390060425,
1755
+ "learning_rate": 0.00019244071082390952,
1756
+ "loss": 0.1596,
1757
+ "step": 5850
1758
+ },
1759
+ {
1760
+ "epoch": 1.890145395799677,
1761
+ "eval_loss": 0.16145038604736328,
1762
+ "eval_runtime": 111.1377,
1763
+ "eval_samples_per_second": 170.995,
1764
+ "eval_steps_per_second": 3.563,
1765
+ "step": 5850
1766
+ },
1767
+ {
1768
+ "epoch": 1.9063004846526654,
1769
+ "grad_norm": 0.46458888053894043,
1770
+ "learning_rate": 0.0001923760904684976,
1771
+ "loss": 0.1582,
1772
+ "step": 5900
1773
+ },
1774
+ {
1775
+ "epoch": 1.9063004846526654,
1776
+ "eval_loss": 0.165074422955513,
1777
+ "eval_runtime": 111.4361,
1778
+ "eval_samples_per_second": 170.537,
1779
+ "eval_steps_per_second": 3.554,
1780
+ "step": 5900
1781
+ },
1782
+ {
1783
+ "epoch": 1.9224555735056543,
1784
+ "grad_norm": 0.35297635197639465,
1785
+ "learning_rate": 0.00019231147011308564,
1786
+ "loss": 0.157,
1787
+ "step": 5950
1788
+ },
1789
+ {
1790
+ "epoch": 1.9224555735056543,
1791
+ "eval_loss": 0.1603960543870926,
1792
+ "eval_runtime": 114.7674,
1793
+ "eval_samples_per_second": 165.587,
1794
+ "eval_steps_per_second": 3.45,
1795
+ "step": 5950
1796
+ },
1797
+ {
1798
+ "epoch": 1.938610662358643,
1799
+ "grad_norm": 0.4734126031398773,
1800
+ "learning_rate": 0.00019224684975767368,
1801
+ "loss": 0.1531,
1802
+ "step": 6000
1803
+ },
1804
+ {
1805
+ "epoch": 1.938610662358643,
1806
+ "eval_loss": 0.15993832051753998,
1807
+ "eval_runtime": 113.255,
1808
+ "eval_samples_per_second": 167.798,
1809
+ "eval_steps_per_second": 3.497,
1810
+ "step": 6000
1811
+ }
1812
+ ],
1813
+ "logging_steps": 50,
1814
+ "max_steps": 154750,
1815
+ "num_input_tokens_seen": 0,
1816
+ "num_train_epochs": 50,
1817
+ "save_steps": 1000,
1818
+ "stateful_callbacks": {
1819
+ "EarlyStoppingCallback": {
1820
+ "args": {
1821
+ "early_stopping_patience": 10,
1822
+ "early_stopping_threshold": 0.001
1823
+ },
1824
+ "attributes": {
1825
+ "early_stopping_patience_counter": 1
1826
+ }
1827
+ },
1828
+ "TrainerControl": {
1829
+ "args": {
1830
+ "should_epoch_stop": false,
1831
+ "should_evaluate": false,
1832
+ "should_log": false,
1833
+ "should_save": true,
1834
+ "should_training_stop": false
1835
+ },
1836
+ "attributes": {}
1837
+ }
1838
+ },
1839
+ "total_flos": 6.052398008610816e+16,
1840
+ "train_batch_size": 96,
1841
+ "trial_name": null,
1842
+ "trial_params": null
1843
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:168f2162eb27dab75485a9f1ea4ae472ec64e43a60809a43a73aac1ef28bb474
3
+ size 15825