MihaiPopa-1 commited on
Commit
0b261dd
·
verified ·
1 Parent(s): 4f2292a

Upload folder using huggingface_hub

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
2
+ You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
3
+ ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
4
+ ' + message['content'] + '<|im_end|>' + '
5
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
6
+ ' }}{% endif %}
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 576,
13
+ "initializer_range": 0.041666666666666664,
14
+ "intermediate_size": 1536,
15
+ "is_llama_config": true,
16
+ "max_position_embeddings": 8192,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 9,
20
+ "num_hidden_layers": 30,
21
+ "num_key_value_heads": 3,
22
+ "pad_token_id": 2,
23
+ "pretraining_tp": 1,
24
+ "rms_norm_eps": 1e-05,
25
+ "rope_interleaved": false,
26
+ "rope_parameters": {
27
+ "rope_theta": 100000,
28
+ "rope_type": "default"
29
+ },
30
+ "tie_word_embeddings": true,
31
+ "transformers.js_config": {
32
+ "kv_cache_dtype": {
33
+ "fp16": "float16",
34
+ "q4f16": "float16"
35
+ }
36
+ },
37
+ "transformers_version": "5.0.0",
38
+ "use_cache": false,
39
+ "vocab_size": 49152
40
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "5.0.0"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f447a455940a896048f8b2f115d2933867c50f0980b8429722b502e2ce9fde
3
+ size 269060552
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fddf1d3cb89f82e24f5e570a4864a45bb1f887d6974edc5c53c9ce3cca90ad8d
3
+ size 538293323
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4a9f217e852f439efa6bd32fde98d6867f11aa6ea13ddc021ba10af6a0b0934
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ebf781c5a5513927f21974a6d69dc04390db4fd082cbb36d0f415e67d837ef7
3
+ size 1465
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|im_start|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "extra_special_tokens": [
8
+ "<|im_start|>",
9
+ "<|im_end|>"
10
+ ],
11
+ "is_local": false,
12
+ "model_max_length": 8192,
13
+ "pad_token": "<|im_end|>",
14
+ "tokenizer_class": "TokenizersBackend",
15
+ "unk_token": "<|endoftext|>",
16
+ "vocab_size": 49152
17
+ }
trainer_state.json ADDED
@@ -0,0 +1,1994 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2805,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.010703773080010704,
14
+ "grad_norm": 2.640625,
15
+ "learning_rate": 1.993582887700535e-05,
16
+ "loss": 1.5584056854248047,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.02140754616002141,
21
+ "grad_norm": 2.53125,
22
+ "learning_rate": 1.9864527629233515e-05,
23
+ "loss": 1.562470054626465,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.03211131924003211,
28
+ "grad_norm": 2.46875,
29
+ "learning_rate": 1.9793226381461677e-05,
30
+ "loss": 1.5967525482177733,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.04281509232004282,
35
+ "grad_norm": 2.546875,
36
+ "learning_rate": 1.972192513368984e-05,
37
+ "loss": 1.5231587409973144,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.05351886540005352,
42
+ "grad_norm": 2.015625,
43
+ "learning_rate": 1.9650623885918005e-05,
44
+ "loss": 1.4803240776062012,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.06422263848006422,
49
+ "grad_norm": 2.21875,
50
+ "learning_rate": 1.957932263814617e-05,
51
+ "loss": 1.5196543693542481,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.07492641156007493,
56
+ "grad_norm": 2.21875,
57
+ "learning_rate": 1.9508021390374332e-05,
58
+ "loss": 1.4593828201293946,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.08563018464008564,
63
+ "grad_norm": 2.28125,
64
+ "learning_rate": 1.9436720142602497e-05,
65
+ "loss": 1.4797739028930663,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.09633395772009633,
70
+ "grad_norm": 2.078125,
71
+ "learning_rate": 1.9365418894830663e-05,
72
+ "loss": 1.5016543388366699,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.10703773080010703,
77
+ "grad_norm": 2.6875,
78
+ "learning_rate": 1.9294117647058825e-05,
79
+ "loss": 1.4800514221191405,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.11774150388011774,
84
+ "grad_norm": 1.75,
85
+ "learning_rate": 1.922281639928699e-05,
86
+ "loss": 1.445749568939209,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.12844527696012845,
91
+ "grad_norm": 2.0625,
92
+ "learning_rate": 1.9151515151515152e-05,
93
+ "loss": 1.4519201278686524,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.13914905004013914,
98
+ "grad_norm": 2.03125,
99
+ "learning_rate": 1.9080213903743317e-05,
100
+ "loss": 1.3476288795471192,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.14985282312014986,
105
+ "grad_norm": 2.234375,
106
+ "learning_rate": 1.9008912655971482e-05,
107
+ "loss": 1.4554848670959473,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.16055659620016055,
112
+ "grad_norm": 1.9375,
113
+ "learning_rate": 1.8937611408199644e-05,
114
+ "loss": 1.4435239791870118,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.17126036928017127,
119
+ "grad_norm": 1.671875,
120
+ "learning_rate": 1.886631016042781e-05,
121
+ "loss": 1.3767672538757325,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.18196414236018196,
126
+ "grad_norm": 2.09375,
127
+ "learning_rate": 1.8795008912655972e-05,
128
+ "loss": 1.4352334022521973,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.19266791544019266,
133
+ "grad_norm": 2.0,
134
+ "learning_rate": 1.8723707664884137e-05,
135
+ "loss": 1.382822322845459,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.20337168852020338,
140
+ "grad_norm": 1.953125,
141
+ "learning_rate": 1.8652406417112302e-05,
142
+ "loss": 1.429026985168457,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.21407546160021407,
147
+ "grad_norm": 1.859375,
148
+ "learning_rate": 1.8581105169340464e-05,
149
+ "loss": 1.3564122200012207,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.2247792346802248,
154
+ "grad_norm": 1.7109375,
155
+ "learning_rate": 1.850980392156863e-05,
156
+ "loss": 1.457004451751709,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.23548300776023548,
161
+ "grad_norm": 1.9921875,
162
+ "learning_rate": 1.843850267379679e-05,
163
+ "loss": 1.3679749488830566,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.2461867808402462,
168
+ "grad_norm": 2.0625,
169
+ "learning_rate": 1.8367201426024957e-05,
170
+ "loss": 1.4186459541320802,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.2568905539202569,
175
+ "grad_norm": 1.8984375,
176
+ "learning_rate": 1.8295900178253122e-05,
177
+ "loss": 1.3645942687988282,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.2675943270002676,
182
+ "grad_norm": 1.796875,
183
+ "learning_rate": 1.8224598930481284e-05,
184
+ "loss": 1.3659990310668946,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.2782981000802783,
189
+ "grad_norm": 1.9375,
190
+ "learning_rate": 1.815329768270945e-05,
191
+ "loss": 1.3751505851745605,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.289001873160289,
196
+ "grad_norm": 1.96875,
197
+ "learning_rate": 1.808199643493761e-05,
198
+ "loss": 1.394303798675537,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.2997056462402997,
203
+ "grad_norm": 1.71875,
204
+ "learning_rate": 1.8010695187165777e-05,
205
+ "loss": 1.3266244888305665,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.3104094193203104,
210
+ "grad_norm": 1.6328125,
211
+ "learning_rate": 1.7939393939393942e-05,
212
+ "loss": 1.3767006874084473,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.3211131924003211,
217
+ "grad_norm": 1.8359375,
218
+ "learning_rate": 1.7868092691622104e-05,
219
+ "loss": 1.3508996963500977,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.3318169654803318,
224
+ "grad_norm": 1.8984375,
225
+ "learning_rate": 1.779679144385027e-05,
226
+ "loss": 1.299268627166748,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.34252073856034254,
231
+ "grad_norm": 1.6015625,
232
+ "learning_rate": 1.772549019607843e-05,
233
+ "loss": 1.335693073272705,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.35322451164035323,
238
+ "grad_norm": 1.6171875,
239
+ "learning_rate": 1.7654188948306597e-05,
240
+ "loss": 1.3631214141845702,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.3639282847203639,
245
+ "grad_norm": 1.7421875,
246
+ "learning_rate": 1.7582887700534762e-05,
247
+ "loss": 1.349259376525879,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.3746320578003746,
252
+ "grad_norm": 1.8515625,
253
+ "learning_rate": 1.7511586452762924e-05,
254
+ "loss": 1.3234673500061036,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.3853358308803853,
259
+ "grad_norm": 1.734375,
260
+ "learning_rate": 1.744028520499109e-05,
261
+ "loss": 1.34688138961792,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.39603960396039606,
266
+ "grad_norm": 1.796875,
267
+ "learning_rate": 1.736898395721925e-05,
268
+ "loss": 1.310294246673584,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.40674337704040675,
273
+ "grad_norm": 1.5703125,
274
+ "learning_rate": 1.7297682709447417e-05,
275
+ "loss": 1.3146047592163086,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.41744715012041744,
280
+ "grad_norm": 2.078125,
281
+ "learning_rate": 1.7226381461675582e-05,
282
+ "loss": 1.3516902923583984,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.42815092320042814,
287
+ "grad_norm": 1.6875,
288
+ "learning_rate": 1.7155080213903744e-05,
289
+ "loss": 1.3631095886230469,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.4388546962804388,
294
+ "grad_norm": 1.578125,
295
+ "learning_rate": 1.708377896613191e-05,
296
+ "loss": 1.3395885467529296,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.4495584693604496,
301
+ "grad_norm": 1.7890625,
302
+ "learning_rate": 1.701247771836007e-05,
303
+ "loss": 1.322316837310791,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.46026224244046027,
308
+ "grad_norm": 2.03125,
309
+ "learning_rate": 1.6941176470588237e-05,
310
+ "loss": 1.3892762184143066,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.47096601552047096,
315
+ "grad_norm": 1.765625,
316
+ "learning_rate": 1.6869875222816402e-05,
317
+ "loss": 1.3081950187683105,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.48166978860048165,
322
+ "grad_norm": 1.75,
323
+ "learning_rate": 1.6798573975044564e-05,
324
+ "loss": 1.3405800819396974,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.4923735616804924,
329
+ "grad_norm": 1.6171875,
330
+ "learning_rate": 1.672727272727273e-05,
331
+ "loss": 1.3331517219543456,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.5030773347605031,
336
+ "grad_norm": 1.703125,
337
+ "learning_rate": 1.665597147950089e-05,
338
+ "loss": 1.3040351867675781,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.5137811078405138,
343
+ "grad_norm": 1.7265625,
344
+ "learning_rate": 1.6584670231729056e-05,
345
+ "loss": 1.319422149658203,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.5244848809205245,
350
+ "grad_norm": 1.8125,
351
+ "learning_rate": 1.6513368983957222e-05,
352
+ "loss": 1.3433240890502929,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.5351886540005352,
357
+ "grad_norm": 1.7265625,
358
+ "learning_rate": 1.6442067736185384e-05,
359
+ "loss": 1.3346479415893555,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.5458924270805459,
364
+ "grad_norm": 1.5625,
365
+ "learning_rate": 1.637076648841355e-05,
366
+ "loss": 1.3032867431640625,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.5565962001605566,
371
+ "grad_norm": 1.78125,
372
+ "learning_rate": 1.629946524064171e-05,
373
+ "loss": 1.3006314277648925,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.5672999732405672,
378
+ "grad_norm": 1.765625,
379
+ "learning_rate": 1.6228163992869876e-05,
380
+ "loss": 1.3416614532470703,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.578003746320578,
385
+ "grad_norm": 2.015625,
386
+ "learning_rate": 1.615686274509804e-05,
387
+ "loss": 1.303782081604004,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.5887075194005887,
392
+ "grad_norm": 1.578125,
393
+ "learning_rate": 1.6085561497326207e-05,
394
+ "loss": 1.2814931869506836,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.5994112924805994,
399
+ "grad_norm": 1.53125,
400
+ "learning_rate": 1.601426024955437e-05,
401
+ "loss": 1.3404861450195313,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.6101150655606101,
406
+ "grad_norm": 1.7734375,
407
+ "learning_rate": 1.594295900178253e-05,
408
+ "loss": 1.3594398498535156,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.6208188386406208,
413
+ "grad_norm": 1.609375,
414
+ "learning_rate": 1.5871657754010696e-05,
415
+ "loss": 1.2768223762512207,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.6315226117206315,
420
+ "grad_norm": 1.609375,
421
+ "learning_rate": 1.580035650623886e-05,
422
+ "loss": 1.3110815048217774,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.6422263848006422,
427
+ "grad_norm": 1.6640625,
428
+ "learning_rate": 1.5729055258467027e-05,
429
+ "loss": 1.2639217376708984,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.6529301578806529,
434
+ "grad_norm": 1.8203125,
435
+ "learning_rate": 1.565775401069519e-05,
436
+ "loss": 1.3356239318847656,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.6636339309606636,
441
+ "grad_norm": 1.8828125,
442
+ "learning_rate": 1.558645276292335e-05,
443
+ "loss": 1.3733593940734863,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.6743377040406744,
448
+ "grad_norm": 1.5703125,
449
+ "learning_rate": 1.5515151515151516e-05,
450
+ "loss": 1.2768065452575683,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.6850414771206851,
455
+ "grad_norm": 1.7578125,
456
+ "learning_rate": 1.544385026737968e-05,
457
+ "loss": 1.345008659362793,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.6957452502006958,
462
+ "grad_norm": 1.4921875,
463
+ "learning_rate": 1.5372549019607847e-05,
464
+ "loss": 1.2327005386352539,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.7064490232807065,
469
+ "grad_norm": 1.765625,
470
+ "learning_rate": 1.530124777183601e-05,
471
+ "loss": 1.327579879760742,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.7171527963607172,
476
+ "grad_norm": 1.6640625,
477
+ "learning_rate": 1.5229946524064172e-05,
478
+ "loss": 1.2693171501159668,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.7278565694407279,
483
+ "grad_norm": 1.640625,
484
+ "learning_rate": 1.5158645276292336e-05,
485
+ "loss": 1.3229084014892578,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.7385603425207385,
490
+ "grad_norm": 1.7265625,
491
+ "learning_rate": 1.5087344028520501e-05,
492
+ "loss": 1.3010024070739745,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.7492641156007492,
497
+ "grad_norm": 1.59375,
498
+ "learning_rate": 1.5016042780748665e-05,
499
+ "loss": 1.304527473449707,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.7599678886807599,
504
+ "grad_norm": 1.8671875,
505
+ "learning_rate": 1.4944741532976827e-05,
506
+ "loss": 1.2771072387695312,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.7706716617607706,
511
+ "grad_norm": 1.671875,
512
+ "learning_rate": 1.4873440285204992e-05,
513
+ "loss": 1.285037899017334,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.7813754348407814,
518
+ "grad_norm": 1.6171875,
519
+ "learning_rate": 1.4802139037433156e-05,
520
+ "loss": 1.2612761497497558,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.7920792079207921,
525
+ "grad_norm": 1.8125,
526
+ "learning_rate": 1.4730837789661321e-05,
527
+ "loss": 1.3110386848449707,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.8027829810008028,
532
+ "grad_norm": 1.671875,
533
+ "learning_rate": 1.4659536541889485e-05,
534
+ "loss": 1.3450962066650392,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.8134867540808135,
539
+ "grad_norm": 1.796875,
540
+ "learning_rate": 1.4588235294117647e-05,
541
+ "loss": 1.294900608062744,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.8241905271608242,
546
+ "grad_norm": 1.4765625,
547
+ "learning_rate": 1.4516934046345812e-05,
548
+ "loss": 1.3215585708618165,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.8348943002408349,
553
+ "grad_norm": 1.5859375,
554
+ "learning_rate": 1.4445632798573976e-05,
555
+ "loss": 1.3044111251831054,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.8455980733208456,
560
+ "grad_norm": 1.8671875,
561
+ "learning_rate": 1.4374331550802141e-05,
562
+ "loss": 1.3348912239074706,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.8563018464008563,
567
+ "grad_norm": 1.65625,
568
+ "learning_rate": 1.4303030303030305e-05,
569
+ "loss": 1.3434508323669434,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.867005619480867,
574
+ "grad_norm": 1.625,
575
+ "learning_rate": 1.4231729055258467e-05,
576
+ "loss": 1.291652297973633,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.8777093925608777,
581
+ "grad_norm": 1.4921875,
582
+ "learning_rate": 1.4160427807486632e-05,
583
+ "loss": 1.3067720413208008,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.8884131656408885,
588
+ "grad_norm": 1.9453125,
589
+ "learning_rate": 1.4089126559714796e-05,
590
+ "loss": 1.3196195602416991,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.8991169387208992,
595
+ "grad_norm": 1.5390625,
596
+ "learning_rate": 1.4017825311942961e-05,
597
+ "loss": 1.3129652976989745,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.9098207118009098,
602
+ "grad_norm": 1.5,
603
+ "learning_rate": 1.3946524064171123e-05,
604
+ "loss": 1.2702789306640625,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.9205244848809205,
609
+ "grad_norm": 1.9453125,
610
+ "learning_rate": 1.3875222816399288e-05,
611
+ "loss": 1.29964599609375,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.9312282579609312,
616
+ "grad_norm": 1.5703125,
617
+ "learning_rate": 1.3803921568627452e-05,
618
+ "loss": 1.301185131072998,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.9419320310409419,
623
+ "grad_norm": 1.4140625,
624
+ "learning_rate": 1.3732620320855616e-05,
625
+ "loss": 1.2731993675231934,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.9526358041209526,
630
+ "grad_norm": 1.421875,
631
+ "learning_rate": 1.3661319073083781e-05,
632
+ "loss": 1.2806821823120118,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.9633395772009633,
637
+ "grad_norm": 1.84375,
638
+ "learning_rate": 1.3590017825311943e-05,
639
+ "loss": 1.2375809669494628,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.974043350280974,
644
+ "grad_norm": 1.8046875,
645
+ "learning_rate": 1.3518716577540108e-05,
646
+ "loss": 1.2453808784484863,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.9847471233609848,
651
+ "grad_norm": 2.03125,
652
+ "learning_rate": 1.3447415329768272e-05,
653
+ "loss": 1.3074142456054687,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.9954508964409955,
658
+ "grad_norm": 1.484375,
659
+ "learning_rate": 1.3376114081996437e-05,
660
+ "loss": 1.2914584159851075,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 1.0053518865400053,
665
+ "grad_norm": 2.0625,
666
+ "learning_rate": 1.33048128342246e-05,
667
+ "loss": 1.3543176651000977,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 1.016055659620016,
672
+ "grad_norm": 1.765625,
673
+ "learning_rate": 1.3233511586452763e-05,
674
+ "loss": 1.3298683166503906,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 1.0267594327000267,
679
+ "grad_norm": 1.59375,
680
+ "learning_rate": 1.3162210338680928e-05,
681
+ "loss": 1.3020204544067382,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 1.0374632057800375,
686
+ "grad_norm": 1.6484375,
687
+ "learning_rate": 1.3090909090909092e-05,
688
+ "loss": 1.3046648025512695,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 1.048166978860048,
693
+ "grad_norm": 1.65625,
694
+ "learning_rate": 1.3019607843137257e-05,
695
+ "loss": 1.2308432579040527,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 1.0588707519400589,
700
+ "grad_norm": 1.65625,
701
+ "learning_rate": 1.294830659536542e-05,
702
+ "loss": 1.2811461448669434,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 1.0695745250200697,
707
+ "grad_norm": 1.6640625,
708
+ "learning_rate": 1.2877005347593583e-05,
709
+ "loss": 1.3090335845947265,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 1.0802782981000802,
714
+ "grad_norm": 1.9921875,
715
+ "learning_rate": 1.2805704099821748e-05,
716
+ "loss": 1.2958572387695313,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 1.090982071180091,
721
+ "grad_norm": 1.6953125,
722
+ "learning_rate": 1.2734402852049912e-05,
723
+ "loss": 1.326209259033203,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 1.1016858442601016,
728
+ "grad_norm": 1.75,
729
+ "learning_rate": 1.2663101604278077e-05,
730
+ "loss": 1.2520675659179688,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 1.1123896173401124,
735
+ "grad_norm": 1.546875,
736
+ "learning_rate": 1.259180035650624e-05,
737
+ "loss": 1.3478898048400878,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 1.123093390420123,
742
+ "grad_norm": 1.4921875,
743
+ "learning_rate": 1.2520499108734403e-05,
744
+ "loss": 1.2806931495666505,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 1.1337971635001338,
749
+ "grad_norm": 1.75,
750
+ "learning_rate": 1.2449197860962568e-05,
751
+ "loss": 1.2603809356689453,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 1.1445009365801444,
756
+ "grad_norm": 1.484375,
757
+ "learning_rate": 1.2377896613190731e-05,
758
+ "loss": 1.2837313652038573,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 1.1552047096601552,
763
+ "grad_norm": 1.859375,
764
+ "learning_rate": 1.2306595365418897e-05,
765
+ "loss": 1.271355152130127,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 1.165908482740166,
770
+ "grad_norm": 1.6015625,
771
+ "learning_rate": 1.223529411764706e-05,
772
+ "loss": 1.2751256942749023,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 1.1766122558201766,
777
+ "grad_norm": 1.5390625,
778
+ "learning_rate": 1.2163992869875222e-05,
779
+ "loss": 1.2217981338500976,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 1.1873160289001874,
784
+ "grad_norm": 1.46875,
785
+ "learning_rate": 1.2092691622103388e-05,
786
+ "loss": 1.3460000038146973,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 1.198019801980198,
791
+ "grad_norm": 1.6328125,
792
+ "learning_rate": 1.2021390374331551e-05,
793
+ "loss": 1.3119497299194336,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 1.2087235750602088,
798
+ "grad_norm": 1.5,
799
+ "learning_rate": 1.1950089126559717e-05,
800
+ "loss": 1.326594066619873,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 1.2194273481402194,
805
+ "grad_norm": 1.8671875,
806
+ "learning_rate": 1.187878787878788e-05,
807
+ "loss": 1.313736343383789,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 1.2301311212202302,
812
+ "grad_norm": 1.625,
813
+ "learning_rate": 1.1807486631016042e-05,
814
+ "loss": 1.2580394744873047,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 1.2408348943002407,
819
+ "grad_norm": 1.6953125,
820
+ "learning_rate": 1.1736185383244208e-05,
821
+ "loss": 1.3472198486328124,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 1.2515386673802515,
826
+ "grad_norm": 1.6875,
827
+ "learning_rate": 1.1664884135472371e-05,
828
+ "loss": 1.3223270416259765,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 1.2622424404602621,
833
+ "grad_norm": 1.5390625,
834
+ "learning_rate": 1.1593582887700537e-05,
835
+ "loss": 1.3479475021362304,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 1.272946213540273,
840
+ "grad_norm": 1.46875,
841
+ "learning_rate": 1.15222816399287e-05,
842
+ "loss": 1.2691156387329101,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 1.2836499866202837,
847
+ "grad_norm": 1.578125,
848
+ "learning_rate": 1.1450980392156862e-05,
849
+ "loss": 1.3078096389770508,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 1.2943537597002943,
854
+ "grad_norm": 1.7265625,
855
+ "learning_rate": 1.1379679144385028e-05,
856
+ "loss": 1.2821264266967773,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 1.3050575327803051,
861
+ "grad_norm": 1.6015625,
862
+ "learning_rate": 1.1308377896613191e-05,
863
+ "loss": 1.2466256141662597,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 1.3157613058603157,
868
+ "grad_norm": 1.84375,
869
+ "learning_rate": 1.1237076648841357e-05,
870
+ "loss": 1.301154327392578,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 1.3264650789403265,
875
+ "grad_norm": 1.6171875,
876
+ "learning_rate": 1.116577540106952e-05,
877
+ "loss": 1.3058858871459962,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 1.337168852020337,
882
+ "grad_norm": 1.6875,
883
+ "learning_rate": 1.1094474153297684e-05,
884
+ "loss": 1.257982349395752,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 1.3478726251003479,
889
+ "grad_norm": 1.546875,
890
+ "learning_rate": 1.1023172905525847e-05,
891
+ "loss": 1.278379535675049,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 1.3585763981803587,
896
+ "grad_norm": 1.890625,
897
+ "learning_rate": 1.0951871657754011e-05,
898
+ "loss": 1.2998493194580079,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 1.3692801712603693,
903
+ "grad_norm": 1.5859375,
904
+ "learning_rate": 1.0880570409982176e-05,
905
+ "loss": 1.3042527198791505,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 1.3799839443403799,
910
+ "grad_norm": 1.6015625,
911
+ "learning_rate": 1.0809269162210338e-05,
912
+ "loss": 1.2903579711914062,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 1.3906877174203907,
917
+ "grad_norm": 1.5390625,
918
+ "learning_rate": 1.0737967914438504e-05,
919
+ "loss": 1.216090202331543,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 1.4013914905004015,
924
+ "grad_norm": 1.4296875,
925
+ "learning_rate": 1.0666666666666667e-05,
926
+ "loss": 1.2497664451599122,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 1.412095263580412,
931
+ "grad_norm": 1.609375,
932
+ "learning_rate": 1.0595365418894833e-05,
933
+ "loss": 1.2592049598693849,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 1.4227990366604228,
938
+ "grad_norm": 1.703125,
939
+ "learning_rate": 1.0524064171122996e-05,
940
+ "loss": 1.3062689781188965,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 1.4335028097404334,
945
+ "grad_norm": 1.515625,
946
+ "learning_rate": 1.0452762923351158e-05,
947
+ "loss": 1.2577032089233398,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 1.4442065828204442,
952
+ "grad_norm": 1.75,
953
+ "learning_rate": 1.0381461675579324e-05,
954
+ "loss": 1.2874650001525878,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 1.4549103559004548,
959
+ "grad_norm": 2.0625,
960
+ "learning_rate": 1.0310160427807487e-05,
961
+ "loss": 1.2887776374816895,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 1.4656141289804656,
966
+ "grad_norm": 1.59375,
967
+ "learning_rate": 1.0238859180035653e-05,
968
+ "loss": 1.2869946479797363,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 1.4763179020604764,
973
+ "grad_norm": 1.7421875,
974
+ "learning_rate": 1.0167557932263816e-05,
975
+ "loss": 1.3055774688720703,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 1.487021675140487,
980
+ "grad_norm": 1.7421875,
981
+ "learning_rate": 1.0096256684491978e-05,
982
+ "loss": 1.2925223350524901,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 1.4977254482204978,
987
+ "grad_norm": 1.5390625,
988
+ "learning_rate": 1.0024955436720143e-05,
989
+ "loss": 1.3624143600463867,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 1.5084292213005084,
994
+ "grad_norm": 1.7265625,
995
+ "learning_rate": 9.953654188948307e-06,
996
+ "loss": 1.3100957870483398,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 1.5191329943805192,
1001
+ "grad_norm": 1.734375,
1002
+ "learning_rate": 9.882352941176472e-06,
1003
+ "loss": 1.2667318344116212,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 1.5298367674605298,
1008
+ "grad_norm": 1.5,
1009
+ "learning_rate": 9.811051693404634e-06,
1010
+ "loss": 1.2964338302612304,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 1.5405405405405406,
1015
+ "grad_norm": 1.703125,
1016
+ "learning_rate": 9.7397504456328e-06,
1017
+ "loss": 1.2451062202453613,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 1.5512443136205514,
1022
+ "grad_norm": 1.6484375,
1023
+ "learning_rate": 9.668449197860963e-06,
1024
+ "loss": 1.2622719764709474,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 1.561948086700562,
1029
+ "grad_norm": 1.8359375,
1030
+ "learning_rate": 9.597147950089127e-06,
1031
+ "loss": 1.2830778121948243,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 1.5726518597805725,
1036
+ "grad_norm": 1.5390625,
1037
+ "learning_rate": 9.525846702317292e-06,
1038
+ "loss": 1.3212904930114746,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 1.5833556328605833,
1043
+ "grad_norm": 1.515625,
1044
+ "learning_rate": 9.454545454545456e-06,
1045
+ "loss": 1.301555347442627,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 1.5940594059405941,
1050
+ "grad_norm": 1.453125,
1051
+ "learning_rate": 9.38324420677362e-06,
1052
+ "loss": 1.2626118659973145,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 1.6047631790206047,
1057
+ "grad_norm": 1.890625,
1058
+ "learning_rate": 9.311942959001783e-06,
1059
+ "loss": 1.2342555046081543,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 1.6154669521006153,
1064
+ "grad_norm": 1.7421875,
1065
+ "learning_rate": 9.240641711229947e-06,
1066
+ "loss": 1.3167900085449218,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 1.6261707251806263,
1071
+ "grad_norm": 2.0,
1072
+ "learning_rate": 9.169340463458112e-06,
1073
+ "loss": 1.296627902984619,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 1.636874498260637,
1078
+ "grad_norm": 1.4453125,
1079
+ "learning_rate": 9.098039215686276e-06,
1080
+ "loss": 1.275075340270996,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 1.6475782713406475,
1085
+ "grad_norm": 1.53125,
1086
+ "learning_rate": 9.02673796791444e-06,
1087
+ "loss": 1.2771642684936524,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 1.6582820444206583,
1092
+ "grad_norm": 1.8046875,
1093
+ "learning_rate": 8.955436720142603e-06,
1094
+ "loss": 1.2907758712768556,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 1.668985817500669,
1099
+ "grad_norm": 1.6953125,
1100
+ "learning_rate": 8.884135472370767e-06,
1101
+ "loss": 1.2778194427490235,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 1.6796895905806797,
1106
+ "grad_norm": 1.5859375,
1107
+ "learning_rate": 8.81283422459893e-06,
1108
+ "loss": 1.2820199012756348,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 1.6903933636606903,
1113
+ "grad_norm": 1.8984375,
1114
+ "learning_rate": 8.741532976827096e-06,
1115
+ "loss": 1.3197799682617188,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 1.701097136740701,
1120
+ "grad_norm": 1.796875,
1121
+ "learning_rate": 8.67023172905526e-06,
1122
+ "loss": 1.2711196899414063,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 1.7118009098207119,
1127
+ "grad_norm": 1.7265625,
1128
+ "learning_rate": 8.598930481283423e-06,
1129
+ "loss": 1.3094602584838868,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 1.7225046829007225,
1134
+ "grad_norm": 1.65625,
1135
+ "learning_rate": 8.527629233511587e-06,
1136
+ "loss": 1.3037433624267578,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 1.7332084559807333,
1141
+ "grad_norm": 1.6484375,
1142
+ "learning_rate": 8.45632798573975e-06,
1143
+ "loss": 1.2734570503234863,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 1.743912229060744,
1148
+ "grad_norm": 1.5703125,
1149
+ "learning_rate": 8.385026737967916e-06,
1150
+ "loss": 1.2476407051086427,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 1.7546160021407546,
1155
+ "grad_norm": 1.7265625,
1156
+ "learning_rate": 8.31372549019608e-06,
1157
+ "loss": 1.3427558898925782,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 1.7653197752207652,
1162
+ "grad_norm": 1.65625,
1163
+ "learning_rate": 8.242424242424243e-06,
1164
+ "loss": 1.273496437072754,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 1.776023548300776,
1169
+ "grad_norm": 1.71875,
1170
+ "learning_rate": 8.171122994652407e-06,
1171
+ "loss": 1.2626665115356446,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 1.7867273213807868,
1176
+ "grad_norm": 1.8046875,
1177
+ "learning_rate": 8.09982174688057e-06,
1178
+ "loss": 1.2670047760009766,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 1.7974310944607974,
1183
+ "grad_norm": 1.7265625,
1184
+ "learning_rate": 8.028520499108736e-06,
1185
+ "loss": 1.349191665649414,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 1.808134867540808,
1190
+ "grad_norm": 1.578125,
1191
+ "learning_rate": 7.9572192513369e-06,
1192
+ "loss": 1.2989972114562989,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 1.8188386406208188,
1197
+ "grad_norm": 1.6875,
1198
+ "learning_rate": 7.885918003565063e-06,
1199
+ "loss": 1.1850922584533692,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 1.8295424137008296,
1204
+ "grad_norm": 1.953125,
1205
+ "learning_rate": 7.814616755793228e-06,
1206
+ "loss": 1.3360312461853028,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 1.8402461867808402,
1211
+ "grad_norm": 1.6328125,
1212
+ "learning_rate": 7.74331550802139e-06,
1213
+ "loss": 1.2957257270812987,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 1.850949959860851,
1218
+ "grad_norm": 1.6015625,
1219
+ "learning_rate": 7.672014260249555e-06,
1220
+ "loss": 1.2536530494689941,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 1.8616537329408618,
1225
+ "grad_norm": 1.7109375,
1226
+ "learning_rate": 7.60071301247772e-06,
1227
+ "loss": 1.2660930633544922,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 1.8723575060208724,
1232
+ "grad_norm": 1.7265625,
1233
+ "learning_rate": 7.529411764705883e-06,
1234
+ "loss": 1.3080876350402832,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 1.883061279100883,
1239
+ "grad_norm": 1.640625,
1240
+ "learning_rate": 7.458110516934047e-06,
1241
+ "loss": 1.3132406234741212,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 1.8937650521808937,
1246
+ "grad_norm": 1.8203125,
1247
+ "learning_rate": 7.386809269162211e-06,
1248
+ "loss": 1.31253080368042,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 1.9044688252609046,
1253
+ "grad_norm": 1.4140625,
1254
+ "learning_rate": 7.315508021390375e-06,
1255
+ "loss": 1.3013240814208984,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 1.9151725983409151,
1260
+ "grad_norm": 1.4765625,
1261
+ "learning_rate": 7.244206773618538e-06,
1262
+ "loss": 1.2744117736816407,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 1.9258763714209257,
1267
+ "grad_norm": 1.4609375,
1268
+ "learning_rate": 7.172905525846703e-06,
1269
+ "loss": 1.3057758331298828,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 1.9365801445009367,
1274
+ "grad_norm": 1.515625,
1275
+ "learning_rate": 7.101604278074867e-06,
1276
+ "loss": 1.224927043914795,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 1.9472839175809473,
1281
+ "grad_norm": 1.7109375,
1282
+ "learning_rate": 7.030303030303031e-06,
1283
+ "loss": 1.3182221412658692,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 1.957987690660958,
1288
+ "grad_norm": 1.5546875,
1289
+ "learning_rate": 6.959001782531195e-06,
1290
+ "loss": 1.2400826454162597,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 1.9686914637409687,
1295
+ "grad_norm": 1.7421875,
1296
+ "learning_rate": 6.887700534759358e-06,
1297
+ "loss": 1.2463386535644532,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 1.9793952368209795,
1302
+ "grad_norm": 1.46875,
1303
+ "learning_rate": 6.8163992869875225e-06,
1304
+ "loss": 1.3235528945922852,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 1.99009900990099,
1309
+ "grad_norm": 1.8203125,
1310
+ "learning_rate": 6.745098039215687e-06,
1311
+ "loss": 1.2812946319580079,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 2.0,
1316
+ "grad_norm": 4.1875,
1317
+ "learning_rate": 6.673796791443851e-06,
1318
+ "loss": 1.2953272819519044,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 2.0107037730800106,
1323
+ "grad_norm": 1.5390625,
1324
+ "learning_rate": 6.602495543672015e-06,
1325
+ "loss": 1.207719612121582,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 2.0214075461600216,
1330
+ "grad_norm": 1.671875,
1331
+ "learning_rate": 6.531194295900179e-06,
1332
+ "loss": 1.2520846366882323,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 2.032111319240032,
1337
+ "grad_norm": 1.703125,
1338
+ "learning_rate": 6.459893048128343e-06,
1339
+ "loss": 1.2905988693237305,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 2.0428150923200428,
1344
+ "grad_norm": 1.921875,
1345
+ "learning_rate": 6.388591800356507e-06,
1346
+ "loss": 1.3520148277282715,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 2.0535188654000534,
1351
+ "grad_norm": 1.8515625,
1352
+ "learning_rate": 6.3172905525846705e-06,
1353
+ "loss": 1.2911107063293457,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 2.0642226384800644,
1358
+ "grad_norm": 1.671875,
1359
+ "learning_rate": 6.245989304812835e-06,
1360
+ "loss": 1.2403117179870606,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 2.074926411560075,
1365
+ "grad_norm": 1.96875,
1366
+ "learning_rate": 6.174688057040999e-06,
1367
+ "loss": 1.3558055877685546,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 2.0856301846400855,
1372
+ "grad_norm": 1.6328125,
1373
+ "learning_rate": 6.103386809269163e-06,
1374
+ "loss": 1.3194045066833495,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 2.096333957720096,
1379
+ "grad_norm": 2.140625,
1380
+ "learning_rate": 6.032085561497326e-06,
1381
+ "loss": 1.3321582794189453,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 2.107037730800107,
1386
+ "grad_norm": 1.625,
1387
+ "learning_rate": 5.96078431372549e-06,
1388
+ "loss": 1.288839054107666,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 2.1177415038801177,
1393
+ "grad_norm": 2.0,
1394
+ "learning_rate": 5.889483065953655e-06,
1395
+ "loss": 1.3260244369506835,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 2.1284452769601283,
1400
+ "grad_norm": 1.5703125,
1401
+ "learning_rate": 5.8181818181818185e-06,
1402
+ "loss": 1.2721702575683593,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 2.1391490500401393,
1407
+ "grad_norm": 1.6640625,
1408
+ "learning_rate": 5.746880570409983e-06,
1409
+ "loss": 1.2622364044189454,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 2.14985282312015,
1414
+ "grad_norm": 1.609375,
1415
+ "learning_rate": 5.675579322638146e-06,
1416
+ "loss": 1.30474796295166,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 2.1605565962001605,
1421
+ "grad_norm": 1.7890625,
1422
+ "learning_rate": 5.60427807486631e-06,
1423
+ "loss": 1.3109374046325684,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 2.171260369280171,
1428
+ "grad_norm": 1.71875,
1429
+ "learning_rate": 5.532976827094475e-06,
1430
+ "loss": 1.3231799125671386,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 2.181964142360182,
1435
+ "grad_norm": 1.796875,
1436
+ "learning_rate": 5.4616755793226384e-06,
1437
+ "loss": 1.2993489265441895,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 2.1926679154401927,
1442
+ "grad_norm": 1.875,
1443
+ "learning_rate": 5.390374331550803e-06,
1444
+ "loss": 1.3044631958007813,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 2.2033716885202033,
1449
+ "grad_norm": 1.4609375,
1450
+ "learning_rate": 5.3190730837789666e-06,
1451
+ "loss": 1.2702978134155274,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 2.2140754616002143,
1456
+ "grad_norm": 1.5546875,
1457
+ "learning_rate": 5.24777183600713e-06,
1458
+ "loss": 1.287952709197998,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 2.224779234680225,
1463
+ "grad_norm": 1.5546875,
1464
+ "learning_rate": 5.176470588235295e-06,
1465
+ "loss": 1.2974214553833008,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 2.2354830077602355,
1470
+ "grad_norm": 1.703125,
1471
+ "learning_rate": 5.105169340463458e-06,
1472
+ "loss": 1.3148197174072265,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 2.246186780840246,
1477
+ "grad_norm": 1.6328125,
1478
+ "learning_rate": 5.033868092691623e-06,
1479
+ "loss": 1.3466445922851562,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 2.256890553920257,
1484
+ "grad_norm": 1.484375,
1485
+ "learning_rate": 4.9625668449197864e-06,
1486
+ "loss": 1.334506893157959,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 2.2675943270002676,
1491
+ "grad_norm": 1.9765625,
1492
+ "learning_rate": 4.891265597147951e-06,
1493
+ "loss": 1.279165267944336,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 2.278298100080278,
1498
+ "grad_norm": 1.890625,
1499
+ "learning_rate": 4.8199643493761146e-06,
1500
+ "loss": 1.2512639045715332,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 2.289001873160289,
1505
+ "grad_norm": 1.59375,
1506
+ "learning_rate": 4.748663101604278e-06,
1507
+ "loss": 1.2572649002075196,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 2.2997056462403,
1512
+ "grad_norm": 1.5625,
1513
+ "learning_rate": 4.677361853832442e-06,
1514
+ "loss": 1.2503036499023437,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 2.3104094193203104,
1519
+ "grad_norm": 1.859375,
1520
+ "learning_rate": 4.606060606060606e-06,
1521
+ "loss": 1.2866994857788085,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 2.321113192400321,
1526
+ "grad_norm": 1.6171875,
1527
+ "learning_rate": 4.534759358288771e-06,
1528
+ "loss": 1.2810638427734375,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 2.331816965480332,
1533
+ "grad_norm": 1.71875,
1534
+ "learning_rate": 4.4634581105169345e-06,
1535
+ "loss": 1.2588828086853028,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 2.3425207385603426,
1540
+ "grad_norm": 1.6875,
1541
+ "learning_rate": 4.392156862745098e-06,
1542
+ "loss": 1.2615557670593263,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 2.353224511640353,
1547
+ "grad_norm": 1.921875,
1548
+ "learning_rate": 4.320855614973263e-06,
1549
+ "loss": 1.2974510192871094,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 2.3639282847203638,
1554
+ "grad_norm": 1.78125,
1555
+ "learning_rate": 4.249554367201426e-06,
1556
+ "loss": 1.303697681427002,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 2.374632057800375,
1561
+ "grad_norm": 1.765625,
1562
+ "learning_rate": 4.178253119429591e-06,
1563
+ "loss": 1.303341007232666,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 2.3853358308803854,
1568
+ "grad_norm": 1.46875,
1569
+ "learning_rate": 4.106951871657754e-06,
1570
+ "loss": 1.306796932220459,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 2.396039603960396,
1575
+ "grad_norm": 1.828125,
1576
+ "learning_rate": 4.035650623885918e-06,
1577
+ "loss": 1.3068408012390136,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 2.4067433770404065,
1582
+ "grad_norm": 1.671875,
1583
+ "learning_rate": 3.9643493761140825e-06,
1584
+ "loss": 1.307657527923584,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 2.4174471501204176,
1589
+ "grad_norm": 1.75,
1590
+ "learning_rate": 3.893048128342246e-06,
1591
+ "loss": 1.2932353973388673,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 2.428150923200428,
1596
+ "grad_norm": 1.765625,
1597
+ "learning_rate": 3.821746880570411e-06,
1598
+ "loss": 1.2625031471252441,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 2.4388546962804387,
1603
+ "grad_norm": 1.703125,
1604
+ "learning_rate": 3.7504456327985743e-06,
1605
+ "loss": 1.3354209899902343,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 2.4495584693604497,
1610
+ "grad_norm": 1.3671875,
1611
+ "learning_rate": 3.6791443850267383e-06,
1612
+ "loss": 1.200312042236328,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 2.4602622424404603,
1617
+ "grad_norm": 1.6484375,
1618
+ "learning_rate": 3.6078431372549024e-06,
1619
+ "loss": 1.2868337631225586,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 2.470966015520471,
1624
+ "grad_norm": 1.78125,
1625
+ "learning_rate": 3.536541889483066e-06,
1626
+ "loss": 1.2731021881103515,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 2.4816697886004815,
1631
+ "grad_norm": 2.265625,
1632
+ "learning_rate": 3.46524064171123e-06,
1633
+ "loss": 1.3145703315734862,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 2.4923735616804925,
1638
+ "grad_norm": 1.671875,
1639
+ "learning_rate": 3.3939393939393946e-06,
1640
+ "loss": 1.305215549468994,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 2.503077334760503,
1645
+ "grad_norm": 1.734375,
1646
+ "learning_rate": 3.322638146167558e-06,
1647
+ "loss": 1.3567096710205078,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 2.5137811078405137,
1652
+ "grad_norm": 1.6015625,
1653
+ "learning_rate": 3.2513368983957223e-06,
1654
+ "loss": 1.3081507682800293,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 2.5244848809205243,
1659
+ "grad_norm": 1.7109375,
1660
+ "learning_rate": 3.180035650623886e-06,
1661
+ "loss": 1.300461483001709,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 2.5351886540005353,
1666
+ "grad_norm": 1.5859375,
1667
+ "learning_rate": 3.10873440285205e-06,
1668
+ "loss": 1.3006972312927245,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 2.545892427080546,
1673
+ "grad_norm": 1.734375,
1674
+ "learning_rate": 3.0374331550802145e-06,
1675
+ "loss": 1.3157925605773926,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 2.5565962001605564,
1680
+ "grad_norm": 1.5625,
1681
+ "learning_rate": 2.966131907308378e-06,
1682
+ "loss": 1.2608634948730468,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 2.5672999732405675,
1687
+ "grad_norm": 1.765625,
1688
+ "learning_rate": 2.894830659536542e-06,
1689
+ "loss": 1.237275981903076,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 2.578003746320578,
1694
+ "grad_norm": 1.5859375,
1695
+ "learning_rate": 2.8235294117647062e-06,
1696
+ "loss": 1.274481964111328,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 2.5887075194005886,
1701
+ "grad_norm": 1.65625,
1702
+ "learning_rate": 2.75222816399287e-06,
1703
+ "loss": 1.3146997451782227,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 2.5994112924805997,
1708
+ "grad_norm": 1.6640625,
1709
+ "learning_rate": 2.680926916221034e-06,
1710
+ "loss": 1.3125761032104493,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 2.6101150655606102,
1715
+ "grad_norm": 1.6953125,
1716
+ "learning_rate": 2.6096256684491984e-06,
1717
+ "loss": 1.2919845581054688,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 2.620818838640621,
1722
+ "grad_norm": 1.6484375,
1723
+ "learning_rate": 2.538324420677362e-06,
1724
+ "loss": 1.2364542961120606,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 2.6315226117206314,
1729
+ "grad_norm": 1.609375,
1730
+ "learning_rate": 2.467023172905526e-06,
1731
+ "loss": 1.2624409675598145,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 2.642226384800642,
1736
+ "grad_norm": 1.5859375,
1737
+ "learning_rate": 2.3957219251336898e-06,
1738
+ "loss": 1.3075796127319337,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 2.652930157880653,
1743
+ "grad_norm": 2.109375,
1744
+ "learning_rate": 2.3244206773618542e-06,
1745
+ "loss": 1.2835824012756347,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 2.6636339309606636,
1750
+ "grad_norm": 1.8359375,
1751
+ "learning_rate": 2.253119429590018e-06,
1752
+ "loss": 1.307276153564453,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 2.674337704040674,
1757
+ "grad_norm": 1.7421875,
1758
+ "learning_rate": 2.181818181818182e-06,
1759
+ "loss": 1.2622486114501954,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 2.685041477120685,
1764
+ "grad_norm": 1.6796875,
1765
+ "learning_rate": 2.110516934046346e-06,
1766
+ "loss": 1.2514682769775392,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 2.6957452502006958,
1771
+ "grad_norm": 1.5625,
1772
+ "learning_rate": 2.03921568627451e-06,
1773
+ "loss": 1.2614849090576172,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 2.7064490232807064,
1778
+ "grad_norm": 1.5390625,
1779
+ "learning_rate": 1.9679144385026737e-06,
1780
+ "loss": 1.3241849899291993,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 2.7171527963607174,
1785
+ "grad_norm": 1.7890625,
1786
+ "learning_rate": 1.896613190730838e-06,
1787
+ "loss": 1.2841781616210937,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 2.727856569440728,
1792
+ "grad_norm": 1.8515625,
1793
+ "learning_rate": 1.8253119429590018e-06,
1794
+ "loss": 1.3062438011169433,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 2.7385603425207385,
1799
+ "grad_norm": 1.6328125,
1800
+ "learning_rate": 1.7540106951871661e-06,
1801
+ "loss": 1.3065251350402831,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 2.749264115600749,
1806
+ "grad_norm": 1.65625,
1807
+ "learning_rate": 1.68270944741533e-06,
1808
+ "loss": 1.2980451583862305,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 2.7599678886807597,
1813
+ "grad_norm": 1.671875,
1814
+ "learning_rate": 1.6114081996434938e-06,
1815
+ "loss": 1.2766281127929688,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 2.7706716617607707,
1820
+ "grad_norm": 1.5859375,
1821
+ "learning_rate": 1.5401069518716579e-06,
1822
+ "loss": 1.3033970832824706,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 2.7813754348407813,
1827
+ "grad_norm": 1.4921875,
1828
+ "learning_rate": 1.468805704099822e-06,
1829
+ "loss": 1.2335359573364257,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 2.792079207920792,
1834
+ "grad_norm": 1.6484375,
1835
+ "learning_rate": 1.3975044563279858e-06,
1836
+ "loss": 1.3184511184692382,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 2.802782981000803,
1841
+ "grad_norm": 1.609375,
1842
+ "learning_rate": 1.3262032085561499e-06,
1843
+ "loss": 1.1845362663269043,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 2.8134867540808135,
1848
+ "grad_norm": 1.7890625,
1849
+ "learning_rate": 1.2549019607843137e-06,
1850
+ "loss": 1.2506700515747071,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 2.824190527160824,
1855
+ "grad_norm": 1.7109375,
1856
+ "learning_rate": 1.1836007130124778e-06,
1857
+ "loss": 1.2360112190246582,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 2.834894300240835,
1862
+ "grad_norm": 1.703125,
1863
+ "learning_rate": 1.1122994652406418e-06,
1864
+ "loss": 1.2875761032104491,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 2.8455980733208457,
1869
+ "grad_norm": 1.578125,
1870
+ "learning_rate": 1.0409982174688057e-06,
1871
+ "loss": 1.2473506927490234,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 2.8563018464008563,
1876
+ "grad_norm": 1.546875,
1877
+ "learning_rate": 9.696969696969698e-07,
1878
+ "loss": 1.3208060264587402,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 2.867005619480867,
1883
+ "grad_norm": 1.671875,
1884
+ "learning_rate": 8.983957219251338e-07,
1885
+ "loss": 1.3371116638183593,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 2.8777093925608774,
1890
+ "grad_norm": 1.484375,
1891
+ "learning_rate": 8.270944741532977e-07,
1892
+ "loss": 1.2605000495910645,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 2.8884131656408885,
1897
+ "grad_norm": 1.6015625,
1898
+ "learning_rate": 7.557932263814617e-07,
1899
+ "loss": 1.267725658416748,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 2.899116938720899,
1904
+ "grad_norm": 1.640625,
1905
+ "learning_rate": 6.844919786096257e-07,
1906
+ "loss": 1.27689208984375,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 2.9098207118009096,
1911
+ "grad_norm": 1.578125,
1912
+ "learning_rate": 6.131907308377896e-07,
1913
+ "loss": 1.286923885345459,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 2.9205244848809206,
1918
+ "grad_norm": 1.765625,
1919
+ "learning_rate": 5.418894830659537e-07,
1920
+ "loss": 1.330905055999756,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 2.9312282579609312,
1925
+ "grad_norm": 1.71875,
1926
+ "learning_rate": 4.7058823529411767e-07,
1927
+ "loss": 1.2354840278625487,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 2.941932031040942,
1932
+ "grad_norm": 1.4609375,
1933
+ "learning_rate": 3.992869875222817e-07,
1934
+ "loss": 1.2647834777832032,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 2.952635804120953,
1939
+ "grad_norm": 1.6875,
1940
+ "learning_rate": 3.2798573975044564e-07,
1941
+ "loss": 1.2786317825317384,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 2.9633395772009634,
1946
+ "grad_norm": 1.53125,
1947
+ "learning_rate": 2.5668449197860965e-07,
1948
+ "loss": 1.2594982147216798,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 2.974043350280974,
1953
+ "grad_norm": 1.578125,
1954
+ "learning_rate": 1.8538324420677363e-07,
1955
+ "loss": 1.3203317642211914,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 2.984747123360985,
1960
+ "grad_norm": 2.046875,
1961
+ "learning_rate": 1.1408199643493762e-07,
1962
+ "loss": 1.224764347076416,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 2.9954508964409956,
1967
+ "grad_norm": 1.5078125,
1968
+ "learning_rate": 4.2780748663101606e-08,
1969
+ "loss": 1.2845193862915039,
1970
+ "step": 2800
1971
+ }
1972
+ ],
1973
+ "logging_steps": 10,
1974
+ "max_steps": 2805,
1975
+ "num_input_tokens_seen": 0,
1976
+ "num_train_epochs": 3,
1977
+ "save_steps": 500,
1978
+ "stateful_callbacks": {
1979
+ "TrainerControl": {
1980
+ "args": {
1981
+ "should_epoch_stop": false,
1982
+ "should_evaluate": false,
1983
+ "should_log": false,
1984
+ "should_save": true,
1985
+ "should_training_stop": true
1986
+ },
1987
+ "attributes": {}
1988
+ }
1989
+ },
1990
+ "total_flos": 7314356060356608.0,
1991
+ "train_batch_size": 2,
1992
+ "trial_name": null,
1993
+ "trial_params": null
1994
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cadf8b90b06130276c0937eec3cb26b58d80dacc8599fa833d2893aa4a490e
3
+ size 5137