ola31 commited on
Commit
06dd538
·
verified ·
1 Parent(s): 62b0f58

Add files using upload-large-folder tool

Browse files
config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 32,
3
+ "action_head_cfg": {
4
+ "action_dim": 32,
5
+ "action_horizon": 16,
6
+ "add_pos_embed": true,
7
+ "backbone_embedding_dim": 2048,
8
+ "diffusion_model_cfg": {
9
+ "attention_head_dim": 48,
10
+ "cross_attention_dim": 2048,
11
+ "dropout": 0.2,
12
+ "final_dropout": true,
13
+ "interleave_self_attention": true,
14
+ "norm_type": "ada_norm",
15
+ "num_attention_heads": 32,
16
+ "num_layers": 16,
17
+ "output_dim": 1024,
18
+ "positional_embeddings": null
19
+ },
20
+ "hidden_size": 1024,
21
+ "input_embedding_dim": 1536,
22
+ "max_action_dim": 32,
23
+ "max_state_dim": 64,
24
+ "model_dtype": "float32",
25
+ "noise_beta_alpha": 1.5,
26
+ "noise_beta_beta": 1.0,
27
+ "noise_s": 0.999,
28
+ "num_inference_timesteps": 4,
29
+ "num_target_vision_tokens": 32,
30
+ "num_timestep_buckets": 1000,
31
+ "tune_diffusion_model": true,
32
+ "tune_projector": true,
33
+ "use_vlln": true,
34
+ "vl_self_attention_cfg": {
35
+ "attention_head_dim": 64,
36
+ "dropout": 0.2,
37
+ "final_dropout": true,
38
+ "num_attention_heads": 32,
39
+ "num_layers": 4,
40
+ "positional_embeddings": null
41
+ }
42
+ },
43
+ "action_horizon": 16,
44
+ "architectures": [
45
+ "GR00T_N1_5"
46
+ ],
47
+ "attn_implementation": null,
48
+ "backbone_cfg": {
49
+ "eagle_path": "NVEagle/eagle_er-qwen3_1_7B-Siglip2_400M_stage1_5_128gpu_er_v7_1mlp_nops",
50
+ "load_bf16": false,
51
+ "project_to_dim": null,
52
+ "reproject_vision": false,
53
+ "select_layer": 12,
54
+ "tune_llm": false,
55
+ "tune_visual": true,
56
+ "use_flash_attention": true
57
+ },
58
+ "compute_dtype": "bfloat16",
59
+ "hidden_size": 2048,
60
+ "model_dtype": "float32",
61
+ "model_type": "gr00t_n1_5",
62
+ "torch_dtype": "bfloat16",
63
+ "transformers_version": "4.51.3"
64
+ }
experiment_cfg/metadata.json ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "statistics": {
4
+ "state": {
5
+ "left_arm": {
6
+ "max": [
7
+ 1.9294003248214722,
8
+ 1.602099061012268,
9
+ 1.6264151334762573,
10
+ -0.25446105003356934,
11
+ 1.5196356773376465,
12
+ 1.1433908939361572,
13
+ 0.6067746877670288,
14
+ 1.0975148677825928
15
+ ],
16
+ "min": [
17
+ -1.1136820316314697,
18
+ 0.0,
19
+ -1.3611562252044678,
20
+ -2.3707194328308105,
21
+ -1.5844343900680542,
22
+ -0.9786677360534668,
23
+ -1.5404508113861084,
24
+ -0.0014851351734250784
25
+ ],
26
+ "mean": [
27
+ 0.593299925327301,
28
+ 0.32747721672058105,
29
+ -0.05066496506333351,
30
+ -1.9063727855682373,
31
+ 0.17272311449050903,
32
+ -0.23221395909786224,
33
+ -0.10456155240535736,
34
+ 0.31981971859931946
35
+ ],
36
+ "std": [
37
+ 0.5220098495483398,
38
+ 0.37464016675949097,
39
+ 0.35901498794555664,
40
+ 0.2830471694469452,
41
+ 0.3904763460159302,
42
+ 0.4544326961040497,
43
+ 0.24206174910068512,
44
+ 0.42259541153907776
45
+ ],
46
+ "q01": [
47
+ -0.6101756036281586,
48
+ 0.0,
49
+ -0.9561625659465789,
50
+ -2.369952440261841,
51
+ -1.06134934425354,
52
+ -0.8941370129585267,
53
+ -0.9334145128726958,
54
+ 0.0
55
+ ],
56
+ "q99": [
57
+ 1.5445568442344675,
58
+ 1.3273332118988044,
59
+ 1.254622507095338,
60
+ -0.7825890600681288,
61
+ 1.1786281228065492,
62
+ 0.8640626072883606,
63
+ 0.27465743422508326,
64
+ 1.0960297584533691
65
+ ]
66
+ },
67
+ "right_arm": {
68
+ "max": [
69
+ 1.8287568092346191,
70
+ 0.0,
71
+ 1.3382184505462646,
72
+ -0.22242721915245056,
73
+ 0.6614453196525574,
74
+ 0.851934552192688,
75
+ 0.7879786491394043,
76
+ 1.0871188640594482
77
+ ],
78
+ "min": [
79
+ -0.9815319776535034,
80
+ -1.6567351818084717,
81
+ -1.280957818031311,
82
+ -2.376268148422241,
83
+ -1.5769321918487549,
84
+ -1.1178884506225586,
85
+ -0.38204869627952576,
86
+ -0.0014851351734250784
87
+ ],
88
+ "mean": [
89
+ 0.7194070816040039,
90
+ -0.4011678397655487,
91
+ -0.04745709151029587,
92
+ -1.989113688468933,
93
+ -0.16602793335914612,
94
+ -0.24818535149097443,
95
+ 0.047956615686416626,
96
+ 0.3058697283267975
97
+ ],
98
+ "std": [
99
+ 0.5864532589912415,
100
+ 0.30040380358695984,
101
+ 0.3263898193836212,
102
+ 0.2959355115890503,
103
+ 0.32285091280937195,
104
+ 0.43125075101852417,
105
+ 0.18399831652641296,
106
+ 0.427061527967453
107
+ ],
108
+ "q01": [
109
+ -0.5737208127975464,
110
+ -1.260634982585907,
111
+ -1.029768466949463,
112
+ -2.3731162548065186,
113
+ -1.2456367015838623,
114
+ -0.8758910298347473,
115
+ -0.27340284287929534,
116
+ 0.0
117
+ ],
118
+ "q99": [
119
+ 1.4089793443679814,
120
+ 0.0,
121
+ 0.6310521304607393,
122
+ -0.8161724567413328,
123
+ 0.36215848326683053,
124
+ 0.7335819780826569,
125
+ 0.5693755149841309,
126
+ 1.078208088874817
127
+ ]
128
+ }
129
+ },
130
+ "action": {
131
+ "left_arm": {
132
+ "max": [
133
+ 1.932225227355957,
134
+ 1.6037930250167847,
135
+ 1.6268140077590942,
136
+ -0.2546408176422119,
137
+ 1.5193828344345093,
138
+ 1.1449086666107178,
139
+ 0.6062183976173401,
140
+ 1.217768669128418
141
+ ],
142
+ "min": [
143
+ -1.1152039766311646,
144
+ 0.0,
145
+ -1.3621749877929688,
146
+ -2.3715343475341797,
147
+ -1.5892040729522705,
148
+ -0.9786797165870667,
149
+ -1.5401166677474976,
150
+ -0.45471855998039246
151
+ ],
152
+ "mean": [
153
+ 0.5945148468017578,
154
+ 0.3266119956970215,
155
+ -0.0502079613506794,
156
+ -1.9065903425216675,
157
+ 0.17195096611976624,
158
+ -0.2329772710800171,
159
+ -0.10447312891483307,
160
+ 0.31009721755981445
161
+ ],
162
+ "std": [
163
+ 0.5217796564102173,
164
+ 0.3746543228626251,
165
+ 0.35901448130607605,
166
+ 0.2825615406036377,
167
+ 0.3902038037776947,
168
+ 0.4548611342906952,
169
+ 0.24220769107341766,
170
+ 0.4963742792606354
171
+ ],
172
+ "q01": [
173
+ -0.6089903712272644,
174
+ 0.0,
175
+ -0.9526020884513855,
176
+ -2.3700003623962402,
177
+ -1.0650429010391234,
178
+ -0.8943107724189758,
179
+ -0.9341943264007568,
180
+ -0.34887388348579407
181
+ ],
182
+ "q99": [
183
+ 1.5470080375671387,
184
+ 1.3290762901306152,
185
+ 1.2554092407226562,
186
+ -0.7900000810623169,
187
+ 1.177138090133667,
188
+ 0.8640530705451965,
189
+ 0.2762514352798462,
190
+ 1.187074065208435
191
+ ]
192
+ },
193
+ "right_arm": {
194
+ "max": [
195
+ 1.8293983936309814,
196
+ 0.0,
197
+ 1.3382847309112549,
198
+ -0.22242721915245056,
199
+ 0.6614686846733093,
200
+ 0.8517752289772034,
201
+ 0.7873165607452393,
202
+ 1.3436164855957031
203
+ ],
204
+ "min": [
205
+ -0.9817476868629456,
206
+ -1.656699299812317,
207
+ -1.2824078798294067,
208
+ -2.376136302947998,
209
+ -1.5769321918487549,
210
+ -1.0952622890472412,
211
+ -0.38196122646331787,
212
+ -0.20314569771289825
213
+ ],
214
+ "mean": [
215
+ 0.7209159731864929,
216
+ -0.4004747271537781,
217
+ -0.047843363136053085,
218
+ -1.9898872375488281,
219
+ -0.16549433767795563,
220
+ -0.24894820153713226,
221
+ 0.048004526644945145,
222
+ 0.3641880452632904
223
+ ],
224
+ "std": [
225
+ 0.5858996510505676,
226
+ 0.3006748855113983,
227
+ 0.32654568552970886,
228
+ 0.2949144244194031,
229
+ 0.3225865364074707,
230
+ 0.43120241165161133,
231
+ 0.18383744359016418,
232
+ 0.534224271774292
233
+ ],
234
+ "q01": [
235
+ -0.5737088322639465,
236
+ -1.2640001773834229,
237
+ -1.0308350324630737,
238
+ -2.373068332672119,
239
+ -1.2455923557281494,
240
+ -0.8759030103683472,
241
+ -0.2730485796928406,
242
+ -0.0788932517170906
243
+ ],
244
+ "q99": [
245
+ 1.410417079925537,
246
+ 0.0,
247
+ 0.6307740807533264,
248
+ -0.8191457390785217,
249
+ 0.3637310564517975,
250
+ 0.7336010336875916,
251
+ 0.5693848729133606,
252
+ 1.3221303224563599
253
+ ]
254
+ }
255
+ }
256
+ },
257
+ "modalities": {
258
+ "video": {
259
+ "cam_head": {
260
+ "resolution": [
261
+ 672,
262
+ 376
263
+ ],
264
+ "channels": 3,
265
+ "fps": 10.0
266
+ }
267
+ },
268
+ "state": {
269
+ "left_arm": {
270
+ "absolute": true,
271
+ "rotation_type": null,
272
+ "shape": [
273
+ 8
274
+ ],
275
+ "continuous": true
276
+ },
277
+ "right_arm": {
278
+ "absolute": true,
279
+ "rotation_type": null,
280
+ "shape": [
281
+ 8
282
+ ],
283
+ "continuous": true
284
+ }
285
+ },
286
+ "action": {
287
+ "left_arm": {
288
+ "absolute": true,
289
+ "rotation_type": null,
290
+ "shape": [
291
+ 8
292
+ ],
293
+ "continuous": true
294
+ },
295
+ "right_arm": {
296
+ "absolute": true,
297
+ "rotation_type": null,
298
+ "shape": [
299
+ 8
300
+ ],
301
+ "continuous": true
302
+ }
303
+ }
304
+ },
305
+ "embodiment_tag": "new_embodiment"
306
+ }
307
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8dc9fc08499c881bbb0b95233a234222b3584d6d74f92e7787d6b3ef00b479f
3
+ size 4999367032
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72063132af90c9d887d4efd2f6323b8e725abdbf501b33e20adfc7829075e512
3
+ size 2586705312
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,1434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 15.748031496062993,
6
+ "eval_steps": 500,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.07874015748031496,
14
+ "grad_norm": 5.119835376739502,
15
+ "learning_rate": 1.8e-06,
16
+ "loss": 0.8008,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.15748031496062992,
21
+ "grad_norm": 2.7846670150756836,
22
+ "learning_rate": 3.8e-06,
23
+ "loss": 0.6753,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.23622047244094488,
28
+ "grad_norm": 0.9429484605789185,
29
+ "learning_rate": 5.8e-06,
30
+ "loss": 0.3854,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.31496062992125984,
35
+ "grad_norm": 0.4289324879646301,
36
+ "learning_rate": 7.8e-06,
37
+ "loss": 0.2374,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.3937007874015748,
42
+ "grad_norm": 0.7212084531784058,
43
+ "learning_rate": 9.800000000000001e-06,
44
+ "loss": 0.1937,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.47244094488188976,
49
+ "grad_norm": 0.3743739426136017,
50
+ "learning_rate": 1.18e-05,
51
+ "loss": 0.169,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.5511811023622047,
56
+ "grad_norm": 0.33906233310699463,
57
+ "learning_rate": 1.3800000000000002e-05,
58
+ "loss": 0.1513,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.6299212598425197,
63
+ "grad_norm": 0.4938454031944275,
64
+ "learning_rate": 1.58e-05,
65
+ "loss": 0.1333,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.7086614173228346,
70
+ "grad_norm": 0.3791239261627197,
71
+ "learning_rate": 1.78e-05,
72
+ "loss": 0.1209,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.7874015748031497,
77
+ "grad_norm": 0.31115081906318665,
78
+ "learning_rate": 1.9800000000000004e-05,
79
+ "loss": 0.108,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.8661417322834646,
84
+ "grad_norm": 0.3266952633857727,
85
+ "learning_rate": 2.18e-05,
86
+ "loss": 0.105,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.9448818897637795,
91
+ "grad_norm": 0.42559316754341125,
92
+ "learning_rate": 2.38e-05,
93
+ "loss": 0.0931,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 1.0236220472440944,
98
+ "grad_norm": 0.40235716104507446,
99
+ "learning_rate": 2.58e-05,
100
+ "loss": 0.0867,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 1.1023622047244095,
105
+ "grad_norm": 0.3455784022808075,
106
+ "learning_rate": 2.7800000000000005e-05,
107
+ "loss": 0.0843,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 1.1811023622047245,
112
+ "grad_norm": 0.39895790815353394,
113
+ "learning_rate": 2.98e-05,
114
+ "loss": 0.0803,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 1.2598425196850394,
119
+ "grad_norm": 0.32412493228912354,
120
+ "learning_rate": 3.18e-05,
121
+ "loss": 0.075,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 1.3385826771653544,
126
+ "grad_norm": 0.3238581717014313,
127
+ "learning_rate": 3.38e-05,
128
+ "loss": 0.069,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 1.4173228346456692,
133
+ "grad_norm": 0.35713112354278564,
134
+ "learning_rate": 3.58e-05,
135
+ "loss": 0.0632,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 1.4960629921259843,
140
+ "grad_norm": 0.31078988313674927,
141
+ "learning_rate": 3.7800000000000004e-05,
142
+ "loss": 0.0606,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 1.574803149606299,
147
+ "grad_norm": 0.23168951272964478,
148
+ "learning_rate": 3.9800000000000005e-05,
149
+ "loss": 0.0566,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 1.6535433070866141,
154
+ "grad_norm": 0.2528112828731537,
155
+ "learning_rate": 4.18e-05,
156
+ "loss": 0.0553,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 1.7322834645669292,
161
+ "grad_norm": 0.31132972240448,
162
+ "learning_rate": 4.38e-05,
163
+ "loss": 0.0516,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 1.811023622047244,
168
+ "grad_norm": 0.3506482243537903,
169
+ "learning_rate": 4.58e-05,
170
+ "loss": 0.051,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 1.889763779527559,
175
+ "grad_norm": 0.30420321226119995,
176
+ "learning_rate": 4.78e-05,
177
+ "loss": 0.0498,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 1.968503937007874,
182
+ "grad_norm": 0.27608105540275574,
183
+ "learning_rate": 4.9800000000000004e-05,
184
+ "loss": 0.0425,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 2.047244094488189,
189
+ "grad_norm": 0.24153359234333038,
190
+ "learning_rate": 5.1800000000000005e-05,
191
+ "loss": 0.0425,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 2.1259842519685037,
196
+ "grad_norm": 0.2684983015060425,
197
+ "learning_rate": 5.380000000000001e-05,
198
+ "loss": 0.0397,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 2.204724409448819,
203
+ "grad_norm": 0.2812291979789734,
204
+ "learning_rate": 5.580000000000001e-05,
205
+ "loss": 0.0347,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 2.283464566929134,
210
+ "grad_norm": 0.272079199552536,
211
+ "learning_rate": 5.7799999999999995e-05,
212
+ "loss": 0.0339,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 2.362204724409449,
217
+ "grad_norm": 0.30601683259010315,
218
+ "learning_rate": 5.9800000000000003e-05,
219
+ "loss": 0.0359,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 2.440944881889764,
224
+ "grad_norm": 0.3129172921180725,
225
+ "learning_rate": 6.18e-05,
226
+ "loss": 0.0351,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 2.5196850393700787,
231
+ "grad_norm": 0.27252131700515747,
232
+ "learning_rate": 6.38e-05,
233
+ "loss": 0.0287,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 2.5984251968503935,
238
+ "grad_norm": 0.2653070390224457,
239
+ "learning_rate": 6.58e-05,
240
+ "loss": 0.0313,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 2.677165354330709,
245
+ "grad_norm": 0.35808777809143066,
246
+ "learning_rate": 6.780000000000001e-05,
247
+ "loss": 0.0356,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 2.7559055118110236,
252
+ "grad_norm": 0.26742085814476013,
253
+ "learning_rate": 6.98e-05,
254
+ "loss": 0.0299,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 2.8346456692913384,
259
+ "grad_norm": 0.4106348156929016,
260
+ "learning_rate": 7.18e-05,
261
+ "loss": 0.0324,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 2.9133858267716537,
266
+ "grad_norm": 0.213535338640213,
267
+ "learning_rate": 7.38e-05,
268
+ "loss": 0.0273,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 2.9921259842519685,
273
+ "grad_norm": 0.26808497309684753,
274
+ "learning_rate": 7.58e-05,
275
+ "loss": 0.0254,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 3.0708661417322833,
280
+ "grad_norm": 0.18177832663059235,
281
+ "learning_rate": 7.780000000000001e-05,
282
+ "loss": 0.0261,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 3.1496062992125986,
287
+ "grad_norm": 0.2706851065158844,
288
+ "learning_rate": 7.98e-05,
289
+ "loss": 0.0268,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 3.2283464566929134,
294
+ "grad_norm": 0.29524528980255127,
295
+ "learning_rate": 8.18e-05,
296
+ "loss": 0.0285,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 3.3070866141732282,
301
+ "grad_norm": 0.21399272978305817,
302
+ "learning_rate": 8.38e-05,
303
+ "loss": 0.0252,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 3.3858267716535435,
308
+ "grad_norm": 0.290097177028656,
309
+ "learning_rate": 8.58e-05,
310
+ "loss": 0.0279,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 3.4645669291338583,
315
+ "grad_norm": 0.2789689004421234,
316
+ "learning_rate": 8.78e-05,
317
+ "loss": 0.0236,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 3.543307086614173,
322
+ "grad_norm": 0.3307545781135559,
323
+ "learning_rate": 8.98e-05,
324
+ "loss": 0.0256,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 3.622047244094488,
329
+ "grad_norm": 0.2919306457042694,
330
+ "learning_rate": 9.180000000000001e-05,
331
+ "loss": 0.0227,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 3.7007874015748032,
336
+ "grad_norm": 0.27534034848213196,
337
+ "learning_rate": 9.38e-05,
338
+ "loss": 0.0219,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 3.779527559055118,
343
+ "grad_norm": 0.26348116993904114,
344
+ "learning_rate": 9.58e-05,
345
+ "loss": 0.0242,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 3.8582677165354333,
350
+ "grad_norm": 0.29468125104904175,
351
+ "learning_rate": 9.78e-05,
352
+ "loss": 0.0224,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 3.937007874015748,
357
+ "grad_norm": 0.20534993708133698,
358
+ "learning_rate": 9.98e-05,
359
+ "loss": 0.0235,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 4.015748031496063,
364
+ "grad_norm": 0.2911393642425537,
365
+ "learning_rate": 9.9999778549206e-05,
366
+ "loss": 0.0206,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 4.094488188976378,
371
+ "grad_norm": 0.2478438913822174,
372
+ "learning_rate": 9.999901304280685e-05,
373
+ "loss": 0.0219,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 4.173228346456693,
378
+ "grad_norm": 0.32605063915252686,
379
+ "learning_rate": 9.999770075521164e-05,
380
+ "loss": 0.0232,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 4.251968503937007,
385
+ "grad_norm": 0.22585000097751617,
386
+ "learning_rate": 9.99958417007713e-05,
387
+ "loss": 0.0228,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 4.330708661417323,
392
+ "grad_norm": 0.3227289319038391,
393
+ "learning_rate": 9.999343589981615e-05,
394
+ "loss": 0.018,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 4.409448818897638,
399
+ "grad_norm": 0.2523372769355774,
400
+ "learning_rate": 9.999048337865568e-05,
401
+ "loss": 0.0215,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 4.488188976377953,
406
+ "grad_norm": 0.34720173478126526,
407
+ "learning_rate": 9.998698416957815e-05,
408
+ "loss": 0.0242,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 4.566929133858268,
413
+ "grad_norm": 0.24005654454231262,
414
+ "learning_rate": 9.998293831085037e-05,
415
+ "loss": 0.0213,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 4.645669291338582,
420
+ "grad_norm": 0.28940242528915405,
421
+ "learning_rate": 9.997834584671719e-05,
422
+ "loss": 0.0204,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 4.724409448818898,
427
+ "grad_norm": 0.2654191255569458,
428
+ "learning_rate": 9.997320682740107e-05,
429
+ "loss": 0.0217,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 4.803149606299213,
434
+ "grad_norm": 0.2912241816520691,
435
+ "learning_rate": 9.996752130910149e-05,
436
+ "loss": 0.0197,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 4.881889763779528,
441
+ "grad_norm": 0.23718924820423126,
442
+ "learning_rate": 9.99612893539944e-05,
443
+ "loss": 0.0209,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 4.960629921259843,
448
+ "grad_norm": 0.2647818326950073,
449
+ "learning_rate": 9.995451103023144e-05,
450
+ "loss": 0.0222,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 5.039370078740157,
455
+ "grad_norm": 0.35324886441230774,
456
+ "learning_rate": 9.994718641193928e-05,
457
+ "loss": 0.0224,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 5.118110236220472,
462
+ "grad_norm": 0.2671961188316345,
463
+ "learning_rate": 9.993931557921874e-05,
464
+ "loss": 0.0219,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 5.196850393700787,
469
+ "grad_norm": 0.2596529722213745,
470
+ "learning_rate": 9.993089861814402e-05,
471
+ "loss": 0.0203,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 5.275590551181103,
476
+ "grad_norm": 0.25885483622550964,
477
+ "learning_rate": 9.992193562076166e-05,
478
+ "loss": 0.0188,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 5.354330708661418,
483
+ "grad_norm": 0.24976016581058502,
484
+ "learning_rate": 9.991242668508954e-05,
485
+ "loss": 0.0175,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 5.433070866141732,
490
+ "grad_norm": 0.24121227860450745,
491
+ "learning_rate": 9.990237191511587e-05,
492
+ "loss": 0.0158,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 5.511811023622047,
497
+ "grad_norm": 0.22227917611598969,
498
+ "learning_rate": 9.989177142079802e-05,
499
+ "loss": 0.0177,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 5.590551181102362,
504
+ "grad_norm": 0.231464222073555,
505
+ "learning_rate": 9.988062531806126e-05,
506
+ "loss": 0.0183,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 5.669291338582677,
511
+ "grad_norm": 0.16609017550945282,
512
+ "learning_rate": 9.986893372879762e-05,
513
+ "loss": 0.018,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 5.748031496062993,
518
+ "grad_norm": 0.19624024629592896,
519
+ "learning_rate": 9.985669678086443e-05,
520
+ "loss": 0.018,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 5.826771653543307,
525
+ "grad_norm": 0.22255055606365204,
526
+ "learning_rate": 9.984391460808298e-05,
527
+ "loss": 0.0199,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 5.905511811023622,
532
+ "grad_norm": 0.22765639424324036,
533
+ "learning_rate": 9.983058735023709e-05,
534
+ "loss": 0.0191,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 5.984251968503937,
539
+ "grad_norm": 0.23915418982505798,
540
+ "learning_rate": 9.98167151530715e-05,
541
+ "loss": 0.0178,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 6.062992125984252,
546
+ "grad_norm": 0.2489311248064041,
547
+ "learning_rate": 9.980229816829034e-05,
548
+ "loss": 0.0202,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 6.141732283464567,
553
+ "grad_norm": 0.22865547239780426,
554
+ "learning_rate": 9.978733655355544e-05,
555
+ "loss": 0.0187,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 6.2204724409448815,
560
+ "grad_norm": 0.19393905997276306,
561
+ "learning_rate": 9.977183047248464e-05,
562
+ "loss": 0.0168,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 6.299212598425197,
567
+ "grad_norm": 0.20525363087654114,
568
+ "learning_rate": 9.975578009464992e-05,
569
+ "loss": 0.018,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 6.377952755905512,
574
+ "grad_norm": 0.2537108063697815,
575
+ "learning_rate": 9.97391855955757e-05,
576
+ "loss": 0.0143,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 6.456692913385827,
581
+ "grad_norm": 0.2665018141269684,
582
+ "learning_rate": 9.972204715673669e-05,
583
+ "loss": 0.0165,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 6.535433070866142,
588
+ "grad_norm": 0.18383699655532837,
589
+ "learning_rate": 9.970436496555617e-05,
590
+ "loss": 0.0164,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 6.6141732283464565,
595
+ "grad_norm": 0.3430931270122528,
596
+ "learning_rate": 9.968613921540373e-05,
597
+ "loss": 0.0176,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 6.692913385826771,
602
+ "grad_norm": 0.2601425349712372,
603
+ "learning_rate": 9.966737010559326e-05,
604
+ "loss": 0.0175,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 6.771653543307087,
609
+ "grad_norm": 0.19988982379436493,
610
+ "learning_rate": 9.964805784138072e-05,
611
+ "loss": 0.0172,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 6.850393700787402,
616
+ "grad_norm": 0.18660953640937805,
617
+ "learning_rate": 9.962820263396195e-05,
618
+ "loss": 0.0158,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 6.929133858267717,
623
+ "grad_norm": 0.22756962478160858,
624
+ "learning_rate": 9.960780470047033e-05,
625
+ "loss": 0.0185,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 7.0078740157480315,
630
+ "grad_norm": 0.14548353850841522,
631
+ "learning_rate": 9.958686426397437e-05,
632
+ "loss": 0.0164,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 7.086614173228346,
637
+ "grad_norm": 0.20737145841121674,
638
+ "learning_rate": 9.956538155347534e-05,
639
+ "loss": 0.0182,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 7.165354330708661,
644
+ "grad_norm": 0.20689648389816284,
645
+ "learning_rate": 9.95433568039047e-05,
646
+ "loss": 0.0145,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 7.244094488188976,
651
+ "grad_norm": 0.26220783591270447,
652
+ "learning_rate": 9.952079025612162e-05,
653
+ "loss": 0.0145,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 7.322834645669292,
658
+ "grad_norm": 0.23523452877998352,
659
+ "learning_rate": 9.949768215691022e-05,
660
+ "loss": 0.0168,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 7.4015748031496065,
665
+ "grad_norm": 0.207063227891922,
666
+ "learning_rate": 9.9474032758977e-05,
667
+ "loss": 0.0154,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 7.480314960629921,
672
+ "grad_norm": 0.2092580646276474,
673
+ "learning_rate": 9.944984232094794e-05,
674
+ "loss": 0.0169,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 7.559055118110236,
679
+ "grad_norm": 0.1808154582977295,
680
+ "learning_rate": 9.942511110736584e-05,
681
+ "loss": 0.0157,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 7.637795275590551,
686
+ "grad_norm": 0.2190985083580017,
687
+ "learning_rate": 9.939983938868726e-05,
688
+ "loss": 0.0155,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 7.716535433070866,
693
+ "grad_norm": 0.1607908308506012,
694
+ "learning_rate": 9.93740274412797e-05,
695
+ "loss": 0.0136,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 7.7952755905511815,
700
+ "grad_norm": 0.20882774889469147,
701
+ "learning_rate": 9.934767554741846e-05,
702
+ "loss": 0.0192,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 7.874015748031496,
707
+ "grad_norm": 0.18141894042491913,
708
+ "learning_rate": 9.932078399528361e-05,
709
+ "loss": 0.0134,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 7.952755905511811,
714
+ "grad_norm": 0.1842644363641739,
715
+ "learning_rate": 9.929335307895689e-05,
716
+ "loss": 0.0145,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 8.031496062992126,
721
+ "grad_norm": 0.19102592766284943,
722
+ "learning_rate": 9.926538309841839e-05,
723
+ "loss": 0.0179,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 8.11023622047244,
728
+ "grad_norm": 0.2554001212120056,
729
+ "learning_rate": 9.923687435954334e-05,
730
+ "loss": 0.0145,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 8.188976377952756,
735
+ "grad_norm": 0.2188219279050827,
736
+ "learning_rate": 9.920782717409873e-05,
737
+ "loss": 0.0133,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 8.26771653543307,
742
+ "grad_norm": 0.19668325781822205,
743
+ "learning_rate": 9.917824185973994e-05,
744
+ "loss": 0.013,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 8.346456692913385,
749
+ "grad_norm": 0.19224300980567932,
750
+ "learning_rate": 9.914811874000723e-05,
751
+ "loss": 0.012,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 8.4251968503937,
756
+ "grad_norm": 0.2617517113685608,
757
+ "learning_rate": 9.911745814432218e-05,
758
+ "loss": 0.0144,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 8.503937007874015,
763
+ "grad_norm": 0.340850293636322,
764
+ "learning_rate": 9.90862604079842e-05,
765
+ "loss": 0.0163,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 8.582677165354331,
770
+ "grad_norm": 0.24036389589309692,
771
+ "learning_rate": 9.90545258721667e-05,
772
+ "loss": 0.0143,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 8.661417322834646,
777
+ "grad_norm": 0.2523621916770935,
778
+ "learning_rate": 9.90222548839135e-05,
779
+ "loss": 0.0137,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 8.740157480314961,
784
+ "grad_norm": 0.25303855538368225,
785
+ "learning_rate": 9.898944779613495e-05,
786
+ "loss": 0.0124,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 8.818897637795276,
791
+ "grad_norm": 0.2672367990016937,
792
+ "learning_rate": 9.89561049676041e-05,
793
+ "loss": 0.0135,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 8.89763779527559,
798
+ "grad_norm": 0.22292408347129822,
799
+ "learning_rate": 9.89222267629528e-05,
800
+ "loss": 0.0155,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 8.976377952755906,
805
+ "grad_norm": 0.2113981992006302,
806
+ "learning_rate": 9.888781355266763e-05,
807
+ "loss": 0.0139,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 9.05511811023622,
812
+ "grad_norm": 0.16752807796001434,
813
+ "learning_rate": 9.885286571308598e-05,
814
+ "loss": 0.0124,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 9.133858267716535,
819
+ "grad_norm": 0.1773703545331955,
820
+ "learning_rate": 9.881738362639182e-05,
821
+ "loss": 0.015,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 9.21259842519685,
826
+ "grad_norm": 0.26974138617515564,
827
+ "learning_rate": 9.878136768061154e-05,
828
+ "loss": 0.0162,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 9.291338582677165,
833
+ "grad_norm": 0.2184063196182251,
834
+ "learning_rate": 9.874481826960979e-05,
835
+ "loss": 0.0148,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 9.37007874015748,
840
+ "grad_norm": 0.1977306753396988,
841
+ "learning_rate": 9.870773579308503e-05,
842
+ "loss": 0.0123,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 9.448818897637794,
847
+ "grad_norm": 0.1981269121170044,
848
+ "learning_rate": 9.867012065656533e-05,
849
+ "loss": 0.0152,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 9.527559055118111,
854
+ "grad_norm": 0.17817805707454681,
855
+ "learning_rate": 9.863197327140376e-05,
856
+ "loss": 0.0123,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 9.606299212598426,
861
+ "grad_norm": 0.23420843482017517,
862
+ "learning_rate": 9.859329405477403e-05,
863
+ "loss": 0.0129,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 9.68503937007874,
868
+ "grad_norm": 0.25216200947761536,
869
+ "learning_rate": 9.855408342966585e-05,
870
+ "loss": 0.0138,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 9.763779527559056,
875
+ "grad_norm": 0.1990588754415512,
876
+ "learning_rate": 9.851434182488033e-05,
877
+ "loss": 0.0129,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 9.84251968503937,
882
+ "grad_norm": 0.27837619185447693,
883
+ "learning_rate": 9.84740696750253e-05,
884
+ "loss": 0.0124,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 9.921259842519685,
889
+ "grad_norm": 0.21090054512023926,
890
+ "learning_rate": 9.843326742051055e-05,
891
+ "loss": 0.013,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 10.0,
896
+ "grad_norm": 0.19581645727157593,
897
+ "learning_rate": 9.839193550754297e-05,
898
+ "loss": 0.0126,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 10.078740157480315,
903
+ "grad_norm": 0.21251627802848816,
904
+ "learning_rate": 9.835007438812177e-05,
905
+ "loss": 0.0148,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 10.15748031496063,
910
+ "grad_norm": 0.18511821329593658,
911
+ "learning_rate": 9.830768452003341e-05,
912
+ "loss": 0.0133,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 10.236220472440944,
917
+ "grad_norm": 0.18811464309692383,
918
+ "learning_rate": 9.826476636684671e-05,
919
+ "loss": 0.0126,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 10.31496062992126,
924
+ "grad_norm": 0.18782231211662292,
925
+ "learning_rate": 9.822132039790773e-05,
926
+ "loss": 0.0117,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 10.393700787401574,
931
+ "grad_norm": 0.16824057698249817,
932
+ "learning_rate": 9.817734708833461e-05,
933
+ "loss": 0.0106,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 10.472440944881889,
938
+ "grad_norm": 0.1814710795879364,
939
+ "learning_rate": 9.813284691901243e-05,
940
+ "loss": 0.0162,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 10.551181102362206,
945
+ "grad_norm": 0.2217687964439392,
946
+ "learning_rate": 9.808782037658792e-05,
947
+ "loss": 0.0155,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 10.62992125984252,
952
+ "grad_norm": 0.19781896471977234,
953
+ "learning_rate": 9.804226795346411e-05,
954
+ "loss": 0.0133,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 10.708661417322835,
959
+ "grad_norm": 0.24714171886444092,
960
+ "learning_rate": 9.799619014779503e-05,
961
+ "loss": 0.0129,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 10.78740157480315,
966
+ "grad_norm": 0.16805458068847656,
967
+ "learning_rate": 9.794958746348013e-05,
968
+ "loss": 0.0125,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 10.866141732283465,
973
+ "grad_norm": 0.18694327771663666,
974
+ "learning_rate": 9.790246041015896e-05,
975
+ "loss": 0.0112,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 10.94488188976378,
980
+ "grad_norm": 0.21768535673618317,
981
+ "learning_rate": 9.785480950320538e-05,
982
+ "loss": 0.0121,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 11.023622047244094,
987
+ "grad_norm": 0.16912485659122467,
988
+ "learning_rate": 9.78066352637221e-05,
989
+ "loss": 0.0109,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 11.10236220472441,
994
+ "grad_norm": 0.15913233160972595,
995
+ "learning_rate": 9.775793821853488e-05,
996
+ "loss": 0.0115,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 11.181102362204724,
1001
+ "grad_norm": 0.15250848233699799,
1002
+ "learning_rate": 9.77087189001868e-05,
1003
+ "loss": 0.0123,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 11.259842519685039,
1008
+ "grad_norm": 0.17317131161689758,
1009
+ "learning_rate": 9.765897784693243e-05,
1010
+ "loss": 0.0117,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 11.338582677165354,
1015
+ "grad_norm": 0.23304998874664307,
1016
+ "learning_rate": 9.760871560273197e-05,
1017
+ "loss": 0.0107,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 11.417322834645669,
1022
+ "grad_norm": 0.2260117381811142,
1023
+ "learning_rate": 9.755793271724526e-05,
1024
+ "loss": 0.0113,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 11.496062992125985,
1029
+ "grad_norm": 0.20854035019874573,
1030
+ "learning_rate": 9.750662974582584e-05,
1031
+ "loss": 0.0156,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 11.5748031496063,
1036
+ "grad_norm": 0.18729598820209503,
1037
+ "learning_rate": 9.745480724951473e-05,
1038
+ "loss": 0.0115,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 11.653543307086615,
1043
+ "grad_norm": 0.1489574909210205,
1044
+ "learning_rate": 9.740246579503447e-05,
1045
+ "loss": 0.0122,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 11.73228346456693,
1050
+ "grad_norm": 0.16865724325180054,
1051
+ "learning_rate": 9.734960595478284e-05,
1052
+ "loss": 0.0121,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 11.811023622047244,
1057
+ "grad_norm": 0.1705121397972107,
1058
+ "learning_rate": 9.729622830682657e-05,
1059
+ "loss": 0.0117,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 11.88976377952756,
1064
+ "grad_norm": 0.12779462337493896,
1065
+ "learning_rate": 9.724233343489504e-05,
1066
+ "loss": 0.013,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 11.968503937007874,
1071
+ "grad_norm": 0.21109400689601898,
1072
+ "learning_rate": 9.718792192837396e-05,
1073
+ "loss": 0.0105,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 12.047244094488189,
1078
+ "grad_norm": 0.17350123822689056,
1079
+ "learning_rate": 9.713299438229886e-05,
1080
+ "loss": 0.0129,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 12.125984251968504,
1085
+ "grad_norm": 0.19555015861988068,
1086
+ "learning_rate": 9.707755139734855e-05,
1087
+ "loss": 0.0131,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 12.204724409448819,
1092
+ "grad_norm": 0.22949132323265076,
1093
+ "learning_rate": 9.702159357983866e-05,
1094
+ "loss": 0.0122,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 12.283464566929133,
1099
+ "grad_norm": 0.21299389004707336,
1100
+ "learning_rate": 9.696512154171492e-05,
1101
+ "loss": 0.013,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 12.362204724409448,
1106
+ "grad_norm": 0.2029636800289154,
1107
+ "learning_rate": 9.690813590054645e-05,
1108
+ "loss": 0.0127,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 12.440944881889763,
1113
+ "grad_norm": 0.2509428858757019,
1114
+ "learning_rate": 9.685063727951914e-05,
1115
+ "loss": 0.0115,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 12.519685039370078,
1120
+ "grad_norm": 0.17952832579612732,
1121
+ "learning_rate": 9.679262630742865e-05,
1122
+ "loss": 0.0123,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 12.598425196850394,
1127
+ "grad_norm": 0.17356553673744202,
1128
+ "learning_rate": 9.673410361867373e-05,
1129
+ "loss": 0.0133,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 12.67716535433071,
1134
+ "grad_norm": 0.2649160921573639,
1135
+ "learning_rate": 9.667506985324909e-05,
1136
+ "loss": 0.0116,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 12.755905511811024,
1141
+ "grad_norm": 0.209790900349617,
1142
+ "learning_rate": 9.661552565673855e-05,
1143
+ "loss": 0.0104,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 12.834645669291339,
1148
+ "grad_norm": 0.21641805768013,
1149
+ "learning_rate": 9.655547168030789e-05,
1150
+ "loss": 0.0129,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 12.913385826771654,
1155
+ "grad_norm": 0.2454116940498352,
1156
+ "learning_rate": 9.649490858069777e-05,
1157
+ "loss": 0.0104,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 12.992125984251969,
1162
+ "grad_norm": 0.21532991528511047,
1163
+ "learning_rate": 9.643383702021658e-05,
1164
+ "loss": 0.0111,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 13.070866141732283,
1169
+ "grad_norm": 0.22105859220027924,
1170
+ "learning_rate": 9.637225766673307e-05,
1171
+ "loss": 0.0098,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 13.149606299212598,
1176
+ "grad_norm": 0.16493800282478333,
1177
+ "learning_rate": 9.631017119366922e-05,
1178
+ "loss": 0.0107,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 13.228346456692913,
1183
+ "grad_norm": 0.18128368258476257,
1184
+ "learning_rate": 9.624757827999273e-05,
1185
+ "loss": 0.0117,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 13.307086614173228,
1190
+ "grad_norm": 0.2412339746952057,
1191
+ "learning_rate": 9.618447961020971e-05,
1192
+ "loss": 0.015,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 13.385826771653543,
1197
+ "grad_norm": 0.23829780519008636,
1198
+ "learning_rate": 9.612087587435707e-05,
1199
+ "loss": 0.0146,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 13.464566929133857,
1204
+ "grad_norm": 0.17435379326343536,
1205
+ "learning_rate": 9.605676776799508e-05,
1206
+ "loss": 0.012,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 13.543307086614174,
1211
+ "grad_norm": 0.26677370071411133,
1212
+ "learning_rate": 9.599215599219973e-05,
1213
+ "loss": 0.0119,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 13.622047244094489,
1218
+ "grad_norm": 0.17352107167243958,
1219
+ "learning_rate": 9.592704125355505e-05,
1220
+ "loss": 0.0119,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 13.700787401574804,
1225
+ "grad_norm": 0.1817910224199295,
1226
+ "learning_rate": 9.586142426414538e-05,
1227
+ "loss": 0.011,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 13.779527559055119,
1232
+ "grad_norm": 0.26779067516326904,
1233
+ "learning_rate": 9.57953057415476e-05,
1234
+ "loss": 0.0137,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 13.858267716535433,
1239
+ "grad_norm": 0.16992807388305664,
1240
+ "learning_rate": 9.572868640882328e-05,
1241
+ "loss": 0.0116,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 13.937007874015748,
1246
+ "grad_norm": 0.2475721836090088,
1247
+ "learning_rate": 9.56615669945108e-05,
1248
+ "loss": 0.013,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 14.015748031496063,
1253
+ "grad_norm": 0.30210572481155396,
1254
+ "learning_rate": 9.55939482326173e-05,
1255
+ "loss": 0.0124,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 14.094488188976378,
1260
+ "grad_norm": 0.19526968896389008,
1261
+ "learning_rate": 9.552583086261069e-05,
1262
+ "loss": 0.0135,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 14.173228346456693,
1267
+ "grad_norm": 0.1772489845752716,
1268
+ "learning_rate": 9.545721562941168e-05,
1269
+ "loss": 0.0119,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 14.251968503937007,
1274
+ "grad_norm": 0.20985430479049683,
1275
+ "learning_rate": 9.538810328338543e-05,
1276
+ "loss": 0.0106,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 14.330708661417322,
1281
+ "grad_norm": 0.2288864552974701,
1282
+ "learning_rate": 9.531849458033349e-05,
1283
+ "loss": 0.0121,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 14.409448818897637,
1288
+ "grad_norm": 0.14826878905296326,
1289
+ "learning_rate": 9.524839028148547e-05,
1290
+ "loss": 0.0109,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 14.488188976377952,
1295
+ "grad_norm": 0.24729447066783905,
1296
+ "learning_rate": 9.517779115349077e-05,
1297
+ "loss": 0.0122,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 14.566929133858268,
1302
+ "grad_norm": 0.23712359368801117,
1303
+ "learning_rate": 9.510669796841014e-05,
1304
+ "loss": 0.012,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 14.645669291338583,
1309
+ "grad_norm": 0.24393972754478455,
1310
+ "learning_rate": 9.503511150370727e-05,
1311
+ "loss": 0.014,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 14.724409448818898,
1316
+ "grad_norm": 0.16620883345603943,
1317
+ "learning_rate": 9.496303254224024e-05,
1318
+ "loss": 0.0124,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 14.803149606299213,
1323
+ "grad_norm": 0.19335606694221497,
1324
+ "learning_rate": 9.489046187225306e-05,
1325
+ "loss": 0.0116,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 14.881889763779528,
1330
+ "grad_norm": 0.17978379130363464,
1331
+ "learning_rate": 9.481740028736692e-05,
1332
+ "loss": 0.0127,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 14.960629921259843,
1337
+ "grad_norm": 0.15655072033405304,
1338
+ "learning_rate": 9.474384858657164e-05,
1339
+ "loss": 0.0112,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 15.039370078740157,
1344
+ "grad_norm": 0.13158245384693146,
1345
+ "learning_rate": 9.466980757421679e-05,
1346
+ "loss": 0.011,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 15.118110236220472,
1351
+ "grad_norm": 0.21858836710453033,
1352
+ "learning_rate": 9.459527806000305e-05,
1353
+ "loss": 0.0116,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 15.196850393700787,
1358
+ "grad_norm": 0.19522692263126373,
1359
+ "learning_rate": 9.452026085897325e-05,
1360
+ "loss": 0.0118,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 15.275590551181102,
1365
+ "grad_norm": 0.20890626311302185,
1366
+ "learning_rate": 9.444475679150348e-05,
1367
+ "loss": 0.0111,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 15.354330708661417,
1372
+ "grad_norm": 0.20746995508670807,
1373
+ "learning_rate": 9.436876668329411e-05,
1374
+ "loss": 0.0107,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 15.433070866141732,
1379
+ "grad_norm": 0.18878526985645294,
1380
+ "learning_rate": 9.429229136536079e-05,
1381
+ "loss": 0.0105,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 15.511811023622048,
1386
+ "grad_norm": 0.18786223232746124,
1387
+ "learning_rate": 9.421533167402534e-05,
1388
+ "loss": 0.0112,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 15.590551181102363,
1393
+ "grad_norm": 0.12698164582252502,
1394
+ "learning_rate": 9.413788845090666e-05,
1395
+ "loss": 0.011,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 15.669291338582678,
1400
+ "grad_norm": 0.22439169883728027,
1401
+ "learning_rate": 9.405996254291136e-05,
1402
+ "loss": 0.0113,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 15.748031496062993,
1407
+ "grad_norm": 0.16835476458072662,
1408
+ "learning_rate": 9.398155480222474e-05,
1409
+ "loss": 0.0111,
1410
+ "step": 2000
1411
+ }
1412
+ ],
1413
+ "logging_steps": 10,
1414
+ "max_steps": 10000,
1415
+ "num_input_tokens_seen": 0,
1416
+ "num_train_epochs": 79,
1417
+ "save_steps": 2000,
1418
+ "stateful_callbacks": {
1419
+ "TrainerControl": {
1420
+ "args": {
1421
+ "should_epoch_stop": false,
1422
+ "should_evaluate": false,
1423
+ "should_log": false,
1424
+ "should_save": true,
1425
+ "should_training_stop": false
1426
+ },
1427
+ "attributes": {}
1428
+ }
1429
+ },
1430
+ "total_flos": 0.0,
1431
+ "train_batch_size": 64,
1432
+ "trial_name": null,
1433
+ "trial_params": null
1434
+ }