thng292 commited on
Commit
2542363
·
verified ·
1 Parent(s): 8c70746

Upload folder using huggingface_hub

Browse files
20250501-1443/checkpoint-111/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e8ddbb7db75c7753a0072a9d206bc3fcfe6cca185cde5e346cb5fa101933ba
3
+ size 33824762
20250501-1443/checkpoint-111/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85d26d8feb40fa988797a0da10f54092b69129fc55181e59693208bd7fa9a889
3
+ size 368596590
20250501-1443/checkpoint-111/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:238523785dc8a6bd63d8f51ae72844d04988b9978d2050f234191a39fe7b1141
3
+ size 14244
20250501-1443/checkpoint-111/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aaf933d4fb1ea8eea938475ce05bd1a1bf3e571898db00f4d1ce0efa30350bb
3
+ size 1064
20250501-1443/checkpoint-111/trainer_state.json ADDED
@@ -0,0 +1,811 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9955156950672646,
6
+ "eval_steps": 200,
7
+ "global_step": 111,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008968609865470852,
14
+ "grad_norm": 2.960822105407715,
15
+ "learning_rate": 1e-05,
16
+ "loss": 2.8706,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.017937219730941704,
21
+ "grad_norm": 0.8830159306526184,
22
+ "learning_rate": 9.99799753559161e-06,
23
+ "loss": 1.3321,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.026905829596412557,
28
+ "grad_norm": 0.6235600113868713,
29
+ "learning_rate": 9.991991746311916e-06,
30
+ "loss": 1.286,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.03587443946188341,
35
+ "grad_norm": 3.6786465644836426,
36
+ "learning_rate": 9.981987442712634e-06,
37
+ "loss": 6.3481,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.04484304932735426,
42
+ "grad_norm": 0.9499123096466064,
43
+ "learning_rate": 9.967992638098517e-06,
44
+ "loss": 0.9874,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.053811659192825115,
49
+ "grad_norm": 0.5928323268890381,
50
+ "learning_rate": 9.950018542108818e-06,
51
+ "loss": 2.2348,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.06278026905829596,
56
+ "grad_norm": 0.5137901306152344,
57
+ "learning_rate": 9.928079551738542e-06,
58
+ "loss": 0.9355,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.07174887892376682,
63
+ "grad_norm": 0.9037736654281616,
64
+ "learning_rate": 9.902193239806634e-06,
65
+ "loss": 2.7805,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.08071748878923767,
70
+ "grad_norm": 0.44555988907814026,
71
+ "learning_rate": 9.872380340880416e-06,
72
+ "loss": 1.0094,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.08968609865470852,
77
+ "grad_norm": 0.5814350843429565,
78
+ "learning_rate": 9.838664734667496e-06,
79
+ "loss": 1.3459,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.09865470852017937,
84
+ "grad_norm": 3.1227192878723145,
85
+ "learning_rate": 9.801073426888447e-06,
86
+ "loss": 6.6014,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.10762331838565023,
91
+ "grad_norm": 1.7620548009872437,
92
+ "learning_rate": 9.759636527645633e-06,
93
+ "loss": 2.0171,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.11659192825112108,
98
+ "grad_norm": 1.4160568714141846,
99
+ "learning_rate": 9.714387227305422e-06,
100
+ "loss": 0.4961,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.12556053811659193,
105
+ "grad_norm": 2.736999750137329,
106
+ "learning_rate": 9.665361769913187e-06,
107
+ "loss": 6.0883,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.13452914798206278,
112
+ "grad_norm": 1.6954115629196167,
113
+ "learning_rate": 9.612599424162344e-06,
114
+ "loss": 2.2329,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.14349775784753363,
119
+ "grad_norm": 1.6500178575515747,
120
+ "learning_rate": 9.55614245194068e-06,
121
+ "loss": 2.9183,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.15246636771300448,
126
+ "grad_norm": 0.3353758454322815,
127
+ "learning_rate": 9.496036074479184e-06,
128
+ "loss": 0.9336,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.16143497757847533,
133
+ "grad_norm": 8.906839370727539,
134
+ "learning_rate": 9.432328436130493e-06,
135
+ "loss": 20.6856,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.17040358744394618,
140
+ "grad_norm": 0.25995615124702454,
141
+ "learning_rate": 9.365070565805941e-06,
142
+ "loss": 0.5657,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.17937219730941703,
147
+ "grad_norm": 0.4160774350166321,
148
+ "learning_rate": 9.294316336102132e-06,
149
+ "loss": 0.8398,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.18834080717488788,
154
+ "grad_norm": 1.9057613611221313,
155
+ "learning_rate": 9.220122420149753e-06,
156
+ "loss": 0.5355,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.19730941704035873,
161
+ "grad_norm": 7.734179496765137,
162
+ "learning_rate": 9.142548246219212e-06,
163
+ "loss": 13.5005,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.2062780269058296,
168
+ "grad_norm": 2.181833505630493,
169
+ "learning_rate": 9.06165595011943e-06,
170
+ "loss": 2.8525,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.21524663677130046,
175
+ "grad_norm": 0.7631401419639587,
176
+ "learning_rate": 8.97751032542795e-06,
177
+ "loss": 1.1342,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.2242152466367713,
182
+ "grad_norm": 0.3742104470729828,
183
+ "learning_rate": 8.890178771592198e-06,
184
+ "loss": 0.9673,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.23318385650224216,
189
+ "grad_norm": 0.39411431550979614,
190
+ "learning_rate": 8.799731239943488e-06,
191
+ "loss": 2.3545,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.242152466367713,
196
+ "grad_norm": 0.22178885340690613,
197
+ "learning_rate": 8.706240177667003e-06,
198
+ "loss": 0.5843,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.25112107623318386,
203
+ "grad_norm": 1.9868074655532837,
204
+ "learning_rate": 8.609780469772623e-06,
205
+ "loss": 2.832,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.2600896860986547,
210
+ "grad_norm": 1.152679443359375,
211
+ "learning_rate": 8.510429379113114e-06,
212
+ "loss": 1.091,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.26905829596412556,
217
+ "grad_norm": 3.0194895267486572,
218
+ "learning_rate": 8.408266484497664e-06,
219
+ "loss": 6.3736,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.27802690582959644,
224
+ "grad_norm": 0.9038723707199097,
225
+ "learning_rate": 8.303373616950408e-06,
226
+ "loss": 0.5556,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.28699551569506726,
231
+ "grad_norm": 1.5291337966918945,
232
+ "learning_rate": 8.195834794164925e-06,
233
+ "loss": 0.7135,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.29596412556053814,
238
+ "grad_norm": 2.6512932777404785,
239
+ "learning_rate": 8.085736153207277e-06,
240
+ "loss": 3.171,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.30493273542600896,
245
+ "grad_norm": 1.5600694417953491,
246
+ "learning_rate": 7.973165881521435e-06,
247
+ "loss": 1.4656,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.31390134529147984,
252
+ "grad_norm": 3.086857557296753,
253
+ "learning_rate": 7.858214146292394e-06,
254
+ "loss": 6.4496,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.32286995515695066,
259
+ "grad_norm": 0.5738726258277893,
260
+ "learning_rate": 7.74097302222355e-06,
261
+ "loss": 1.0392,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.33183856502242154,
266
+ "grad_norm": 0.6252602934837341,
267
+ "learning_rate": 7.621536417786159e-06,
268
+ "loss": 2.0603,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.34080717488789236,
273
+ "grad_norm": 1.3429893255233765,
274
+ "learning_rate": 7.500000000000001e-06,
275
+ "loss": 1.9293,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.34977578475336324,
280
+ "grad_norm": 4.64713716506958,
281
+ "learning_rate": 7.37646111780545e-06,
282
+ "loss": 10.205,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.35874439461883406,
287
+ "grad_norm": 1.8087923526763916,
288
+ "learning_rate": 7.251018724088367e-06,
289
+ "loss": 4.4365,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.36771300448430494,
294
+ "grad_norm": 7.087998390197754,
295
+ "learning_rate": 7.12377329642024e-06,
296
+ "loss": 13.9087,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.37668161434977576,
301
+ "grad_norm": 0.43495556712150574,
302
+ "learning_rate": 6.994826756577082e-06,
303
+ "loss": 1.1252,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.38565022421524664,
308
+ "grad_norm": 0.846789538860321,
309
+ "learning_rate": 6.864282388901544e-06,
310
+ "loss": 3.3436,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.39461883408071746,
315
+ "grad_norm": 2.1694114208221436,
316
+ "learning_rate": 6.732244757573619e-06,
317
+ "loss": 5.1797,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.40358744394618834,
322
+ "grad_norm": 1.503968358039856,
323
+ "learning_rate": 6.598819622856227e-06,
324
+ "loss": 1.6628,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.4125560538116592,
329
+ "grad_norm": 0.7890214323997498,
330
+ "learning_rate": 6.464113856382752e-06,
331
+ "loss": 1.7293,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.42152466367713004,
336
+ "grad_norm": 7.634031772613525,
337
+ "learning_rate": 6.328235355554382e-06,
338
+ "loss": 18.4625,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.4304932735426009,
343
+ "grad_norm": 0.4779037535190582,
344
+ "learning_rate": 6.191292957115825e-06,
345
+ "loss": 1.503,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.43946188340807174,
350
+ "grad_norm": 1.601840615272522,
351
+ "learning_rate": 6.053396349978632e-06,
352
+ "loss": 1.3566,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.4484304932735426,
357
+ "grad_norm": 0.35992446541786194,
358
+ "learning_rate": 5.914655987361934e-06,
359
+ "loss": 1.2768,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.45739910313901344,
364
+ "grad_norm": 5.177908420562744,
365
+ "learning_rate": 5.77518299832099e-06,
366
+ "loss": 8.8561,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.4663677130044843,
371
+ "grad_norm": 1.3006104230880737,
372
+ "learning_rate": 5.635089098734394e-06,
373
+ "loss": 0.5305,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.47533632286995514,
378
+ "grad_norm": 7.080929279327393,
379
+ "learning_rate": 5.49448650182125e-06,
380
+ "loss": 20.9515,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.484304932735426,
385
+ "grad_norm": 0.9084333777427673,
386
+ "learning_rate": 5.353487828259973e-06,
387
+ "loss": 1.2171,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.49327354260089684,
392
+ "grad_norm": 9.15694808959961,
393
+ "learning_rate": 5.212206015980742e-06,
394
+ "loss": 21.9408,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.5022421524663677,
399
+ "grad_norm": 1.3177266120910645,
400
+ "learning_rate": 5.070754229703811e-06,
401
+ "loss": 2.9937,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.5112107623318386,
406
+ "grad_norm": 1.0787602663040161,
407
+ "learning_rate": 4.929245770296191e-06,
408
+ "loss": 0.6544,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.5201793721973094,
413
+ "grad_norm": 1.7759263515472412,
414
+ "learning_rate": 4.78779398401926e-06,
415
+ "loss": 0.5922,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.5291479820627802,
420
+ "grad_norm": 1.2032181024551392,
421
+ "learning_rate": 4.646512171740028e-06,
422
+ "loss": 2.3146,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.5381165919282511,
427
+ "grad_norm": 1.228383183479309,
428
+ "learning_rate": 4.505513498178752e-06,
429
+ "loss": 0.5068,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.547085201793722,
434
+ "grad_norm": 2.082943916320801,
435
+ "learning_rate": 4.364910901265607e-06,
436
+ "loss": 0.6455,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.5560538116591929,
441
+ "grad_norm": 5.729760646820068,
442
+ "learning_rate": 4.224817001679011e-06,
443
+ "loss": 13.7731,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.5650224215246636,
448
+ "grad_norm": 0.9803327918052673,
449
+ "learning_rate": 4.085344012638067e-06,
450
+ "loss": 0.6381,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.5739910313901345,
455
+ "grad_norm": 5.145959377288818,
456
+ "learning_rate": 3.94660365002137e-06,
457
+ "loss": 6.1626,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.5829596412556054,
462
+ "grad_norm": 1.8556267023086548,
463
+ "learning_rate": 3.808707042884176e-06,
464
+ "loss": 0.507,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.5919282511210763,
469
+ "grad_norm": 4.164766788482666,
470
+ "learning_rate": 3.6717646444456196e-06,
471
+ "loss": 7.3431,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.600896860986547,
476
+ "grad_norm": 0.5073394179344177,
477
+ "learning_rate": 3.5358861436172487e-06,
478
+ "loss": 1.4834,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.6098654708520179,
483
+ "grad_norm": 5.046154975891113,
484
+ "learning_rate": 3.401180377143774e-06,
485
+ "loss": 8.1017,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.6188340807174888,
490
+ "grad_norm": 2.9860904216766357,
491
+ "learning_rate": 3.2677552424263836e-06,
492
+ "loss": 4.9051,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.6278026905829597,
497
+ "grad_norm": 1.433781385421753,
498
+ "learning_rate": 3.1357176110984578e-06,
499
+ "loss": 0.752,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.6367713004484304,
504
+ "grad_norm": 1.624589443206787,
505
+ "learning_rate": 3.0051732434229185e-06,
506
+ "loss": 0.7641,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.6457399103139013,
511
+ "grad_norm": 0.9126272797584534,
512
+ "learning_rate": 2.8762267035797607e-06,
513
+ "loss": 1.248,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.6547085201793722,
518
+ "grad_norm": 0.49890244007110596,
519
+ "learning_rate": 2.748981275911633e-06,
520
+ "loss": 0.6581,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.6636771300448431,
525
+ "grad_norm": 1.4649814367294312,
526
+ "learning_rate": 2.6235388821945497e-06,
527
+ "loss": 3.489,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.672645739910314,
532
+ "grad_norm": 1.5615613460540771,
533
+ "learning_rate": 2.5000000000000015e-06,
534
+ "loss": 1.8816,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.6816143497757847,
539
+ "grad_norm": 1.2162355184555054,
540
+ "learning_rate": 2.3784635822138424e-06,
541
+ "loss": 0.7183,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.6905829596412556,
546
+ "grad_norm": 3.3642117977142334,
547
+ "learning_rate": 2.2590269777764516e-06,
548
+ "loss": 8.284,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.6995515695067265,
553
+ "grad_norm": 2.1293866634368896,
554
+ "learning_rate": 2.141785853707607e-06,
555
+ "loss": 3.4749,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.7085201793721974,
560
+ "grad_norm": 8.28476619720459,
561
+ "learning_rate": 2.0268341184785674e-06,
562
+ "loss": 18.1382,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.7174887892376681,
567
+ "grad_norm": 1.8235303163528442,
568
+ "learning_rate": 1.9142638467927254e-06,
569
+ "loss": 3.4227,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.726457399103139,
574
+ "grad_norm": 2.0121238231658936,
575
+ "learning_rate": 1.8041652058350768e-06,
576
+ "loss": 0.7364,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.7354260089686099,
581
+ "grad_norm": 1.3227595090866089,
582
+ "learning_rate": 1.6966263830495939e-06,
583
+ "loss": 0.6529,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.7443946188340808,
588
+ "grad_norm": 0.40294215083122253,
589
+ "learning_rate": 1.5917335155023368e-06,
590
+ "loss": 0.5894,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.7533632286995515,
595
+ "grad_norm": 4.5333099365234375,
596
+ "learning_rate": 1.4895706208868876e-06,
597
+ "loss": 6.954,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.7623318385650224,
602
+ "grad_norm": 0.9095823764801025,
603
+ "learning_rate": 1.390219530227378e-06,
604
+ "loss": 0.8511,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.7713004484304933,
609
+ "grad_norm": 2.197908401489258,
610
+ "learning_rate": 1.2937598223330006e-06,
611
+ "loss": 5.0122,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.7802690582959642,
616
+ "grad_norm": 2.3578813076019287,
617
+ "learning_rate": 1.2002687600565138e-06,
618
+ "loss": 0.6863,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.7892376681614349,
623
+ "grad_norm": 0.6472300291061401,
624
+ "learning_rate": 1.1098212284078037e-06,
625
+ "loss": 0.6623,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.7982062780269058,
630
+ "grad_norm": 3.915199041366577,
631
+ "learning_rate": 1.0224896745720513e-06,
632
+ "loss": 5.9304,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.8071748878923767,
637
+ "grad_norm": 1.4207231998443604,
638
+ "learning_rate": 9.383440498805712e-07,
639
+ "loss": 1.4954,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.8161434977578476,
644
+ "grad_norm": 0.8446305990219116,
645
+ "learning_rate": 8.574517537807897e-07,
646
+ "loss": 2.0417,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.8251121076233184,
651
+ "grad_norm": 5.90231990814209,
652
+ "learning_rate": 7.798775798502484e-07,
653
+ "loss": 15.1191,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.8340807174887892,
658
+ "grad_norm": 5.121821403503418,
659
+ "learning_rate": 7.056836638978698e-07,
660
+ "loss": 7.9447,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.8430493273542601,
665
+ "grad_norm": 4.37883186340332,
666
+ "learning_rate": 6.349294341940593e-07,
667
+ "loss": 10.4594,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.852017937219731,
672
+ "grad_norm": 4.789362907409668,
673
+ "learning_rate": 5.676715638695063e-07,
674
+ "loss": 9.3101,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.8609865470852018,
679
+ "grad_norm": 2.2993826866149902,
680
+ "learning_rate": 5.039639255208156e-07,
681
+ "loss": 3.6573,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.8699551569506726,
686
+ "grad_norm": 0.9291459918022156,
687
+ "learning_rate": 4.43857548059321e-07,
688
+ "loss": 0.7949,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.8789237668161435,
693
+ "grad_norm": 1.2445647716522217,
694
+ "learning_rate": 3.87400575837657e-07,
695
+ "loss": 2.0625,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.8878923766816144,
700
+ "grad_norm": 4.5968337059021,
701
+ "learning_rate": 3.346382300868134e-07,
702
+ "loss": 8.252,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.8968609865470852,
707
+ "grad_norm": 2.6838462352752686,
708
+ "learning_rate": 2.85612772694579e-07,
709
+ "loss": 3.5332,
710
+ "step": 100
711
+ },
712
+ {
713
+ "epoch": 0.905829596412556,
714
+ "grad_norm": 2.564300298690796,
715
+ "learning_rate": 2.403634723543674e-07,
716
+ "loss": 3.3899,
717
+ "step": 101
718
+ },
719
+ {
720
+ "epoch": 0.9147982062780269,
721
+ "grad_norm": 1.2007478475570679,
722
+ "learning_rate": 1.989265731115525e-07,
723
+ "loss": 0.6646,
724
+ "step": 102
725
+ },
726
+ {
727
+ "epoch": 0.9237668161434978,
728
+ "grad_norm": 0.49647316336631775,
729
+ "learning_rate": 1.6133526533250566e-07,
730
+ "loss": 0.8976,
731
+ "step": 103
732
+ },
733
+ {
734
+ "epoch": 0.9327354260089686,
735
+ "grad_norm": 7.291123867034912,
736
+ "learning_rate": 1.2761965911958385e-07,
737
+ "loss": 18.4972,
738
+ "step": 104
739
+ },
740
+ {
741
+ "epoch": 0.9417040358744395,
742
+ "grad_norm": 0.898957371711731,
743
+ "learning_rate": 9.780676019336632e-08,
744
+ "loss": 1.1393,
745
+ "step": 105
746
+ },
747
+ {
748
+ "epoch": 0.9506726457399103,
749
+ "grad_norm": 3.6352906227111816,
750
+ "learning_rate": 7.192044826145772e-08,
751
+ "loss": 5.9557,
752
+ "step": 106
753
+ },
754
+ {
755
+ "epoch": 0.9596412556053812,
756
+ "grad_norm": 2.3766579627990723,
757
+ "learning_rate": 4.998145789118114e-08,
758
+ "loss": 0.6812,
759
+ "step": 107
760
+ },
761
+ {
762
+ "epoch": 0.968609865470852,
763
+ "grad_norm": 0.5174874067306519,
764
+ "learning_rate": 3.2007361901485455e-08,
765
+ "loss": 0.7247,
766
+ "step": 108
767
+ },
768
+ {
769
+ "epoch": 0.9775784753363229,
770
+ "grad_norm": 2.757059335708618,
771
+ "learning_rate": 1.8012557287367394e-08,
772
+ "loss": 5.8355,
773
+ "step": 109
774
+ },
775
+ {
776
+ "epoch": 0.9865470852017937,
777
+ "grad_norm": 0.954919159412384,
778
+ "learning_rate": 8.008253688084888e-09,
779
+ "loss": 3.1317,
780
+ "step": 110
781
+ },
782
+ {
783
+ "epoch": 0.9955156950672646,
784
+ "grad_norm": 3.348649740219116,
785
+ "learning_rate": 2.002464408392135e-09,
786
+ "loss": 7.5343,
787
+ "step": 111
788
+ }
789
+ ],
790
+ "logging_steps": 1,
791
+ "max_steps": 111,
792
+ "num_input_tokens_seen": 0,
793
+ "num_train_epochs": 1,
794
+ "save_steps": 200,
795
+ "stateful_callbacks": {
796
+ "TrainerControl": {
797
+ "args": {
798
+ "should_epoch_stop": false,
799
+ "should_evaluate": false,
800
+ "should_log": false,
801
+ "should_save": true,
802
+ "should_training_stop": true
803
+ },
804
+ "attributes": {}
805
+ }
806
+ },
807
+ "total_flos": 0.0,
808
+ "train_batch_size": 24,
809
+ "trial_name": null,
810
+ "trial_params": null
811
+ }
20250501-1443/checkpoint-111/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f557942d59ec98655c05adb4d5496eb4bbb1842077eb2c1f22a9459b51d268e
3
+ size 5304
20250501-1443/runs/May01_14-43-04_d55355693cce/events.out.tfevents.1746110584.d55355693cce.190.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cbc3140f84a0db89e905b1f64a8f712492d091d7ad5ed4140c57af7cddf232e
3
+ size 27538