rgb255 committed on
Commit
56a0d37
·
verified ·
1 Parent(s): 0fb96f9

Upload LoRA adapter (Fixed README metadata)

Browse files
README.md CHANGED
@@ -39,8 +39,8 @@ while intermediate reasoning (Chain-of-Thought) is masked.
39
  - Method: QLoRA (4-bit)
40
  - Max sequence length: 512
41
  - Epochs: 1
42
- - Learning rate: 1e-04
43
- - LoRA: r=32, alpha=64
44
 
45
  ## Usage
46
 
 
39
  - Method: QLoRA (4-bit)
40
  - Max sequence length: 512
41
  - Epochs: 1
42
+ - Learning rate: 1e-06
43
+ - LoRA: r=80, alpha=160
44
 
45
  ## Usage
46
 
adapter_config.json CHANGED
@@ -20,7 +20,7 @@
20
  "layers_pattern": null,
21
  "layers_to_transform": null,
22
  "loftq_config": {},
23
- "lora_alpha": 64,
24
  "lora_bias": false,
25
  "lora_dropout": 0.0,
26
  "megatron_config": null,
@@ -29,17 +29,17 @@
29
  "peft_type": "LORA",
30
  "peft_version": "0.18.1",
31
  "qalora_group_size": 16,
32
- "r": 32,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "up_proj",
37
  "v_proj",
38
- "o_proj",
39
- "gate_proj",
40
- "k_proj",
41
  "down_proj",
42
- "q_proj"
 
 
 
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
 
20
  "layers_pattern": null,
21
  "layers_to_transform": null,
22
  "loftq_config": {},
23
+ "lora_alpha": 160,
24
  "lora_bias": false,
25
  "lora_dropout": 0.0,
26
  "megatron_config": null,
 
29
  "peft_type": "LORA",
30
  "peft_version": "0.18.1",
31
  "qalora_group_size": 16,
32
+ "r": 80,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
36
  "v_proj",
37
+ "up_proj",
 
 
38
  "down_proj",
39
+ "q_proj",
40
+ "k_proj",
41
+ "gate_proj",
42
+ "o_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87158f985ea3df9cecafe79a9eb0cc460b9ace2f5ebdd889befd0a6711811ed9
3
- size 264308896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32d951a8a18e0ae23940e93375640002eeb176d147e4512c2933c571edd1c141
3
+ size 660670872
all_experiments_details.json CHANGED
@@ -13,8 +13,8 @@
13
  "SEED": 3407,
14
  "VAL_RATIO": 0.05,
15
  "MAX_SEQ_LEN": 512,
16
- "LORA_R": 32,
17
- "LORA_ALPHA": 64,
18
  "LORA_DROPOUT": 0.0,
19
  "LORA_TARGET_MODULES": [
20
  "q_proj",
@@ -29,7 +29,7 @@
29
  "PER_DEVICE_TRAIN_BS": 2,
30
  "PER_DEVICE_EVAL_BS": 2,
31
  "GRAD_ACCUM": 8,
32
- "LR": 0.0001,
33
  "WARMUP_RATIO": 0.1,
34
  "WEIGHT_DECAY": 0.05,
35
  "MAX_STEPS": -1,
@@ -53,945 +53,945 @@
53
  },
54
  "history": [
55
  {
56
- "loss": 1.9109,
57
- "grad_norm": 3.2909586429595947,
58
- "learning_rate": 8.181818181818183e-06,
59
  "epoch": 0.009169054441260744,
60
  "step": 10
61
  },
62
  {
63
- "loss": 1.5949,
64
- "grad_norm": 0.8696144223213196,
65
- "learning_rate": 1.7272727272727274e-05,
66
  "epoch": 0.01833810888252149,
67
  "step": 20
68
  },
69
  {
70
- "loss": 1.4542,
71
- "grad_norm": 0.41898515820503235,
72
- "learning_rate": 2.636363636363636e-05,
73
  "epoch": 0.027507163323782235,
74
  "step": 30
75
  },
76
  {
77
- "loss": 1.241,
78
- "grad_norm": 0.42209118604660034,
79
- "learning_rate": 3.545454545454546e-05,
80
  "epoch": 0.03667621776504298,
81
  "step": 40
82
  },
83
  {
84
- "loss": 1.3934,
85
- "grad_norm": 0.7735824584960938,
86
- "learning_rate": 4.454545454545455e-05,
87
  "epoch": 0.045845272206303724,
88
  "step": 50
89
  },
90
  {
91
- "eval_loss": 1.1486613750457764,
92
- "eval_runtime": 157.2295,
93
- "eval_samples_per_second": 5.902,
94
- "eval_steps_per_second": 2.951,
95
  "epoch": 0.045845272206303724,
96
  "step": 50
97
  },
98
  {
99
- "loss": 1.1159,
100
- "grad_norm": 0.4702383875846863,
101
- "learning_rate": 5.363636363636364e-05,
102
  "epoch": 0.05501432664756447,
103
  "step": 60
104
  },
105
  {
106
- "loss": 1.0729,
107
- "grad_norm": 0.4587116539478302,
108
- "learning_rate": 6.272727272727273e-05,
109
  "epoch": 0.06418338108882521,
110
  "step": 70
111
  },
112
  {
113
- "loss": 1.049,
114
- "grad_norm": 0.5562591552734375,
115
- "learning_rate": 7.181818181818182e-05,
116
  "epoch": 0.07335243553008595,
117
  "step": 80
118
  },
119
  {
120
- "loss": 0.9021,
121
- "grad_norm": 0.6512398719787598,
122
- "learning_rate": 8.090909090909092e-05,
123
  "epoch": 0.0825214899713467,
124
  "step": 90
125
  },
126
  {
127
- "loss": 0.9578,
128
- "grad_norm": 0.6464130878448486,
129
- "learning_rate": 9e-05,
130
  "epoch": 0.09169054441260745,
131
  "step": 100
132
  },
133
  {
134
- "eval_loss": 0.8903455138206482,
135
- "eval_runtime": 149.8861,
136
- "eval_samples_per_second": 6.191,
137
- "eval_steps_per_second": 3.096,
138
  "epoch": 0.09169054441260745,
139
  "step": 100
140
  },
141
  {
142
- "loss": 0.9056,
143
- "grad_norm": 0.5816523432731628,
144
- "learning_rate": 9.909090909090911e-05,
145
  "epoch": 0.1008595988538682,
146
  "step": 110
147
  },
148
  {
149
- "loss": 0.8961,
150
- "grad_norm": 0.6272807121276855,
151
- "learning_rate": 9.997923381619256e-05,
152
  "epoch": 0.11002865329512894,
153
  "step": 120
154
  },
155
  {
156
- "loss": 0.8718,
157
- "grad_norm": 0.3907865881919861,
158
- "learning_rate": 9.990747162241872e-05,
159
  "epoch": 0.11919770773638969,
160
  "step": 130
161
  },
162
  {
163
- "loss": 0.8814,
164
- "grad_norm": 0.4261883497238159,
165
- "learning_rate": 9.978453061876696e-05,
166
  "epoch": 0.12836676217765042,
167
  "step": 140
168
  },
169
  {
170
- "loss": 0.9294,
171
- "grad_norm": 0.3960123658180237,
172
- "learning_rate": 9.96105368780285e-05,
173
  "epoch": 0.13753581661891118,
174
  "step": 150
175
  },
176
  {
177
- "eval_loss": 0.8402041792869568,
178
- "eval_runtime": 153.7211,
179
- "eval_samples_per_second": 6.037,
180
- "eval_steps_per_second": 3.018,
181
  "epoch": 0.13753581661891118,
182
  "step": 150
183
  },
184
  {
185
- "loss": 0.8915,
186
- "grad_norm": 0.45985275506973267,
187
- "learning_rate": 9.938566882624436e-05,
188
  "epoch": 0.1467048710601719,
189
  "step": 160
190
  },
191
  {
192
- "loss": 0.8776,
193
- "grad_norm": 0.4007140100002289,
194
- "learning_rate": 9.9110157059734e-05,
195
  "epoch": 0.15587392550143267,
196
  "step": 170
197
  },
198
  {
199
- "loss": 0.8804,
200
- "grad_norm": 0.5031015872955322,
201
- "learning_rate": 9.878428410862483e-05,
202
  "epoch": 0.1650429799426934,
203
  "step": 180
204
  },
205
  {
206
- "loss": 0.8733,
207
- "grad_norm": 0.3346236050128937,
208
- "learning_rate": 9.840838414712501e-05,
209
  "epoch": 0.17421203438395416,
210
  "step": 190
211
  },
212
  {
213
- "loss": 0.7578,
214
- "grad_norm": 0.37552139163017273,
215
- "learning_rate": 9.798284265083642e-05,
216
  "epoch": 0.1833810888252149,
217
  "step": 200
218
  },
219
  {
220
- "eval_loss": 0.8151949048042297,
221
- "eval_runtime": 147.771,
222
- "eval_samples_per_second": 6.28,
223
- "eval_steps_per_second": 3.14,
224
  "epoch": 0.1833810888252149,
225
  "step": 200
226
  },
227
  {
228
- "loss": 0.8844,
229
- "grad_norm": 0.3008505403995514,
230
- "learning_rate": 9.750809600145954e-05,
231
  "epoch": 0.19255014326647565,
232
  "step": 210
233
  },
234
  {
235
- "loss": 0.806,
236
- "grad_norm": 0.40879538655281067,
237
- "learning_rate": 9.698463103929542e-05,
238
  "epoch": 0.2017191977077364,
239
  "step": 220
240
  },
241
  {
242
- "loss": 0.757,
243
- "grad_norm": 0.2864895164966583,
244
- "learning_rate": 9.641298456400363e-05,
245
  "epoch": 0.21088825214899715,
246
  "step": 230
247
  },
248
  {
249
- "loss": 0.8971,
250
- "grad_norm": 0.4410439431667328,
251
- "learning_rate": 9.579374278412819e-05,
252
  "epoch": 0.22005730659025788,
253
  "step": 240
254
  },
255
  {
256
- "loss": 0.8188,
257
- "grad_norm": 0.3572053611278534,
258
- "learning_rate": 9.512754071595605e-05,
259
  "epoch": 0.22922636103151864,
260
  "step": 250
261
  },
262
  {
263
- "eval_loss": 0.8022013902664185,
264
- "eval_runtime": 149.5554,
265
- "eval_samples_per_second": 6.205,
266
- "eval_steps_per_second": 3.103,
267
  "epoch": 0.22922636103151864,
268
  "step": 250
269
  },
270
  {
271
- "loss": 0.8794,
272
- "grad_norm": 0.3161059617996216,
273
- "learning_rate": 9.441506153232443e-05,
274
  "epoch": 0.23839541547277937,
275
  "step": 260
276
  },
277
  {
278
- "loss": 0.7863,
279
- "grad_norm": 0.3244767487049103,
280
- "learning_rate": 9.365703586204496e-05,
281
  "epoch": 0.2475644699140401,
282
  "step": 270
283
  },
284
  {
285
- "loss": 0.9084,
286
- "grad_norm": 0.40793028473854065,
287
- "learning_rate": 9.285424104066276e-05,
288
  "epoch": 0.25673352435530084,
289
  "step": 280
290
  },
291
  {
292
- "loss": 0.8005,
293
- "grad_norm": 0.2878340184688568,
294
- "learning_rate": 9.20075003133193e-05,
295
  "epoch": 0.2659025787965616,
296
  "step": 290
297
  },
298
  {
299
- "loss": 0.8717,
300
- "grad_norm": 0.42775702476501465,
301
- "learning_rate": 9.111768199053588e-05,
302
  "epoch": 0.27507163323782235,
303
  "step": 300
304
  },
305
  {
306
- "eval_loss": 0.7930753231048584,
307
- "eval_runtime": 159.9775,
308
- "eval_samples_per_second": 5.801,
309
- "eval_steps_per_second": 2.9,
310
  "epoch": 0.27507163323782235,
311
  "step": 300
312
  },
313
  {
314
- "loss": 0.9456,
315
- "grad_norm": 0.44111281633377075,
316
- "learning_rate": 9.018569855778383e-05,
317
  "epoch": 0.2842406876790831,
318
  "step": 310
319
  },
320
  {
321
- "loss": 0.8296,
322
- "grad_norm": 0.29679298400878906,
323
- "learning_rate": 8.921250573975456e-05,
324
  "epoch": 0.2934097421203438,
325
  "step": 320
326
  },
327
  {
328
- "loss": 0.7403,
329
- "grad_norm": 0.3140794634819031,
330
- "learning_rate": 8.819910152028872e-05,
331
  "epoch": 0.3025787965616046,
332
  "step": 330
333
  },
334
  {
335
- "loss": 0.8083,
336
- "grad_norm": 0.2960476279258728,
337
- "learning_rate": 8.714652511896994e-05,
338
  "epoch": 0.31174785100286534,
339
  "step": 340
340
  },
341
  {
342
- "loss": 0.7474,
343
- "grad_norm": 0.5192521214485168,
344
- "learning_rate": 8.605585592543212e-05,
345
  "epoch": 0.3209169054441261,
346
  "step": 350
347
  },
348
  {
349
- "eval_loss": 0.7890114784240723,
350
- "eval_runtime": 156.9758,
351
- "eval_samples_per_second": 5.912,
352
- "eval_steps_per_second": 2.956,
353
  "epoch": 0.3209169054441261,
354
  "step": 350
355
  },
356
  {
357
- "loss": 0.8881,
358
- "grad_norm": 0.26124686002731323,
359
- "learning_rate": 8.492821239247364e-05,
360
  "epoch": 0.3300859598853868,
361
  "step": 360
362
  },
363
  {
364
- "loss": 0.9664,
365
- "grad_norm": 1.590624213218689,
366
- "learning_rate": 8.376475088911317e-05,
367
  "epoch": 0.33925501432664756,
368
  "step": 370
369
  },
370
  {
371
- "loss": 0.7176,
372
- "grad_norm": 0.3093242943286896,
373
- "learning_rate": 8.256666451476337e-05,
374
  "epoch": 0.3484240687679083,
375
  "step": 380
376
  },
377
  {
378
- "loss": 0.9026,
379
- "grad_norm": 0.5042882561683655,
380
- "learning_rate": 8.133518187573862e-05,
381
  "epoch": 0.35759312320916903,
382
  "step": 390
383
  },
384
  {
385
- "loss": 0.7752,
386
- "grad_norm": 0.3653980791568756,
387
- "learning_rate": 8.007156582535131e-05,
388
  "epoch": 0.3667621776504298,
389
  "step": 400
390
  },
391
  {
392
- "eval_loss": 0.7844048738479614,
393
- "eval_runtime": 149.3433,
394
- "eval_samples_per_second": 6.214,
395
- "eval_steps_per_second": 3.107,
396
  "epoch": 0.3667621776504298,
397
  "step": 400
398
  },
399
  {
400
- "loss": 0.8571,
401
- "grad_norm": 0.23480232059955597,
402
- "learning_rate": 7.877711216888867e-05,
403
  "epoch": 0.37593123209169055,
404
  "step": 410
405
  },
406
  {
407
- "loss": 0.769,
408
- "grad_norm": 0.34751203656196594,
409
- "learning_rate": 7.745314833479833e-05,
410
  "epoch": 0.3851002865329513,
411
  "step": 420
412
  },
413
  {
414
- "loss": 0.7989,
415
- "grad_norm": 0.2936677932739258,
416
- "learning_rate": 7.6101032013445e-05,
417
  "epoch": 0.394269340974212,
418
  "step": 430
419
  },
420
  {
421
- "loss": 0.789,
422
- "grad_norm": 0.3774430751800537,
423
- "learning_rate": 7.472214976483452e-05,
424
  "epoch": 0.4034383954154728,
425
  "step": 440
426
  },
427
  {
428
- "loss": 0.7272,
429
- "grad_norm": 0.3077329099178314,
430
- "learning_rate": 7.33179155967327e-05,
431
  "epoch": 0.41260744985673353,
432
  "step": 450
433
  },
434
  {
435
- "eval_loss": 0.7796526551246643,
436
- "eval_runtime": 152.2118,
437
- "eval_samples_per_second": 6.097,
438
- "eval_steps_per_second": 3.048,
439
  "epoch": 0.41260744985673353,
440
  "step": 450
441
  },
442
  {
443
- "loss": 0.7975,
444
- "grad_norm": 0.2882915437221527,
445
- "learning_rate": 7.188976951463723e-05,
446
  "epoch": 0.4217765042979943,
447
  "step": 460
448
  },
449
  {
450
- "loss": 0.9331,
451
- "grad_norm": 0.26507413387298584,
452
- "learning_rate": 7.043917604508971e-05,
453
  "epoch": 0.430945558739255,
454
  "step": 470
455
  },
456
  {
457
- "loss": 0.8037,
458
- "grad_norm": 0.34291401505470276,
459
- "learning_rate": 6.896762273384178e-05,
460
  "epoch": 0.44011461318051576,
461
  "step": 480
462
  },
463
  {
464
- "loss": 0.7612,
465
- "grad_norm": 0.43661966919898987,
466
- "learning_rate": 6.747661862041585e-05,
467
  "epoch": 0.4492836676217765,
468
  "step": 490
469
  },
470
  {
471
- "loss": 0.9427,
472
- "grad_norm": 0.37725234031677246,
473
- "learning_rate": 6.596769269062444e-05,
474
  "epoch": 0.4584527220630373,
475
  "step": 500
476
  },
477
  {
478
- "eval_loss": 0.774621307849884,
479
- "eval_runtime": 154.4593,
480
- "eval_samples_per_second": 6.008,
481
- "eval_steps_per_second": 3.004,
482
  "epoch": 0.4584527220630373,
483
  "step": 500
484
  },
485
  {
486
- "loss": 0.8304,
487
- "grad_norm": 0.30780285596847534,
488
- "learning_rate": 6.444239230863504e-05,
489
  "epoch": 0.467621776504298,
490
  "step": 510
491
  },
492
  {
493
- "loss": 0.715,
494
- "grad_norm": 0.27012601494789124,
495
- "learning_rate": 6.290228163018868e-05,
496
  "epoch": 0.47679083094555874,
497
  "step": 520
498
  },
499
  {
500
- "loss": 0.7056,
501
- "grad_norm": 0.2883375585079193,
502
- "learning_rate": 6.134893999859887e-05,
503
  "epoch": 0.4859598853868195,
504
  "step": 530
505
  },
506
  {
507
- "loss": 0.8827,
508
- "grad_norm": 0.3120364248752594,
509
- "learning_rate": 5.97839603251764e-05,
510
  "epoch": 0.4951289398280802,
511
  "step": 540
512
  },
513
  {
514
- "loss": 0.7032,
515
- "grad_norm": 0.2870006859302521,
516
- "learning_rate": 5.820894745574025e-05,
517
  "epoch": 0.504297994269341,
518
  "step": 550
519
  },
520
  {
521
- "eval_loss": 0.7724801898002625,
522
- "eval_runtime": 154.0554,
523
- "eval_samples_per_second": 6.024,
524
- "eval_steps_per_second": 3.012,
525
  "epoch": 0.504297994269341,
526
  "step": 550
527
  },
528
  {
529
- "loss": 0.7996,
530
- "grad_norm": 0.2602643072605133,
531
- "learning_rate": 5.662551652489009e-05,
532
  "epoch": 0.5134670487106017,
533
  "step": 560
534
  },
535
  {
536
- "loss": 0.7694,
537
- "grad_norm": 0.43268686532974243,
538
- "learning_rate": 5.503529129972792e-05,
539
  "epoch": 0.5226361031518625,
540
  "step": 570
541
  },
542
  {
543
- "loss": 0.7802,
544
- "grad_norm": 0.3927740156650543,
545
- "learning_rate": 5.34399025147273e-05,
546
  "epoch": 0.5318051575931232,
547
  "step": 580
548
  },
549
  {
550
- "loss": 0.7649,
551
- "grad_norm": 0.3333654999732971,
552
- "learning_rate": 5.1840986199457606e-05,
553
  "epoch": 0.540974212034384,
554
  "step": 590
555
  },
556
  {
557
- "loss": 0.8586,
558
- "grad_norm": 0.29149171710014343,
559
- "learning_rate": 5.024018200087855e-05,
560
  "epoch": 0.5501432664756447,
561
  "step": 600
562
  },
563
  {
564
- "eval_loss": 0.7711001038551331,
565
- "eval_runtime": 160.1497,
566
- "eval_samples_per_second": 5.795,
567
- "eval_steps_per_second": 2.897,
568
  "epoch": 0.5501432664756447,
569
  "step": 600
570
  },
571
  {
572
- "loss": 0.8023,
573
- "grad_norm": 0.48755213618278503,
574
- "learning_rate": 4.863913150192481e-05,
575
  "epoch": 0.5593123209169054,
576
  "step": 610
577
  },
578
  {
579
- "loss": 0.737,
580
- "grad_norm": 0.290272057056427,
581
- "learning_rate": 4.703947653810575e-05,
582
  "epoch": 0.5684813753581662,
583
  "step": 620
584
  },
585
  {
586
- "loss": 0.8758,
587
- "grad_norm": 0.2664808928966522,
588
- "learning_rate": 4.544285751384584e-05,
589
  "epoch": 0.5776504297994269,
590
  "step": 630
591
  },
592
  {
593
- "loss": 0.8121,
594
- "grad_norm": 0.7378506660461426,
595
- "learning_rate": 4.3850911720292756e-05,
596
  "epoch": 0.5868194842406876,
597
  "step": 640
598
  },
599
  {
600
- "loss": 0.7965,
601
- "grad_norm": 0.33212271332740784,
602
- "learning_rate": 4.226527165631801e-05,
603
  "epoch": 0.5959885386819485,
604
  "step": 650
605
  },
606
  {
607
- "eval_loss": 0.7688117027282715,
608
- "eval_runtime": 157.579,
609
- "eval_samples_per_second": 5.889,
610
- "eval_steps_per_second": 2.945,
611
  "epoch": 0.5959885386819485,
612
  "step": 650
613
  },
614
  {
615
- "loss": 0.8721,
616
- "grad_norm": 0.45228180289268494,
617
- "learning_rate": 4.0687563354431984e-05,
618
  "epoch": 0.6051575931232092,
619
  "step": 660
620
  },
621
  {
622
- "loss": 0.9747,
623
- "grad_norm": 0.39334630966186523,
624
- "learning_rate": 3.911940471333002e-05,
625
  "epoch": 0.6143266475644699,
626
  "step": 670
627
  },
628
  {
629
- "loss": 0.8033,
630
- "grad_norm": 0.2843310534954071,
631
- "learning_rate": 3.756240383877947e-05,
632
  "epoch": 0.6234957020057307,
633
  "step": 680
634
  },
635
  {
636
- "loss": 0.8623,
637
- "grad_norm": 0.3566271662712097,
638
- "learning_rate": 3.6018157394549284e-05,
639
  "epoch": 0.6326647564469914,
640
  "step": 690
641
  },
642
  {
643
- "loss": 0.7507,
644
- "grad_norm": 0.2564896047115326,
645
- "learning_rate": 3.448824896507292e-05,
646
  "epoch": 0.6418338108882522,
647
  "step": 700
648
  },
649
  {
650
- "eval_loss": 0.7661372423171997,
651
- "eval_runtime": 154.0479,
652
- "eval_samples_per_second": 6.024,
653
- "eval_steps_per_second": 3.012,
654
  "epoch": 0.6418338108882522,
655
  "step": 700
656
  },
657
  {
658
- "loss": 0.8036,
659
- "grad_norm": 0.279855340719223,
660
- "learning_rate": 3.297424743152382e-05,
661
  "epoch": 0.6510028653295129,
662
  "step": 710
663
  },
664
  {
665
- "loss": 0.7921,
666
- "grad_norm": 0.3235679268836975,
667
- "learning_rate": 3.14777053629687e-05,
668
  "epoch": 0.6601719197707736,
669
  "step": 720
670
  },
671
  {
672
- "loss": 0.7928,
673
- "grad_norm": 0.37638944387435913,
674
- "learning_rate": 3.0000157424248575e-05,
675
  "epoch": 0.6693409742120344,
676
  "step": 730
677
  },
678
  {
679
- "loss": 0.9068,
680
- "grad_norm": 0.40034306049346924,
681
- "learning_rate": 2.8543118802219904e-05,
682
  "epoch": 0.6785100286532951,
683
  "step": 740
684
  },
685
  {
686
- "loss": 0.8321,
687
- "grad_norm": 0.15902051329612732,
688
- "learning_rate": 2.710808365197e-05,
689
  "epoch": 0.6876790830945558,
690
  "step": 750
691
  },
692
  {
693
- "eval_loss": 0.7649410367012024,
694
- "eval_runtime": 155.9897,
695
- "eval_samples_per_second": 5.949,
696
- "eval_steps_per_second": 2.975,
697
  "epoch": 0.6876790830945558,
698
  "step": 750
699
  },
700
  {
701
- "loss": 0.8081,
702
- "grad_norm": 0.22720754146575928,
703
- "learning_rate": 2.5696523564600074e-05,
704
  "epoch": 0.6968481375358166,
705
  "step": 760
706
  },
707
  {
708
- "loss": 0.8296,
709
- "grad_norm": 0.3821977972984314,
710
- "learning_rate": 2.4309886058146912e-05,
711
  "epoch": 0.7060171919770774,
712
  "step": 770
713
  },
714
  {
715
- "loss": 0.7437,
716
- "grad_norm": 0.23446418344974518,
717
- "learning_rate": 2.2949593093190862e-05,
718
  "epoch": 0.7151862464183381,
719
  "step": 780
720
  },
721
  {
722
- "loss": 0.8172,
723
- "grad_norm": 0.39708301424980164,
724
- "learning_rate": 2.161703961467238e-05,
725
  "epoch": 0.7243553008595989,
726
  "step": 790
727
  },
728
  {
729
- "loss": 0.7895,
730
- "grad_norm": 0.27752557396888733,
731
- "learning_rate": 2.0313592121412466e-05,
732
  "epoch": 0.7335243553008596,
733
  "step": 800
734
  },
735
  {
736
- "eval_loss": 0.7634205222129822,
737
- "eval_runtime": 155.4241,
738
- "eval_samples_per_second": 5.971,
739
- "eval_steps_per_second": 2.985,
740
  "epoch": 0.7335243553008596,
741
  "step": 800
742
  },
743
  {
744
- "loss": 0.7796,
745
- "grad_norm": 0.3197689354419708,
746
- "learning_rate": 1.904058726480367e-05,
747
  "epoch": 0.7426934097421204,
748
  "step": 810
749
  },
750
  {
751
- "loss": 0.6869,
752
- "grad_norm": 0.2617953419685364,
753
- "learning_rate": 1.7799330478109027e-05,
754
  "epoch": 0.7518624641833811,
755
  "step": 820
756
  },
757
  {
758
- "loss": 0.9052,
759
- "grad_norm": 0.3640119135379791,
760
- "learning_rate": 1.6591094637774303e-05,
761
  "epoch": 0.7610315186246418,
762
  "step": 830
763
  },
764
  {
765
- "loss": 0.7278,
766
- "grad_norm": 0.3398507237434387,
767
- "learning_rate": 1.541711875812641e-05,
768
  "epoch": 0.7702005730659026,
769
  "step": 840
770
  },
771
  {
772
- "loss": 0.7398,
773
- "grad_norm": 0.3581792116165161,
774
- "learning_rate": 1.4278606720796544e-05,
775
  "epoch": 0.7793696275071633,
776
  "step": 850
777
  },
778
  {
779
- "eval_loss": 0.7617470026016235,
780
- "eval_runtime": 159.2523,
781
- "eval_samples_per_second": 5.827,
782
- "eval_steps_per_second": 2.914,
783
  "epoch": 0.7793696275071633,
784
  "step": 850
785
  },
786
  {
787
- "loss": 0.871,
788
- "grad_norm": 0.4080051779747009,
789
- "learning_rate": 1.3176726040171e-05,
790
  "epoch": 0.788538681948424,
791
  "step": 860
792
  },
793
  {
794
- "loss": 0.877,
795
- "grad_norm": 0.4084082543849945,
796
- "learning_rate": 1.2112606666135602e-05,
797
  "epoch": 0.7977077363896848,
798
  "step": 870
799
  },
800
  {
801
- "loss": 0.8205,
802
- "grad_norm": 0.26790153980255127,
803
- "learning_rate": 1.1087339825341592e-05,
804
  "epoch": 0.8068767908309455,
805
  "step": 880
806
  },
807
  {
808
- "loss": 0.8441,
809
- "grad_norm": 0.3132439851760864,
810
- "learning_rate": 1.0101976902181226e-05,
811
  "epoch": 0.8160458452722062,
812
  "step": 890
813
  },
814
  {
815
- "loss": 0.7718,
816
- "grad_norm": 0.41226926445961,
817
- "learning_rate": 9.157528360620415e-06,
818
  "epoch": 0.8252148997134671,
819
  "step": 900
820
  },
821
  {
822
- "eval_loss": 0.7602015733718872,
823
- "eval_runtime": 159.5675,
824
- "eval_samples_per_second": 5.816,
825
- "eval_steps_per_second": 2.908,
826
  "epoch": 0.8252148997134671,
827
  "step": 900
828
  },
829
  {
830
- "loss": 0.8352,
831
- "grad_norm": 0.30735114216804504,
832
- "learning_rate": 8.254962707994374e-06,
833
  "epoch": 0.8343839541547278,
834
  "step": 910
835
  },
836
  {
837
- "loss": 0.796,
838
- "grad_norm": 0.38642844557762146,
839
- "learning_rate": 7.395205501828578e-06,
840
  "epoch": 0.8435530085959886,
841
  "step": 920
842
  },
843
  {
844
- "loss": 0.8314,
845
- "grad_norm": 0.3047927916049957,
846
- "learning_rate": 6.579138400703716e-06,
847
  "epoch": 0.8527220630372493,
848
  "step": 930
849
  },
850
  {
851
- "loss": 0.7396,
852
- "grad_norm": 0.26591596007347107,
853
- "learning_rate": 5.807598260137759e-06,
854
  "epoch": 0.86189111747851,
855
  "step": 940
856
  },
857
  {
858
- "loss": 0.7736,
859
- "grad_norm": 0.41588667035102844,
860
- "learning_rate": 5.081376274412531e-06,
861
  "epoch": 0.8710601719197708,
862
  "step": 950
863
  },
864
  {
865
- "eval_loss": 0.7594464421272278,
866
- "eval_runtime": 158.8458,
867
- "eval_samples_per_second": 5.842,
868
- "eval_steps_per_second": 2.921,
869
  "epoch": 0.8710601719197708,
870
  "step": 950
871
  },
872
  {
873
- "loss": 0.7564,
874
- "grad_norm": 0.28924882411956787,
875
- "learning_rate": 4.4012171652245635e-06,
876
  "epoch": 0.8802292263610315,
877
  "step": 960
878
  },
879
  {
880
- "loss": 0.7387,
881
- "grad_norm": 0.2902253568172455,
882
- "learning_rate": 3.767818417992447e-06,
883
  "epoch": 0.8893982808022922,
884
  "step": 970
885
  },
886
  {
887
- "loss": 0.7774,
888
- "grad_norm": 0.354568749666214,
889
- "learning_rate": 3.1818295666037724e-06,
890
  "epoch": 0.898567335243553,
891
  "step": 980
892
  },
893
  {
894
- "loss": 0.7819,
895
- "grad_norm": 0.21420103311538696,
896
- "learning_rate": 2.643851527335006e-06,
897
  "epoch": 0.9077363896848137,
898
  "step": 990
899
  },
900
  {
901
- "loss": 0.9055,
902
- "grad_norm": 0.4397925138473511,
903
- "learning_rate": 2.154435982627573e-06,
904
  "epoch": 0.9169054441260746,
905
  "step": 1000
906
  },
907
  {
908
- "eval_loss": 0.7588858604431152,
909
- "eval_runtime": 153.725,
910
- "eval_samples_per_second": 6.037,
911
- "eval_steps_per_second": 3.018,
912
  "epoch": 0.9169054441260746,
913
  "step": 1000
914
  },
915
  {
916
- "loss": 0.7463,
917
- "grad_norm": 0.37509453296661377,
918
- "learning_rate": 1.7140848153519129e-06,
919
  "epoch": 0.9260744985673353,
920
  "step": 1010
921
  },
922
  {
923
- "loss": 0.7624,
924
- "grad_norm": 0.33944201469421387,
925
- "learning_rate": 1.3232495941396639e-06,
926
  "epoch": 0.935243553008596,
927
  "step": 1020
928
  },
929
  {
930
- "loss": 0.7424,
931
- "grad_norm": 0.3569444715976715,
932
- "learning_rate": 9.82331110311857e-07,
933
  "epoch": 0.9444126074498568,
934
  "step": 1030
935
  },
936
  {
937
- "loss": 0.8083,
938
- "grad_norm": 0.42634230852127075,
939
- "learning_rate": 6.916789668778123e-07,
940
  "epoch": 0.9535816618911175,
941
  "step": 1040
942
  },
943
  {
944
- "loss": 0.8018,
945
- "grad_norm": 0.36843785643577576,
946
- "learning_rate": 4.5159122002644274e-07,
947
  "epoch": 0.9627507163323782,
948
  "step": 1050
949
  },
950
  {
951
- "eval_loss": 0.7587710022926331,
952
- "eval_runtime": 158.0207,
953
- "eval_samples_per_second": 5.873,
954
- "eval_steps_per_second": 2.936,
955
  "epoch": 0.9627507163323782,
956
  "step": 1050
957
  },
958
  {
959
- "loss": 0.831,
960
- "grad_norm": 0.3824850618839264,
961
- "learning_rate": 2.6231407347736546e-07,
962
  "epoch": 0.971919770773639,
963
  "step": 1060
964
  },
965
  {
966
- "loss": 0.8253,
967
- "grad_norm": 0.34650319814682007,
968
- "learning_rate": 1.2404162600541115e-07,
969
  "epoch": 0.9810888252148997,
970
  "step": 1070
971
  },
972
  {
973
- "loss": 0.7623,
974
- "grad_norm": 0.33678287267684937,
975
- "learning_rate": 3.691567239743621e-08,
976
  "epoch": 0.9902578796561604,
977
  "step": 1080
978
  },
979
  {
980
- "loss": 0.7956,
981
- "grad_norm": 0.3255111575126648,
982
- "learning_rate": 1.0255580454254788e-09,
983
  "epoch": 0.9994269340974212,
984
  "step": 1090
985
  },
986
  {
987
- "train_runtime": 10397.2106,
988
- "train_samples_per_second": 1.678,
989
- "train_steps_per_second": 0.105,
990
- "total_flos": 1.5403789002928742e+17,
991
- "train_loss": 0.8588679365670321,
992
  "epoch": 1.0,
993
  "step": 1091,
994
- "total_runtime_sec": 10398.61917591095
995
  }
996
  ]
997
  }
 
13
  "SEED": 3407,
14
  "VAL_RATIO": 0.05,
15
  "MAX_SEQ_LEN": 512,
16
+ "LORA_R": 80,
17
+ "LORA_ALPHA": 160,
18
  "LORA_DROPOUT": 0.0,
19
  "LORA_TARGET_MODULES": [
20
  "q_proj",
 
29
  "PER_DEVICE_TRAIN_BS": 2,
30
  "PER_DEVICE_EVAL_BS": 2,
31
  "GRAD_ACCUM": 8,
32
+ "LR": 1e-06,
33
  "WARMUP_RATIO": 0.1,
34
  "WEIGHT_DECAY": 0.05,
35
  "MAX_STEPS": -1,
 
53
  },
54
  "history": [
55
  {
56
+ "loss": 1.9238,
57
+ "grad_norm": 5.567883491516113,
58
+ "learning_rate": 8.181818181818182e-08,
59
  "epoch": 0.009169054441260744,
60
  "step": 10
61
  },
62
  {
63
+ "loss": 1.7942,
64
+ "grad_norm": 3.81577467918396,
65
+ "learning_rate": 1.7272727272727272e-07,
66
  "epoch": 0.01833810888252149,
67
  "step": 20
68
  },
69
  {
70
+ "loss": 1.9151,
71
+ "grad_norm": 4.943554878234863,
72
+ "learning_rate": 2.636363636363636e-07,
73
  "epoch": 0.027507163323782235,
74
  "step": 30
75
  },
76
  {
77
+ "loss": 1.7679,
78
+ "grad_norm": 4.759264945983887,
79
+ "learning_rate": 3.545454545454545e-07,
80
  "epoch": 0.03667621776504298,
81
  "step": 40
82
  },
83
  {
84
+ "loss": 2.0977,
85
+ "grad_norm": 6.200092315673828,
86
+ "learning_rate": 4.4545454545454544e-07,
87
  "epoch": 0.045845272206303724,
88
  "step": 50
89
  },
90
  {
91
+ "eval_loss": 1.8969855308532715,
92
+ "eval_runtime": 165.3674,
93
+ "eval_samples_per_second": 5.612,
94
+ "eval_steps_per_second": 2.806,
95
  "epoch": 0.045845272206303724,
96
  "step": 50
97
  },
98
  {
99
+ "loss": 1.8792,
100
+ "grad_norm": 4.762349605560303,
101
+ "learning_rate": 5.363636363636363e-07,
102
  "epoch": 0.05501432664756447,
103
  "step": 60
104
  },
105
  {
106
+ "loss": 1.8231,
107
+ "grad_norm": 4.353812217712402,
108
+ "learning_rate": 6.272727272727273e-07,
109
  "epoch": 0.06418338108882521,
110
  "step": 70
111
  },
112
  {
113
+ "loss": 1.8868,
114
+ "grad_norm": 4.538625240325928,
115
+ "learning_rate": 7.181818181818181e-07,
116
  "epoch": 0.07335243553008595,
117
  "step": 80
118
  },
119
  {
120
+ "loss": 1.6794,
121
+ "grad_norm": 3.11306095123291,
122
+ "learning_rate": 8.09090909090909e-07,
123
  "epoch": 0.0825214899713467,
124
  "step": 90
125
  },
126
  {
127
+ "loss": 1.7012,
128
+ "grad_norm": 3.303393602371216,
129
+ "learning_rate": 9e-07,
130
  "epoch": 0.09169054441260745,
131
  "step": 100
132
  },
133
  {
134
+ "eval_loss": 1.6190643310546875,
135
+ "eval_runtime": 163.7929,
136
+ "eval_samples_per_second": 5.666,
137
+ "eval_steps_per_second": 2.833,
138
  "epoch": 0.09169054441260745,
139
  "step": 100
140
  },
141
  {
142
+ "loss": 1.6738,
143
+ "grad_norm": 2.3432679176330566,
144
+ "learning_rate": 9.909090909090909e-07,
145
  "epoch": 0.1008595988538682,
146
  "step": 110
147
  },
148
  {
149
+ "loss": 1.5342,
150
+ "grad_norm": 1.411333441734314,
151
+ "learning_rate": 9.997923381619255e-07,
152
  "epoch": 0.11002865329512894,
153
  "step": 120
154
  },
155
  {
156
+ "loss": 1.4696,
157
+ "grad_norm": 1.3321274518966675,
158
+ "learning_rate": 9.990747162241872e-07,
159
  "epoch": 0.11919770773638969,
160
  "step": 130
161
  },
162
  {
163
+ "loss": 1.4552,
164
+ "grad_norm": 1.0566920042037964,
165
+ "learning_rate": 9.978453061876695e-07,
166
  "epoch": 0.12836676217765042,
167
  "step": 140
168
  },
169
  {
170
+ "loss": 1.5192,
171
+ "grad_norm": 0.9345868229866028,
172
+ "learning_rate": 9.96105368780285e-07,
173
  "epoch": 0.13753581661891118,
174
  "step": 150
175
  },
176
  {
177
+ "eval_loss": 1.3909412622451782,
178
+ "eval_runtime": 170.2558,
179
+ "eval_samples_per_second": 5.451,
180
+ "eval_steps_per_second": 2.725,
181
  "epoch": 0.13753581661891118,
182
  "step": 150
183
  },
184
  {
185
+ "loss": 1.4217,
186
+ "grad_norm": 0.8553086519241333,
187
+ "learning_rate": 9.938566882624436e-07,
188
  "epoch": 0.1467048710601719,
189
  "step": 160
190
  },
191
  {
192
+ "loss": 1.4101,
193
+ "grad_norm": 0.8422027230262756,
194
+ "learning_rate": 9.911015705973398e-07,
195
  "epoch": 0.15587392550143267,
196
  "step": 170
197
  },
198
  {
199
+ "loss": 1.4131,
200
+ "grad_norm": 0.6763940453529358,
201
+ "learning_rate": 9.878428410862482e-07,
202
  "epoch": 0.1650429799426934,
203
  "step": 180
204
  },
205
  {
206
+ "loss": 1.4201,
207
+ "grad_norm": 0.5860380530357361,
208
+ "learning_rate": 9.8408384147125e-07,
209
  "epoch": 0.17421203438395416,
210
  "step": 190
211
  },
212
  {
213
+ "loss": 1.2782,
214
+ "grad_norm": 0.7507234215736389,
215
+ "learning_rate": 9.79828426508364e-07,
216
  "epoch": 0.1833810888252149,
217
  "step": 200
218
  },
219
  {
220
+ "eval_loss": 1.3075143098831177,
221
+ "eval_runtime": 168.9052,
222
+ "eval_samples_per_second": 5.494,
223
+ "eval_steps_per_second": 2.747,
224
  "epoch": 0.1833810888252149,
225
  "step": 200
226
  },
227
  {
228
+ "loss": 1.3912,
229
+ "grad_norm": 0.6225572228431702,
230
+ "learning_rate": 9.750809600145952e-07,
231
  "epoch": 0.19255014326647565,
232
  "step": 210
233
  },
234
  {
235
+ "loss": 1.2743,
236
+ "grad_norm": 0.5334329009056091,
237
+ "learning_rate": 9.698463103929541e-07,
238
  "epoch": 0.2017191977077364,
239
  "step": 220
240
  },
241
  {
242
+ "loss": 1.2105,
243
+ "grad_norm": 0.730050265789032,
244
+ "learning_rate": 9.641298456400363e-07,
245
  "epoch": 0.21088825214899715,
246
  "step": 230
247
  },
248
  {
249
+ "loss": 1.3692,
250
+ "grad_norm": 0.646460235118866,
251
+ "learning_rate": 9.579374278412817e-07,
252
  "epoch": 0.22005730659025788,
253
  "step": 240
254
  },
255
  {
256
+ "loss": 1.2971,
257
+ "grad_norm": 0.5395148992538452,
258
+ "learning_rate": 9.512754071595603e-07,
259
  "epoch": 0.22922636103151864,
260
  "step": 250
261
  },
262
  {
263
+ "eval_loss": 1.2624306678771973,
264
+ "eval_runtime": 157.9571,
265
+ "eval_samples_per_second": 5.875,
266
+ "eval_steps_per_second": 2.938,
267
  "epoch": 0.22922636103151864,
268
  "step": 250
269
  },
270
  {
271
+ "loss": 1.3346,
272
+ "grad_norm": 0.5858215093612671,
273
+ "learning_rate": 9.441506153232442e-07,
274
  "epoch": 0.23839541547277937,
275
  "step": 260
276
  },
277
  {
278
+ "loss": 1.2195,
279
+ "grad_norm": 0.5456379055976868,
280
+ "learning_rate": 9.365703586204494e-07,
281
  "epoch": 0.2475644699140401,
282
  "step": 270
283
  },
284
  {
285
+ "loss": 1.3763,
286
+ "grad_norm": 0.8651963472366333,
287
+ "learning_rate": 9.285424104066275e-07,
288
  "epoch": 0.25673352435530084,
289
  "step": 280
290
  },
291
  {
292
+ "loss": 1.2256,
293
+ "grad_norm": 0.5938352942466736,
294
+ "learning_rate": 9.20075003133193e-07,
295
  "epoch": 0.2659025787965616,
296
  "step": 290
297
  },
298
  {
299
+ "loss": 1.3137,
300
+ "grad_norm": 0.6355459690093994,
301
+ "learning_rate": 9.111768199053586e-07,
302
  "epoch": 0.27507163323782235,
303
  "step": 300
304
  },
305
  {
306
+ "eval_loss": 1.2268821001052856,
307
+ "eval_runtime": 164.1638,
308
+ "eval_samples_per_second": 5.653,
309
+ "eval_steps_per_second": 2.826,
310
  "epoch": 0.27507163323782235,
311
  "step": 300
312
  },
313
  {
314
+ "loss": 1.4409,
315
+ "grad_norm": 0.9009450078010559,
316
+ "learning_rate": 9.018569855778383e-07,
317
  "epoch": 0.2842406876790831,
318
  "step": 310
319
  },
320
  {
321
+ "loss": 1.2566,
322
+ "grad_norm": 0.5912006497383118,
323
+ "learning_rate": 8.921250573975455e-07,
324
  "epoch": 0.2934097421203438,
325
  "step": 320
326
  },
327
  {
328
+ "loss": 1.1593,
329
+ "grad_norm": 0.6118663549423218,
330
+ "learning_rate": 8.81991015202887e-07,
331
  "epoch": 0.3025787965616046,
332
  "step": 330
333
  },
334
  {
335
+ "loss": 1.2555,
336
+ "grad_norm": 0.581721305847168,
337
+ "learning_rate": 8.714652511896993e-07,
338
  "epoch": 0.31174785100286534,
339
  "step": 340
340
  },
341
  {
342
+ "loss": 1.1209,
343
+ "grad_norm": 0.5230151414871216,
344
+ "learning_rate": 8.605585592543211e-07,
345
  "epoch": 0.3209169054441261,
346
  "step": 350
347
  },
348
  {
349
+ "eval_loss": 1.1941955089569092,
350
+ "eval_runtime": 162.9119,
351
+ "eval_samples_per_second": 5.696,
352
+ "eval_steps_per_second": 2.848,
353
  "epoch": 0.3209169054441261,
354
  "step": 350
355
  },
356
  {
357
+ "loss": 1.3564,
358
+ "grad_norm": 0.565862238407135,
359
+ "learning_rate": 8.492821239247363e-07,
360
  "epoch": 0.3300859598853868,
361
  "step": 360
362
  },
363
  {
364
+ "loss": 1.3997,
365
+ "grad_norm": 0.6057285666465759,
366
+ "learning_rate": 8.376475088911317e-07,
367
  "epoch": 0.33925501432664756,
368
  "step": 370
369
  },
370
  {
371
+ "loss": 1.0878,
372
+ "grad_norm": 0.7517871260643005,
373
+ "learning_rate": 8.256666451476336e-07,
374
  "epoch": 0.3484240687679083,
375
  "step": 380
376
  },
377
  {
378
+ "loss": 1.3009,
379
+ "grad_norm": 0.7168652415275574,
380
+ "learning_rate": 8.133518187573862e-07,
381
  "epoch": 0.35759312320916903,
382
  "step": 390
383
  },
384
  {
385
+ "loss": 1.1606,
386
+ "grad_norm": 0.7250906825065613,
387
+ "learning_rate": 8.007156582535131e-07,
388
  "epoch": 0.3667621776504298,
389
  "step": 400
390
  },
391
  {
392
+ "eval_loss": 1.166169285774231,
393
+ "eval_runtime": 158.6155,
394
+ "eval_samples_per_second": 5.851,
395
+ "eval_steps_per_second": 2.925,
396
  "epoch": 0.3667621776504298,
397
  "step": 400
398
  },
399
  {
400
+ "loss": 1.2593,
401
+ "grad_norm": 0.6665163636207581,
402
+ "learning_rate": 7.877711216888867e-07,
403
  "epoch": 0.37593123209169055,
404
  "step": 410
405
  },
406
  {
407
+ "loss": 1.1365,
408
+ "grad_norm": 0.6199079751968384,
409
+ "learning_rate": 7.745314833479833e-07,
410
  "epoch": 0.3851002865329513,
411
  "step": 420
412
  },
413
  {
414
+ "loss": 1.19,
415
+ "grad_norm": 0.6242042779922485,
416
+ "learning_rate": 7.6101032013445e-07,
417
  "epoch": 0.394269340974212,
418
  "step": 430
419
  },
420
  {
421
+ "loss": 1.1405,
422
+ "grad_norm": 0.657778263092041,
423
+ "learning_rate": 7.472214976483451e-07,
424
  "epoch": 0.4034383954154728,
425
  "step": 440
426
  },
427
  {
428
+ "loss": 1.0791,
429
+ "grad_norm": 0.5634785890579224,
430
+ "learning_rate": 7.331791559673269e-07,
431
  "epoch": 0.41260744985673353,
432
  "step": 450
433
  },
434
  {
435
+ "eval_loss": 1.1434489488601685,
436
+ "eval_runtime": 153.2068,
437
+ "eval_samples_per_second": 6.057,
438
+ "eval_steps_per_second": 3.029,
439
  "epoch": 0.41260744985673353,
440
  "step": 450
441
  },
442
  {
443
+ "loss": 1.1352,
444
+ "grad_norm": 0.7913809418678284,
445
+ "learning_rate": 7.188976951463723e-07,
446
  "epoch": 0.4217765042979943,
447
  "step": 460
448
  },
449
  {
450
+ "loss": 1.3036,
451
+ "grad_norm": 0.6191056966781616,
452
+ "learning_rate": 7.043917604508971e-07,
453
  "epoch": 0.430945558739255,
454
  "step": 470
455
  },
456
  {
457
+ "loss": 1.121,
458
+ "grad_norm": 0.6955880522727966,
459
+ "learning_rate": 6.896762273384178e-07,
460
  "epoch": 0.44011461318051576,
461
  "step": 480
462
  },
463
  {
464
+ "loss": 1.0997,
465
+ "grad_norm": 0.8450888395309448,
466
+ "learning_rate": 6.747661862041585e-07,
467
  "epoch": 0.4492836676217765,
468
  "step": 490
469
  },
470
  {
471
+ "loss": 1.3604,
472
+ "grad_norm": 0.8735764622688293,
473
+ "learning_rate": 6.596769269062443e-07,
474
  "epoch": 0.4584527220630373,
475
  "step": 500
476
  },
477
  {
478
+ "eval_loss": 1.1247467994689941,
479
+ "eval_runtime": 152.4716,
480
+ "eval_samples_per_second": 6.086,
481
+ "eval_steps_per_second": 3.043,
482
  "epoch": 0.4584527220630373,
483
  "step": 500
484
  },
485
  {
486
+ "loss": 1.171,
487
+ "grad_norm": 0.7228217124938965,
488
+ "learning_rate": 6.444239230863504e-07,
489
  "epoch": 0.467621776504298,
490
  "step": 510
491
  },
492
  {
493
+ "loss": 1.03,
494
+ "grad_norm": 0.6075210571289062,
495
+ "learning_rate": 6.290228163018867e-07,
496
  "epoch": 0.47679083094555874,
497
  "step": 520
498
  },
499
  {
500
+ "loss": 1.0345,
501
+ "grad_norm": 0.7692680954933167,
502
+ "learning_rate": 6.134893999859886e-07,
503
  "epoch": 0.4859598853868195,
504
  "step": 530
505
  },
506
  {
507
+ "loss": 1.2594,
508
+ "grad_norm": 0.7600648403167725,
509
+ "learning_rate": 5.978396032517639e-07,
510
  "epoch": 0.4951289398280802,
511
  "step": 540
512
  },
513
  {
514
+ "loss": 1.0254,
515
+ "grad_norm": 0.6178115010261536,
516
+ "learning_rate": 5.820894745574025e-07,
517
  "epoch": 0.504297994269341,
518
  "step": 550
519
  },
520
  {
521
+ "eval_loss": 1.1089264154434204,
522
+ "eval_runtime": 151.8982,
523
+ "eval_samples_per_second": 6.109,
524
+ "eval_steps_per_second": 3.055,
525
  "epoch": 0.504297994269341,
526
  "step": 550
527
  },
528
  {
529
+ "loss": 1.1049,
530
+ "grad_norm": 0.5585054159164429,
531
+ "learning_rate": 5.662551652489008e-07,
532
  "epoch": 0.5134670487106017,
533
  "step": 560
534
  },
535
  {
536
+ "loss": 1.0898,
537
+ "grad_norm": 0.6844518780708313,
538
+ "learning_rate": 5.503529129972792e-07,
539
  "epoch": 0.5226361031518625,
540
  "step": 570
541
  },
542
  {
543
+ "loss": 1.1037,
544
+ "grad_norm": 0.8425552845001221,
545
+ "learning_rate": 5.34399025147273e-07,
546
  "epoch": 0.5318051575931232,
547
  "step": 580
548
  },
549
  {
550
+ "loss": 1.1019,
551
+ "grad_norm": 0.648064136505127,
552
+ "learning_rate": 5.18409861994576e-07,
553
  "epoch": 0.540974212034384,
554
  "step": 590
555
  },
556
  {
557
+ "loss": 1.1863,
558
+ "grad_norm": 0.5788621306419373,
559
+ "learning_rate": 5.024018200087854e-07,
560
  "epoch": 0.5501432664756447,
561
  "step": 600
562
  },
563
  {
564
+ "eval_loss": 1.095629096031189,
565
+ "eval_runtime": 150.8171,
566
+ "eval_samples_per_second": 6.153,
567
+ "eval_steps_per_second": 3.077,
568
  "epoch": 0.5501432664756447,
569
  "step": 600
570
  },
571
  {
572
+ "loss": 1.1025,
573
+ "grad_norm": 0.6422027349472046,
574
+ "learning_rate": 4.86391315019248e-07,
575
  "epoch": 0.5593123209169054,
576
  "step": 610
577
  },
578
  {
579
+ "loss": 1.0666,
580
+ "grad_norm": 0.6005454063415527,
581
+ "learning_rate": 4.703947653810575e-07,
582
  "epoch": 0.5684813753581662,
583
  "step": 620
584
  },
585
  {
586
+ "loss": 1.215,
587
+ "grad_norm": 0.6145904064178467,
588
+ "learning_rate": 4.544285751384584e-07,
589
  "epoch": 0.5776504297994269,
590
  "step": 630
591
  },
592
  {
593
+ "loss": 1.1613,
594
+ "grad_norm": 0.8756449818611145,
595
+ "learning_rate": 4.385091172029275e-07,
596
  "epoch": 0.5868194842406876,
597
  "step": 640
598
  },
599
  {
600
+ "loss": 1.1092,
601
+ "grad_norm": 0.7930067181587219,
602
+ "learning_rate": 4.2265271656318e-07,
603
  "epoch": 0.5959885386819485,
604
  "step": 650
605
  },
606
  {
607
+ "eval_loss": 1.0845845937728882,
608
+ "eval_runtime": 151.0566,
609
+ "eval_samples_per_second": 6.143,
610
+ "eval_steps_per_second": 3.072,
611
  "epoch": 0.5959885386819485,
612
  "step": 650
613
  },
614
  {
615
+ "loss": 1.214,
616
+ "grad_norm": 1.1016592979431152,
617
+ "learning_rate": 4.068756335443198e-07,
618
  "epoch": 0.6051575931232092,
619
  "step": 660
620
  },
621
  {
622
+ "loss": 1.3335,
623
+ "grad_norm": 0.7920063138008118,
624
+ "learning_rate": 3.9119404713330013e-07,
625
  "epoch": 0.6143266475644699,
626
  "step": 670
627
  },
628
  {
629
+ "loss": 1.123,
630
+ "grad_norm": 0.792630136013031,
631
+ "learning_rate": 3.7562403838779467e-07,
632
  "epoch": 0.6234957020057307,
633
  "step": 680
634
  },
635
  {
636
+ "loss": 1.2098,
637
+ "grad_norm": 0.8105105757713318,
638
+ "learning_rate": 3.601815739454928e-07,
639
  "epoch": 0.6326647564469914,
640
  "step": 690
641
  },
642
  {
643
+ "loss": 1.0302,
644
+ "grad_norm": 0.6204477548599243,
645
+ "learning_rate": 3.448824896507292e-07,
646
  "epoch": 0.6418338108882522,
647
  "step": 700
648
  },
649
  {
650
+ "eval_loss": 1.0753319263458252,
651
+ "eval_runtime": 159.1632,
652
+ "eval_samples_per_second": 5.83,
653
+ "eval_steps_per_second": 2.915,
654
  "epoch": 0.6418338108882522,
655
  "step": 700
656
  },
657
  {
658
+ "loss": 1.109,
659
+ "grad_norm": 0.6277522444725037,
660
+ "learning_rate": 3.297424743152381e-07,
661
  "epoch": 0.6510028653295129,
662
  "step": 710
663
  },
664
  {
665
+ "loss": 1.0808,
666
+ "grad_norm": 0.7879471182823181,
667
+ "learning_rate": 3.1477705362968696e-07,
668
  "epoch": 0.6601719197707736,
669
  "step": 720
670
  },
671
  {
672
+ "loss": 1.0842,
673
+ "grad_norm": 0.8374884128570557,
674
+ "learning_rate": 3.000015742424857e-07,
675
  "epoch": 0.6693409742120344,
676
  "step": 730
677
  },
678
  {
679
+ "loss": 1.2445,
680
+ "grad_norm": 0.7892112731933594,
681
+ "learning_rate": 2.85431188022199e-07,
682
  "epoch": 0.6785100286532951,
683
  "step": 740
684
  },
685
  {
686
+ "loss": 1.1506,
687
+ "grad_norm": 0.5540062785148621,
688
+ "learning_rate": 2.710808365197e-07,
689
  "epoch": 0.6876790830945558,
690
  "step": 750
691
  },
692
  {
693
+ "eval_loss": 1.0686043500900269,
694
+ "eval_runtime": 154.055,
695
+ "eval_samples_per_second": 6.024,
696
+ "eval_steps_per_second": 3.012,
697
  "epoch": 0.6876790830945558,
698
  "step": 750
699
  },
700
  {
701
+ "loss": 1.1023,
702
+ "grad_norm": 0.5218796133995056,
703
+ "learning_rate": 2.569652356460007e-07,
704
  "epoch": 0.6968481375358166,
705
  "step": 760
706
  },
707
  {
708
+ "loss": 1.1801,
709
+ "grad_norm": 0.7433627247810364,
710
+ "learning_rate": 2.430988605814691e-07,
711
  "epoch": 0.7060171919770774,
712
  "step": 770
713
  },
714
  {
715
+ "loss": 1.0276,
716
+ "grad_norm": 0.629487931728363,
717
+ "learning_rate": 2.294959309319086e-07,
718
  "epoch": 0.7151862464183381,
719
  "step": 780
720
  },
721
  {
722
+ "loss": 1.1164,
723
+ "grad_norm": 0.6667075157165527,
724
+ "learning_rate": 2.1617039614672378e-07,
725
  "epoch": 0.7243553008595989,
726
  "step": 790
727
  },
728
  {
729
+ "loss": 1.0869,
730
+ "grad_norm": 0.6930222511291504,
731
+ "learning_rate": 2.0313592121412464e-07,
732
  "epoch": 0.7335243553008596,
733
  "step": 800
734
  },
735
  {
736
+ "eval_loss": 1.0634101629257202,
737
+ "eval_runtime": 154.1494,
738
+ "eval_samples_per_second": 6.02,
739
+ "eval_steps_per_second": 3.01,
740
  "epoch": 0.7335243553008596,
741
  "step": 800
742
  },
743
  {
744
+ "loss": 1.0937,
745
+ "grad_norm": 0.7793363332748413,
746
+ "learning_rate": 1.904058726480367e-07,
747
  "epoch": 0.7426934097421204,
748
  "step": 810
749
  },
750
  {
751
+ "loss": 0.9728,
752
+ "grad_norm": 0.5570642948150635,
753
+ "learning_rate": 1.7799330478109026e-07,
754
  "epoch": 0.7518624641833811,
755
  "step": 820
756
  },
757
  {
758
+ "loss": 1.2568,
759
+ "grad_norm": 0.9017201662063599,
760
+ "learning_rate": 1.65910946377743e-07,
761
  "epoch": 0.7610315186246418,
762
  "step": 830
763
  },
764
  {
765
+ "loss": 1.0049,
766
+ "grad_norm": 0.7178328633308411,
767
+ "learning_rate": 1.5417118758126408e-07,
768
  "epoch": 0.7702005730659026,
769
  "step": 840
770
  },
771
  {
772
+ "loss": 1.0576,
773
+ "grad_norm": 1.031610369682312,
774
+ "learning_rate": 1.4278606720796543e-07,
775
  "epoch": 0.7793696275071633,
776
  "step": 850
777
  },
778
  {
779
+ "eval_loss": 1.0599370002746582,
780
+ "eval_runtime": 154.7364,
781
+ "eval_samples_per_second": 5.997,
782
+ "eval_steps_per_second": 2.999,
783
  "epoch": 0.7793696275071633,
784
  "step": 850
785
  },
786
  {
787
+ "loss": 1.1797,
788
+ "grad_norm": 1.0518614053726196,
789
+ "learning_rate": 1.3176726040171e-07,
790
  "epoch": 0.788538681948424,
791
  "step": 860
792
  },
793
  {
794
+ "loss": 1.2085,
795
+ "grad_norm": 0.7290861010551453,
796
+ "learning_rate": 1.21126066661356e-07,
797
  "epoch": 0.7977077363896848,
798
  "step": 870
799
  },
800
  {
801
+ "loss": 1.1165,
802
+ "grad_norm": 0.6315222382545471,
803
+ "learning_rate": 1.108733982534159e-07,
804
  "epoch": 0.8068767908309455,
805
  "step": 880
806
  },
807
  {
808
+ "loss": 1.158,
809
+ "grad_norm": 0.685243546962738,
810
+ "learning_rate": 1.0101976902181225e-07,
811
  "epoch": 0.8160458452722062,
812
  "step": 890
813
  },
814
  {
815
+ "loss": 1.0584,
816
+ "grad_norm": 0.7780338525772095,
817
+ "learning_rate": 9.157528360620415e-08,
818
  "epoch": 0.8252148997134671,
819
  "step": 900
820
  },
821
  {
822
+ "eval_loss": 1.057593584060669,
823
+ "eval_runtime": 153.5904,
824
+ "eval_samples_per_second": 6.042,
825
+ "eval_steps_per_second": 3.021,
826
  "epoch": 0.8252148997134671,
827
  "step": 900
828
  },
829
  {
830
+ "loss": 1.1489,
831
+ "grad_norm": 0.6839588284492493,
832
+ "learning_rate": 8.254962707994373e-08,
833
  "epoch": 0.8343839541547278,
834
  "step": 910
835
  },
836
  {
837
+ "loss": 1.1096,
838
+ "grad_norm": 0.9299020171165466,
839
+ "learning_rate": 7.395205501828577e-08,
840
  "epoch": 0.8435530085959886,
841
  "step": 920
842
  },
843
  {
844
+ "loss": 1.1224,
845
+ "grad_norm": 0.791289746761322,
846
+ "learning_rate": 6.579138400703715e-08,
847
  "epoch": 0.8527220630372493,
848
  "step": 930
849
  },
850
  {
851
+ "loss": 1.038,
852
+ "grad_norm": 0.6159808039665222,
853
+ "learning_rate": 5.807598260137758e-08,
854
  "epoch": 0.86189111747851,
855
  "step": 940
856
  },
857
  {
858
+ "loss": 1.0708,
859
+ "grad_norm": 0.7773894667625427,
860
+ "learning_rate": 5.08137627441253e-08,
861
  "epoch": 0.8710601719197708,
862
  "step": 950
863
  },
864
  {
865
+ "eval_loss": 1.0561405420303345,
866
+ "eval_runtime": 154.3615,
867
+ "eval_samples_per_second": 6.012,
868
+ "eval_steps_per_second": 3.006,
869
  "epoch": 0.8710601719197708,
870
  "step": 950
871
  },
872
  {
873
+ "loss": 1.0656,
874
+ "grad_norm": 0.707645833492279,
875
+ "learning_rate": 4.401217165224563e-08,
876
  "epoch": 0.8802292263610315,
877
  "step": 960
878
  },
879
  {
880
+ "loss": 1.0164,
881
+ "grad_norm": 0.6336905360221863,
882
+ "learning_rate": 3.767818417992446e-08,
883
  "epoch": 0.8893982808022922,
884
  "step": 970
885
  },
886
  {
887
+ "loss": 1.0766,
888
+ "grad_norm": 0.8207520842552185,
889
+ "learning_rate": 3.181829566603772e-08,
890
  "epoch": 0.898567335243553,
891
  "step": 980
892
  },
893
  {
894
+ "loss": 1.0929,
895
+ "grad_norm": 0.6286782026290894,
896
+ "learning_rate": 2.643851527335006e-08,
897
  "epoch": 0.9077363896848137,
898
  "step": 990
899
  },
900
  {
901
+ "loss": 1.2541,
902
+ "grad_norm": 0.817637026309967,
903
+ "learning_rate": 2.1544359826275726e-08,
904
  "epoch": 0.9169054441260746,
905
  "step": 1000
906
  },
907
  {
908
+ "eval_loss": 1.0553829669952393,
909
+ "eval_runtime": 155.0837,
910
+ "eval_samples_per_second": 5.984,
911
+ "eval_steps_per_second": 2.992,
912
  "epoch": 0.9169054441260746,
913
  "step": 1000
914
  },
915
  {
916
+ "loss": 1.0413,
917
+ "grad_norm": 0.9485034942626953,
918
+ "learning_rate": 1.714084815351913e-08,
919
  "epoch": 0.9260744985673353,
920
  "step": 1010
921
  },
922
  {
923
+ "loss": 1.0516,
924
+ "grad_norm": 0.6737267971038818,
925
+ "learning_rate": 1.3232495941396637e-08,
926
  "epoch": 0.935243553008596,
927
  "step": 1020
928
  },
929
  {
930
+ "loss": 1.0517,
931
+ "grad_norm": 0.9414446353912354,
932
+ "learning_rate": 9.82331110311857e-09,
933
  "epoch": 0.9444126074498568,
934
  "step": 1030
935
  },
936
  {
937
+ "loss": 1.1016,
938
+ "grad_norm": 0.8654493689537048,
939
+ "learning_rate": 6.916789668778122e-09,
940
  "epoch": 0.9535816618911175,
941
  "step": 1040
942
  },
943
  {
944
+ "loss": 1.1055,
945
+ "grad_norm": 0.8262504935264587,
946
+ "learning_rate": 4.515912200264427e-09,
947
  "epoch": 0.9627507163323782,
948
  "step": 1050
949
  },
950
  {
951
+ "eval_loss": 1.0550979375839233,
952
+ "eval_runtime": 156.3052,
953
+ "eval_samples_per_second": 5.937,
954
+ "eval_steps_per_second": 2.969,
955
  "epoch": 0.9627507163323782,
956
  "step": 1050
957
  },
958
  {
959
+ "loss": 1.1321,
960
+ "grad_norm": 0.7707592844963074,
961
+ "learning_rate": 2.6231407347736546e-09,
962
  "epoch": 0.971919770773639,
963
  "step": 1060
964
  },
965
  {
966
+ "loss": 1.1039,
967
+ "grad_norm": 0.7415518760681152,
968
+ "learning_rate": 1.2404162600541113e-09,
969
  "epoch": 0.9810888252148997,
970
  "step": 1070
971
  },
972
  {
973
+ "loss": 1.0673,
974
+ "grad_norm": 0.6835209727287292,
975
+ "learning_rate": 3.6915672397436204e-10,
976
  "epoch": 0.9902578796561604,
977
  "step": 1080
978
  },
979
  {
980
+ "loss": 1.1213,
981
+ "grad_norm": 0.6404680013656616,
982
+ "learning_rate": 1.0255580454254786e-11,
983
  "epoch": 0.9994269340974212,
984
  "step": 1090
985
  },
986
  {
987
+ "train_runtime": 10741.972,
988
+ "train_samples_per_second": 1.624,
989
+ "train_steps_per_second": 0.102,
990
+ "total_flos": 1.5816368624117146e+17,
991
+ "train_loss": 1.2491859223840436,
992
  "epoch": 1.0,
993
  "step": 1091,
994
+ "total_runtime_sec": 10743.41768693924
995
  }
996
  ]
997
  }