youssefedweqd commited on
Commit
84644ec
·
verified ·
1 Parent(s): a8d0a03

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "q_proj",
27
  "v_proj",
28
- "k_proj",
29
- "up_proj",
30
  "down_proj",
31
- "gate_proj",
32
- "o_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "gate_proj",
27
  "q_proj",
28
  "v_proj",
29
+ "o_proj",
 
30
  "down_proj",
31
+ "k_proj",
32
+ "up_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:027c28cbacad0920c7a8ec1a4dbaf396f0658e37d9c57aa24903513cf568bf29
3
  size 161533160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caf08a37d467af8be6ee4d7f8398900235c664d83757737f8601af66dd61bee5
3
  size 161533160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a811f08d635f9fd429d0ac8672eee899607dd871ece10f326b8ec3e7266d9db2
3
  size 323292202
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f21859111603253f67b1ca4afa8ce4858e0978e5685b991a3d9c2883b821035
3
  size 323292202
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:824d4a418ca52dbceab02ca3bdda11d00d54b246084fd87a75671a28233a0cb2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:712366a72cf55e9140e7cb32d65c59ab0aec41cadb87ddf8db2ed2cbbb7181be
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.16556291390728478,
6
  "eval_steps": 100,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -164,6 +164,496 @@
164
  "learning_rate": 5.513245033112583e-05,
165
  "loss": 0.7213,
166
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  }
168
  ],
169
  "logging_steps": 50,
@@ -183,7 +673,7 @@
183
  "attributes": {}
184
  }
185
  },
186
- "total_flos": 7833052747137024.0,
187
  "train_batch_size": 1,
188
  "trial_name": null,
189
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7450331125827815,
6
  "eval_steps": 100,
7
+ "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
164
  "learning_rate": 5.513245033112583e-05,
165
  "loss": 0.7213,
166
  "step": 1000
167
+ },
168
+ {
169
+ "epoch": 0.173841059602649,
170
+ "grad_norm": 1.8289754390716553,
171
+ "learning_rate": 5.789183222958058e-05,
172
+ "loss": 0.7335,
173
+ "step": 1050
174
+ },
175
+ {
176
+ "epoch": 0.18211920529801323,
177
+ "grad_norm": 1.4989681243896484,
178
+ "learning_rate": 6.065121412803533e-05,
179
+ "loss": 0.7326,
180
+ "step": 1100
181
+ },
182
+ {
183
+ "epoch": 0.19039735099337748,
184
+ "grad_norm": 1.5326098203659058,
185
+ "learning_rate": 6.341059602649006e-05,
186
+ "loss": 0.7311,
187
+ "step": 1150
188
+ },
189
+ {
190
+ "epoch": 0.1986754966887417,
191
+ "grad_norm": 1.4897147417068481,
192
+ "learning_rate": 6.616997792494481e-05,
193
+ "loss": 0.6918,
194
+ "step": 1200
195
+ },
196
+ {
197
+ "epoch": 0.20695364238410596,
198
+ "grad_norm": 1.634765863418579,
199
+ "learning_rate": 6.892935982339957e-05,
200
+ "loss": 0.7051,
201
+ "step": 1250
202
+ },
203
+ {
204
+ "epoch": 0.2152317880794702,
205
+ "grad_norm": 1.4463587999343872,
206
+ "learning_rate": 7.168874172185431e-05,
207
+ "loss": 0.6955,
208
+ "step": 1300
209
+ },
210
+ {
211
+ "epoch": 0.22350993377483444,
212
+ "grad_norm": 1.632133960723877,
213
+ "learning_rate": 7.444812362030905e-05,
214
+ "loss": 0.6901,
215
+ "step": 1350
216
+ },
217
+ {
218
+ "epoch": 0.23178807947019867,
219
+ "grad_norm": 1.4062328338623047,
220
+ "learning_rate": 7.72075055187638e-05,
221
+ "loss": 0.6833,
222
+ "step": 1400
223
+ },
224
+ {
225
+ "epoch": 0.24006622516556292,
226
+ "grad_norm": 1.2914466857910156,
227
+ "learning_rate": 7.996688741721855e-05,
228
+ "loss": 0.6663,
229
+ "step": 1450
230
+ },
231
+ {
232
+ "epoch": 0.24834437086092714,
233
+ "grad_norm": 1.4995919466018677,
234
+ "learning_rate": 8.272626931567329e-05,
235
+ "loss": 0.6959,
236
+ "step": 1500
237
+ },
238
+ {
239
+ "epoch": 0.25662251655629137,
240
+ "grad_norm": 1.1299749612808228,
241
+ "learning_rate": 8.548565121412803e-05,
242
+ "loss": 0.6685,
243
+ "step": 1550
244
+ },
245
+ {
246
+ "epoch": 0.26490066225165565,
247
+ "grad_norm": 1.329004168510437,
248
+ "learning_rate": 8.824503311258279e-05,
249
+ "loss": 0.6678,
250
+ "step": 1600
251
+ },
252
+ {
253
+ "epoch": 0.2731788079470199,
254
+ "grad_norm": 1.5191948413848877,
255
+ "learning_rate": 9.100441501103754e-05,
256
+ "loss": 0.6731,
257
+ "step": 1650
258
+ },
259
+ {
260
+ "epoch": 0.2814569536423841,
261
+ "grad_norm": 1.739169716835022,
262
+ "learning_rate": 9.376379690949227e-05,
263
+ "loss": 0.6691,
264
+ "step": 1700
265
+ },
266
+ {
267
+ "epoch": 0.2897350993377483,
268
+ "grad_norm": 1.2906118631362915,
269
+ "learning_rate": 9.652317880794703e-05,
270
+ "loss": 0.6718,
271
+ "step": 1750
272
+ },
273
+ {
274
+ "epoch": 0.2980132450331126,
275
+ "grad_norm": 1.289502501487732,
276
+ "learning_rate": 9.928256070640178e-05,
277
+ "loss": 0.6581,
278
+ "step": 1800
279
+ },
280
+ {
281
+ "epoch": 0.30629139072847683,
282
+ "grad_norm": 1.3923128843307495,
283
+ "learning_rate": 9.999872989402833e-05,
284
+ "loss": 0.6589,
285
+ "step": 1850
286
+ },
287
+ {
288
+ "epoch": 0.31456953642384106,
289
+ "grad_norm": 1.1048816442489624,
290
+ "learning_rate": 9.999297790520483e-05,
291
+ "loss": 0.6341,
292
+ "step": 1900
293
+ },
294
+ {
295
+ "epoch": 0.3228476821192053,
296
+ "grad_norm": 1.3568603992462158,
297
+ "learning_rate": 9.998258777484084e-05,
298
+ "loss": 0.6318,
299
+ "step": 1950
300
+ },
301
+ {
302
+ "epoch": 0.33112582781456956,
303
+ "grad_norm": 0.923786997795105,
304
+ "learning_rate": 9.996756046688961e-05,
305
+ "loss": 0.6318,
306
+ "step": 2000
307
+ },
308
+ {
309
+ "epoch": 0.3394039735099338,
310
+ "grad_norm": 1.102367877960205,
311
+ "learning_rate": 9.994789737552259e-05,
312
+ "loss": 0.6193,
313
+ "step": 2050
314
+ },
315
+ {
316
+ "epoch": 0.347682119205298,
317
+ "grad_norm": 1.0738896131515503,
318
+ "learning_rate": 9.992360032500001e-05,
319
+ "loss": 0.6184,
320
+ "step": 2100
321
+ },
322
+ {
323
+ "epoch": 0.35596026490066224,
324
+ "grad_norm": 1.279288649559021,
325
+ "learning_rate": 9.98946715695016e-05,
326
+ "loss": 0.626,
327
+ "step": 2150
328
+ },
329
+ {
330
+ "epoch": 0.36423841059602646,
331
+ "grad_norm": 1.2009036540985107,
332
+ "learning_rate": 9.986111379291759e-05,
333
+ "loss": 0.6305,
334
+ "step": 2200
335
+ },
336
+ {
337
+ "epoch": 0.37251655629139074,
338
+ "grad_norm": 0.8177038431167603,
339
+ "learning_rate": 9.982293010859955e-05,
340
+ "loss": 0.6266,
341
+ "step": 2250
342
+ },
343
+ {
344
+ "epoch": 0.38079470198675497,
345
+ "grad_norm": 1.2464983463287354,
346
+ "learning_rate": 9.978012405907165e-05,
347
+ "loss": 0.6148,
348
+ "step": 2300
349
+ },
350
+ {
351
+ "epoch": 0.3890728476821192,
352
+ "grad_norm": 1.2841860055923462,
353
+ "learning_rate": 9.973269961570195e-05,
354
+ "loss": 0.5946,
355
+ "step": 2350
356
+ },
357
+ {
358
+ "epoch": 0.3973509933774834,
359
+ "grad_norm": 1.2200431823730469,
360
+ "learning_rate": 9.968066117833401e-05,
361
+ "loss": 0.6166,
362
+ "step": 2400
363
+ },
364
+ {
365
+ "epoch": 0.4056291390728477,
366
+ "grad_norm": 1.128247857093811,
367
+ "learning_rate": 9.962401357487863e-05,
368
+ "loss": 0.5992,
369
+ "step": 2450
370
+ },
371
+ {
372
+ "epoch": 0.4139072847682119,
373
+ "grad_norm": 1.0683091878890991,
374
+ "learning_rate": 9.956276206086597e-05,
375
+ "loss": 0.6048,
376
+ "step": 2500
377
+ },
378
+ {
379
+ "epoch": 0.42218543046357615,
380
+ "grad_norm": 1.1819758415222168,
381
+ "learning_rate": 9.949691231895791e-05,
382
+ "loss": 0.5944,
383
+ "step": 2550
384
+ },
385
+ {
386
+ "epoch": 0.4304635761589404,
387
+ "grad_norm": 1.0043411254882812,
388
+ "learning_rate": 9.942647045842095e-05,
389
+ "loss": 0.5962,
390
+ "step": 2600
391
+ },
392
+ {
393
+ "epoch": 0.43874172185430466,
394
+ "grad_norm": 1.0588668584823608,
395
+ "learning_rate": 9.93514430145593e-05,
396
+ "loss": 0.6067,
397
+ "step": 2650
398
+ },
399
+ {
400
+ "epoch": 0.4470198675496689,
401
+ "grad_norm": 0.9364084601402283,
402
+ "learning_rate": 9.927183694810862e-05,
403
+ "loss": 0.5928,
404
+ "step": 2700
405
+ },
406
+ {
407
+ "epoch": 0.4552980132450331,
408
+ "grad_norm": 1.155172348022461,
409
+ "learning_rate": 9.918765964459022e-05,
410
+ "loss": 0.5987,
411
+ "step": 2750
412
+ },
413
+ {
414
+ "epoch": 0.46357615894039733,
415
+ "grad_norm": 1.1639224290847778,
416
+ "learning_rate": 9.909891891362587e-05,
417
+ "loss": 0.5745,
418
+ "step": 2800
419
+ },
420
+ {
421
+ "epoch": 0.4718543046357616,
422
+ "grad_norm": 0.9658174514770508,
423
+ "learning_rate": 9.900562298821323e-05,
424
+ "loss": 0.5825,
425
+ "step": 2850
426
+ },
427
+ {
428
+ "epoch": 0.48013245033112584,
429
+ "grad_norm": 1.118033766746521,
430
+ "learning_rate": 9.890778052396205e-05,
431
+ "loss": 0.5806,
432
+ "step": 2900
433
+ },
434
+ {
435
+ "epoch": 0.48841059602649006,
436
+ "grad_norm": 0.9781912565231323,
437
+ "learning_rate": 9.880540059829115e-05,
438
+ "loss": 0.5712,
439
+ "step": 2950
440
+ },
441
+ {
442
+ "epoch": 0.4966887417218543,
443
+ "grad_norm": 1.2145684957504272,
444
+ "learning_rate": 9.869849270958622e-05,
445
+ "loss": 0.5855,
446
+ "step": 3000
447
+ },
448
+ {
449
+ "epoch": 0.5049668874172185,
450
+ "grad_norm": 0.999279260635376,
451
+ "learning_rate": 9.858706677631862e-05,
452
+ "loss": 0.5843,
453
+ "step": 3050
454
+ },
455
+ {
456
+ "epoch": 0.5132450331125827,
457
+ "grad_norm": 1.098258137702942,
458
+ "learning_rate": 9.847113313612517e-05,
459
+ "loss": 0.5605,
460
+ "step": 3100
461
+ },
462
+ {
463
+ "epoch": 0.5215231788079471,
464
+ "grad_norm": 0.627949059009552,
465
+ "learning_rate": 9.835070254484912e-05,
466
+ "loss": 0.5538,
467
+ "step": 3150
468
+ },
469
+ {
470
+ "epoch": 0.5298013245033113,
471
+ "grad_norm": 1.0991902351379395,
472
+ "learning_rate": 9.822578617554219e-05,
473
+ "loss": 0.5555,
474
+ "step": 3200
475
+ },
476
+ {
477
+ "epoch": 0.5380794701986755,
478
+ "grad_norm": 0.9670843482017517,
479
+ "learning_rate": 9.8096395617428e-05,
480
+ "loss": 0.5647,
481
+ "step": 3250
482
+ },
483
+ {
484
+ "epoch": 0.5463576158940397,
485
+ "grad_norm": 0.9838133454322815,
486
+ "learning_rate": 9.796254287482693e-05,
487
+ "loss": 0.5561,
488
+ "step": 3300
489
+ },
490
+ {
491
+ "epoch": 0.554635761589404,
492
+ "grad_norm": 1.1465744972229004,
493
+ "learning_rate": 9.782424036604234e-05,
494
+ "loss": 0.559,
495
+ "step": 3350
496
+ },
497
+ {
498
+ "epoch": 0.5629139072847682,
499
+ "grad_norm": 1.1423758268356323,
500
+ "learning_rate": 9.768150092220849e-05,
501
+ "loss": 0.5517,
502
+ "step": 3400
503
+ },
504
+ {
505
+ "epoch": 0.5711920529801324,
506
+ "grad_norm": 1.1365066766738892,
507
+ "learning_rate": 9.753433778610008e-05,
508
+ "loss": 0.5464,
509
+ "step": 3450
510
+ },
511
+ {
512
+ "epoch": 0.5794701986754967,
513
+ "grad_norm": 0.81045001745224,
514
+ "learning_rate": 9.738276461090371e-05,
515
+ "loss": 0.5493,
516
+ "step": 3500
517
+ },
518
+ {
519
+ "epoch": 0.5877483443708609,
520
+ "grad_norm": 1.0236687660217285,
521
+ "learning_rate": 9.72267954589511e-05,
522
+ "loss": 0.567,
523
+ "step": 3550
524
+ },
525
+ {
526
+ "epoch": 0.5960264900662252,
527
+ "grad_norm": 0.9495602250099182,
528
+ "learning_rate": 9.706644480041455e-05,
529
+ "loss": 0.5474,
530
+ "step": 3600
531
+ },
532
+ {
533
+ "epoch": 0.6043046357615894,
534
+ "grad_norm": 0.960738480091095,
535
+ "learning_rate": 9.690172751196437e-05,
536
+ "loss": 0.5238,
537
+ "step": 3650
538
+ },
539
+ {
540
+ "epoch": 0.6125827814569537,
541
+ "grad_norm": 1.0488675832748413,
542
+ "learning_rate": 9.67326588753887e-05,
543
+ "loss": 0.521,
544
+ "step": 3700
545
+ },
546
+ {
547
+ "epoch": 0.6208609271523179,
548
+ "grad_norm": 0.8753538727760315,
549
+ "learning_rate": 9.65592545761758e-05,
550
+ "loss": 0.5232,
551
+ "step": 3750
552
+ },
553
+ {
554
+ "epoch": 0.6291390728476821,
555
+ "grad_norm": 1.0551217794418335,
556
+ "learning_rate": 9.638153070205871e-05,
557
+ "loss": 0.5432,
558
+ "step": 3800
559
+ },
560
+ {
561
+ "epoch": 0.6374172185430463,
562
+ "grad_norm": 1.158676028251648,
563
+ "learning_rate": 9.619950374152278e-05,
564
+ "loss": 0.5416,
565
+ "step": 3850
566
+ },
567
+ {
568
+ "epoch": 0.6456953642384106,
569
+ "grad_norm": 1.0036752223968506,
570
+ "learning_rate": 9.601319058227589e-05,
571
+ "loss": 0.5496,
572
+ "step": 3900
573
+ },
574
+ {
575
+ "epoch": 0.6539735099337748,
576
+ "grad_norm": 0.8905594348907471,
577
+ "learning_rate": 9.58226085096817e-05,
578
+ "loss": 0.5335,
579
+ "step": 3950
580
+ },
581
+ {
582
+ "epoch": 0.6622516556291391,
583
+ "grad_norm": 0.9868190884590149,
584
+ "learning_rate": 9.562777520515598e-05,
585
+ "loss": 0.5094,
586
+ "step": 4000
587
+ },
588
+ {
589
+ "epoch": 0.6705298013245033,
590
+ "grad_norm": 0.9672690629959106,
591
+ "learning_rate": 9.542870874452618e-05,
592
+ "loss": 0.5061,
593
+ "step": 4050
594
+ },
595
+ {
596
+ "epoch": 0.6788079470198676,
597
+ "grad_norm": 1.044123888015747,
598
+ "learning_rate": 9.52254275963545e-05,
599
+ "loss": 0.5253,
600
+ "step": 4100
601
+ },
602
+ {
603
+ "epoch": 0.6870860927152318,
604
+ "grad_norm": 1.0346958637237549,
605
+ "learning_rate": 9.501795062022434e-05,
606
+ "loss": 0.5149,
607
+ "step": 4150
608
+ },
609
+ {
610
+ "epoch": 0.695364238410596,
611
+ "grad_norm": 1.0799248218536377,
612
+ "learning_rate": 9.48062970649907e-05,
613
+ "loss": 0.5207,
614
+ "step": 4200
615
+ },
616
+ {
617
+ "epoch": 0.7036423841059603,
618
+ "grad_norm": 0.9847925901412964,
619
+ "learning_rate": 9.459048656699427e-05,
620
+ "loss": 0.531,
621
+ "step": 4250
622
+ },
623
+ {
624
+ "epoch": 0.7119205298013245,
625
+ "grad_norm": 1.134179949760437,
626
+ "learning_rate": 9.43705391482397e-05,
627
+ "loss": 0.5202,
628
+ "step": 4300
629
+ },
630
+ {
631
+ "epoch": 0.7201986754966887,
632
+ "grad_norm": 0.9750307202339172,
633
+ "learning_rate": 9.414647521453798e-05,
634
+ "loss": 0.5183,
635
+ "step": 4350
636
+ },
637
+ {
638
+ "epoch": 0.7284768211920529,
639
+ "grad_norm": 1.372010350227356,
640
+ "learning_rate": 9.391831555361341e-05,
641
+ "loss": 0.5203,
642
+ "step": 4400
643
+ },
644
+ {
645
+ "epoch": 0.7367549668874173,
646
+ "grad_norm": 0.9671643376350403,
647
+ "learning_rate": 9.36860813331748e-05,
648
+ "loss": 0.5313,
649
+ "step": 4450
650
+ },
651
+ {
652
+ "epoch": 0.7450331125827815,
653
+ "grad_norm": 1.270264983177185,
654
+ "learning_rate": 9.344979409895178e-05,
655
+ "loss": 0.5236,
656
+ "step": 4500
657
  }
658
  ],
659
  "logging_steps": 50,
 
673
  "attributes": {}
674
  }
675
  },
676
+ "total_flos": 3.5270617397723136e+16,
677
  "train_batch_size": 1,
678
  "trial_name": null,
679
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5dd2ef96eff028fc6db83c8627ce2e789cafe652a25ea367c040819bc392f916
3
  size 5752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2543e07a37d2c3de3cd8e1d682eb10ddfc7a8cf84209a331e0b0e44870af81c3
3
  size 5752