ljcamargo commited on
Commit
a93bfd5
·
verified ·
1 Parent(s): b7b0e13

Training in progress, step 750, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17f1e891cbcbfebdb8fb05fb3584ed52b4d300c873323006bab56bff1a83e477
3
  size 3826461296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f773dc94967b7ee6e551db696f34227eb983340cfd6ce1fc1ae2d7d9ba5d943
3
  size 3826461296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f395dc9eec3233076c8a6eab517b6046c5f769c34f8cc42ea1aada63996b4194
3
  size 2479123301
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8632716595be573dc0ed32f03651903484a98a76acfa6f6710d2c042e6a3c5ea
3
  size 2479123301
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff58b41c3672e659a0eb46d9ed11a0ca17415e7a2643a3ddfbaebb9f4e67f8f
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:100000ea5d81ef450688ed224677c94deb5fa0928415e9497ab5b09006179386
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06a53cb7201a63e56775adc37fbc436df78256e51be756e6ae4dae646e834711
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b125c082de6f20d827ac9ce3a7228054a763972dd6779dfe18031391e49829
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2,
6
  "eval_steps": 500,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -358,6 +358,181 @@
358
  "learning_rate": 4.0403225806451614e-05,
359
  "loss": 1.112,
360
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  }
362
  ],
363
  "logging_steps": 10,
@@ -377,7 +552,7 @@
377
  "attributes": {}
378
  }
379
  },
380
- "total_flos": 9020891125518336.0,
381
  "train_batch_size": 2,
382
  "trial_name": null,
383
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3,
6
  "eval_steps": 500,
7
+ "global_step": 750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
358
  "learning_rate": 4.0403225806451614e-05,
359
  "loss": 1.112,
360
  "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.204,
364
+ "grad_norm": 17.082433700561523,
365
+ "learning_rate": 4.020161290322581e-05,
366
+ "loss": 0.9832,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.208,
371
+ "grad_norm": 15.006577491760254,
372
+ "learning_rate": 4e-05,
373
+ "loss": 0.8085,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.212,
378
+ "grad_norm": 23.60434913635254,
379
+ "learning_rate": 3.97983870967742e-05,
380
+ "loss": 1.2125,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.216,
385
+ "grad_norm": 20.58735466003418,
386
+ "learning_rate": 3.959677419354839e-05,
387
+ "loss": 0.9544,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.22,
392
+ "grad_norm": 24.197471618652344,
393
+ "learning_rate": 3.939516129032259e-05,
394
+ "loss": 0.762,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.224,
399
+ "grad_norm": 20.651039123535156,
400
+ "learning_rate": 3.9193548387096776e-05,
401
+ "loss": 0.9309,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.228,
406
+ "grad_norm": 20.045127868652344,
407
+ "learning_rate": 3.901209677419355e-05,
408
+ "loss": 0.8862,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.232,
413
+ "grad_norm": 18.766826629638672,
414
+ "learning_rate": 3.8810483870967744e-05,
415
+ "loss": 0.9722,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.236,
420
+ "grad_norm": 28.41654396057129,
421
+ "learning_rate": 3.860887096774194e-05,
422
+ "loss": 1.0501,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.24,
427
+ "grad_norm": 14.174357414245605,
428
+ "learning_rate": 3.8407258064516134e-05,
429
+ "loss": 0.9859,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.244,
434
+ "grad_norm": 18.710617065429688,
435
+ "learning_rate": 3.820564516129033e-05,
436
+ "loss": 1.0233,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.248,
441
+ "grad_norm": 22.399860382080078,
442
+ "learning_rate": 3.800403225806452e-05,
443
+ "loss": 1.1186,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.252,
448
+ "grad_norm": 18.62351417541504,
449
+ "learning_rate": 3.780241935483871e-05,
450
+ "loss": 1.1073,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.256,
455
+ "grad_norm": 16.706384658813477,
456
+ "learning_rate": 3.7600806451612906e-05,
457
+ "loss": 0.9013,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.26,
462
+ "grad_norm": 14.72787857055664,
463
+ "learning_rate": 3.7399193548387094e-05,
464
+ "loss": 0.8391,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.264,
469
+ "grad_norm": 17.56301498413086,
470
+ "learning_rate": 3.719758064516129e-05,
471
+ "loss": 0.9974,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.268,
476
+ "grad_norm": 18.445892333984375,
477
+ "learning_rate": 3.6995967741935484e-05,
478
+ "loss": 0.9481,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.272,
483
+ "grad_norm": 15.60034465789795,
484
+ "learning_rate": 3.679435483870968e-05,
485
+ "loss": 0.8022,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.276,
490
+ "grad_norm": 23.985820770263672,
491
+ "learning_rate": 3.659274193548387e-05,
492
+ "loss": 0.8189,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.28,
497
+ "grad_norm": 26.464324951171875,
498
+ "learning_rate": 3.639112903225806e-05,
499
+ "loss": 0.8447,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.284,
504
+ "grad_norm": 24.74170684814453,
505
+ "learning_rate": 3.6189516129032256e-05,
506
+ "loss": 0.8458,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.288,
511
+ "grad_norm": 15.681292533874512,
512
+ "learning_rate": 3.598790322580645e-05,
513
+ "loss": 0.9417,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.292,
518
+ "grad_norm": 18.264453887939453,
519
+ "learning_rate": 3.5786290322580645e-05,
520
+ "loss": 0.8536,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.296,
525
+ "grad_norm": 15.522205352783203,
526
+ "learning_rate": 3.558467741935484e-05,
527
+ "loss": 0.6493,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.3,
532
+ "grad_norm": 36.796165466308594,
533
+ "learning_rate": 3.5383064516129035e-05,
534
+ "loss": 0.8446,
535
+ "step": 750
536
  }
537
  ],
538
  "logging_steps": 10,
 
552
  "attributes": {}
553
  }
554
  },
555
+ "total_flos": 1.3512970727276544e+16,
556
  "train_batch_size": 2,
557
  "trial_name": null,
558
  "trial_params": null