LLJYY commited on
Commit
ee3159e
·
verified ·
1 Parent(s): 164dab4

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd2ce83fb862336b4729018afc8e291aa4c2d38ce3b3b5625756b6a68e191913
3
  size 174663600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd24ee6828501b624fa6d66fd1194cee27acdf6fbf4040fa3393ed025f1e0b8
3
  size 174663600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c4a1c0542d95372b9b98cd04e9e19b1d3278913800f038cdb84306140c9e0f5
3
  size 177908997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57c4a4dab1575e19036cc179b540af28afc954075c76fa1c3f74f467b18a0a54
3
  size 177908997
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b03751160f25dd1f7c08604bdbd7711f070d950dfb96d9acede0b0ccf333222
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04eefe07496c9ea6eacb03b570d4b4b5896211d650c0810a1180d502bea3bcc3
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9803335b183336349e91b866e4b2332f37cecb5e5bd9cf6a14b120c0067b5d71
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:119c8b8031efeada1dd54137e4c5ca8dc90f054b53a8f73cacb65b1b4acc4f58
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4724223455769458,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -408,6 +408,206 @@
408
  "mean_token_accuracy": 0.9370592629909515,
409
  "num_tokens": 16532116.0,
410
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  }
412
  ],
413
  "logging_steps": 25,
@@ -427,7 +627,7 @@
427
  "attributes": {}
428
  }
429
  },
430
- "total_flos": 1.0567860712982623e+18,
431
  "train_batch_size": 2,
432
  "trial_name": null,
433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7086335183654187,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
408
  "mean_token_accuracy": 0.9370592629909515,
409
  "num_tokens": 16532116.0,
410
  "step": 1000
411
+ },
412
+ {
413
+ "entropy": 0.25085023198276757,
414
+ "epoch": 0.48423290421636944,
415
+ "grad_norm": 0.28125,
416
+ "learning_rate": 9.886701615567638e-05,
417
+ "loss": 0.2465,
418
+ "mean_token_accuracy": 0.9340272305905819,
419
+ "num_tokens": 16953093.0,
420
+ "step": 1025
421
+ },
422
+ {
423
+ "entropy": 0.2260630092769861,
424
+ "epoch": 0.4960434628557931,
425
+ "grad_norm": 0.2412109375,
426
+ "learning_rate": 9.871695705615726e-05,
427
+ "loss": 0.2226,
428
+ "mean_token_accuracy": 0.9398330296576023,
429
+ "num_tokens": 17370548.0,
430
+ "step": 1050
431
+ },
432
+ {
433
+ "entropy": 0.22553566984832288,
434
+ "epoch": 0.5078540214952167,
435
+ "grad_norm": 0.2578125,
436
+ "learning_rate": 9.85576972621137e-05,
437
+ "loss": 0.2204,
438
+ "mean_token_accuracy": 0.9398049999773502,
439
+ "num_tokens": 17776168.0,
440
+ "step": 1075
441
+ },
442
+ {
443
+ "entropy": 0.22235366519540548,
444
+ "epoch": 0.5196645801346403,
445
+ "grad_norm": 0.255859375,
446
+ "learning_rate": 9.838926685138316e-05,
447
+ "loss": 0.2185,
448
+ "mean_token_accuracy": 0.9404512317478657,
449
+ "num_tokens": 18184751.0,
450
+ "step": 1100
451
+ },
452
+ {
453
+ "entropy": 0.22692312018945812,
454
+ "epoch": 0.531475138774064,
455
+ "grad_norm": 0.27734375,
456
+ "learning_rate": 9.821169763376763e-05,
457
+ "loss": 0.2211,
458
+ "mean_token_accuracy": 0.939469782114029,
459
+ "num_tokens": 18593080.0,
460
+ "step": 1125
461
+ },
462
+ {
463
+ "entropy": 0.22736175518482923,
464
+ "epoch": 0.5432856974134876,
465
+ "grad_norm": 0.2392578125,
466
+ "learning_rate": 9.802502314502607e-05,
467
+ "loss": 0.224,
468
+ "mean_token_accuracy": 0.9391257779300213,
469
+ "num_tokens": 19006337.0,
470
+ "step": 1150
471
+ },
472
+ {
473
+ "entropy": 0.22326932862401008,
474
+ "epoch": 0.5550962560529114,
475
+ "grad_norm": 0.259765625,
476
+ "learning_rate": 9.782927864054075e-05,
477
+ "loss": 0.2207,
478
+ "mean_token_accuracy": 0.9402251268923283,
479
+ "num_tokens": 19408218.0,
480
+ "step": 1175
481
+ },
482
+ {
483
+ "entropy": 0.21780600540339948,
484
+ "epoch": 0.566906814692335,
485
+ "grad_norm": 0.294921875,
486
+ "learning_rate": 9.762450108865908e-05,
487
+ "loss": 0.2149,
488
+ "mean_token_accuracy": 0.9419379141926766,
489
+ "num_tokens": 19824621.0,
490
+ "step": 1200
491
+ },
492
+ {
493
+ "entropy": 0.21694308878853918,
494
+ "epoch": 0.5787173733317585,
495
+ "grad_norm": 0.2734375,
496
+ "learning_rate": 9.741072916371157e-05,
497
+ "loss": 0.2118,
498
+ "mean_token_accuracy": 0.9426406294107437,
499
+ "num_tokens": 20253370.0,
500
+ "step": 1225
501
+ },
502
+ {
503
+ "entropy": 0.20722413221374153,
504
+ "epoch": 0.5905279319711823,
505
+ "grad_norm": 0.220703125,
506
+ "learning_rate": 9.718800323870792e-05,
507
+ "loss": 0.2032,
508
+ "mean_token_accuracy": 0.9452938529849052,
509
+ "num_tokens": 20667551.0,
510
+ "step": 1250
511
+ },
512
+ {
513
+ "entropy": 0.21645578092895448,
514
+ "epoch": 0.6023384906106058,
515
+ "grad_norm": 0.2392578125,
516
+ "learning_rate": 9.695636537771212e-05,
517
+ "loss": 0.2123,
518
+ "mean_token_accuracy": 0.9423565396666527,
519
+ "num_tokens": 21073835.0,
520
+ "step": 1275
521
+ },
522
+ {
523
+ "entropy": 0.20943242628127337,
524
+ "epoch": 0.6141490492500296,
525
+ "grad_norm": 0.251953125,
526
+ "learning_rate": 9.671585932789821e-05,
527
+ "loss": 0.2062,
528
+ "mean_token_accuracy": 0.9441554906964302,
529
+ "num_tokens": 21484157.0,
530
+ "step": 1300
531
+ },
532
+ {
533
+ "entropy": 0.20016088901087642,
534
+ "epoch": 0.6259596078894532,
535
+ "grad_norm": 0.314453125,
536
+ "learning_rate": 9.64665305112882e-05,
537
+ "loss": 0.1956,
538
+ "mean_token_accuracy": 0.9467314165830613,
539
+ "num_tokens": 21892824.0,
540
+ "step": 1325
541
+ },
542
+ {
543
+ "entropy": 0.2107498816680163,
544
+ "epoch": 0.6377701665288769,
545
+ "grad_norm": 0.287109375,
546
+ "learning_rate": 9.620842601617366e-05,
547
+ "loss": 0.2073,
548
+ "mean_token_accuracy": 0.9435116010904312,
549
+ "num_tokens": 22305526.0,
550
+ "step": 1350
551
+ },
552
+ {
553
+ "entropy": 0.1972426986414939,
554
+ "epoch": 0.6495807251683005,
555
+ "grad_norm": 0.2099609375,
556
+ "learning_rate": 9.594159458822257e-05,
557
+ "loss": 0.1938,
558
+ "mean_token_accuracy": 0.9460577602684498,
559
+ "num_tokens": 22716778.0,
560
+ "step": 1375
561
+ },
562
+ {
563
+ "entropy": 0.19388799883425237,
564
+ "epoch": 0.661391283807724,
565
+ "grad_norm": 0.1953125,
566
+ "learning_rate": 9.56660866212733e-05,
567
+ "loss": 0.1893,
568
+ "mean_token_accuracy": 0.9480688716471195,
569
+ "num_tokens": 23139666.0,
570
+ "step": 1400
571
+ },
572
+ {
573
+ "entropy": 0.19959449878893792,
574
+ "epoch": 0.6732018424471478,
575
+ "grad_norm": 0.2314453125,
576
+ "learning_rate": 9.538195414781707e-05,
577
+ "loss": 0.1964,
578
+ "mean_token_accuracy": 0.9466811017692089,
579
+ "num_tokens": 23541786.0,
580
+ "step": 1425
581
+ },
582
+ {
583
+ "entropy": 0.2100939876586199,
584
+ "epoch": 0.6850124010865714,
585
+ "grad_norm": 0.22265625,
586
+ "learning_rate": 9.50892508291713e-05,
587
+ "loss": 0.2064,
588
+ "mean_token_accuracy": 0.9439319328963757,
589
+ "num_tokens": 23951416.0,
590
+ "step": 1450
591
+ },
592
+ {
593
+ "entropy": 0.19447497643530368,
594
+ "epoch": 0.6968229597259951,
595
+ "grad_norm": 0.2265625,
596
+ "learning_rate": 9.478803194534486e-05,
597
+ "loss": 0.1914,
598
+ "mean_token_accuracy": 0.9480419178307057,
599
+ "num_tokens": 24374545.0,
600
+ "step": 1475
601
+ },
602
+ {
603
+ "entropy": 0.1884359281975776,
604
+ "epoch": 0.7086335183654187,
605
+ "grad_norm": 0.22265625,
606
+ "learning_rate": 9.447835438459811e-05,
607
+ "loss": 0.185,
608
+ "mean_token_accuracy": 0.9487812982499599,
609
+ "num_tokens": 24793698.0,
610
+ "step": 1500
611
  }
612
  ],
613
  "logging_steps": 25,
 
627
  "attributes": {}
628
  }
629
  },
630
+ "total_flos": 1.587361750240276e+18,
631
  "train_batch_size": 2,
632
  "trial_name": null,
633
  "trial_params": null