Azrail commited on
Commit
cdef79e
·
verified ·
1 Parent(s): a921518

Training in progress, step 138000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b38436cae5381f691ba804b915e325932d55429d83532b1470e95efd579a29b
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:294d2d3cfce69d5bcc552541aff1b1d0c5c39d6adabe16e718423a5d850f0d32
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b3d9c01ac2fd401fd65707f0e1d6a24eefcca9fe471c863196aa9b97efe6f47
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57a69c4accd4194b5ef200a371a59ef019db1dfd38dcb87b64dd42832f583b7c
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7354a4e3d8de85b55d51bbeb0dfcfc86efd5d09ac4e401efe6b4ee83bc0b66a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdb15604f71f08bf635b865cf27878158a353a64f3dcaa6e5902e3e52c7eb375
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bf416de216a0fa7180c9c5b3632984e63b58047aa8bc6d944e50f798fb000d5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d87ee32367beeb896fbea0e404a77621c8cd628a4eb1251b30dc94e06f2eb792
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.3069892793684486,
6
  "eval_steps": 500,
7
- "global_step": 137000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24394,11 +24394,189 @@
24394
  "eval_steps_per_second": 15.101,
24395
  "num_input_tokens_seen": 71815816608,
24396
  "step": 137000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24397
  }
24398
  ],
24399
  "logging_steps": 50,
24400
  "max_steps": 140000,
24401
- "num_input_tokens_seen": 71815816608,
24402
  "num_train_epochs": 2,
24403
  "save_steps": 1000,
24404
  "stateful_callbacks": {
@@ -24413,7 +24591,7 @@
24413
  "attributes": {}
24414
  }
24415
  },
24416
- "total_flos": 1.271008961912107e+20,
24417
  "train_batch_size": 32,
24418
  "trial_name": null,
24419
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.316529329692214,
6
  "eval_steps": 500,
7
+ "global_step": 138000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24394
  "eval_steps_per_second": 15.101,
24395
  "num_input_tokens_seen": 71815816608,
24396
  "step": 137000
24397
+ },
24398
+ {
24399
+ "epoch": 1.307466281884637,
24400
+ "grad_norm": 0.11646866798400879,
24401
+ "learning_rate": 2.7139335546282283e-05,
24402
+ "loss": 2.0325,
24403
+ "num_input_tokens_seen": 71842030368,
24404
+ "step": 137050
24405
+ },
24406
+ {
24407
+ "epoch": 1.3079432844008252,
24408
+ "grad_norm": 0.10989837348461151,
24409
+ "learning_rate": 2.6235218056235634e-05,
24410
+ "loss": 2.0325,
24411
+ "num_input_tokens_seen": 71868244768,
24412
+ "step": 137100
24413
+ },
24414
+ {
24415
+ "epoch": 1.3084202869170136,
24416
+ "grad_norm": 0.11658209562301636,
24417
+ "learning_rate": 2.5346010829944367e-05,
24418
+ "loss": 2.0289,
24419
+ "num_input_tokens_seen": 71894452160,
24420
+ "step": 137150
24421
+ },
24422
+ {
24423
+ "epoch": 1.3088972894332018,
24424
+ "grad_norm": 0.11487242579460144,
24425
+ "learning_rate": 2.4471741852423235e-05,
24426
+ "loss": 2.0322,
24427
+ "num_input_tokens_seen": 71920664928,
24428
+ "step": 137200
24429
+ },
24430
+ {
24431
+ "epoch": 1.30937429194939,
24432
+ "grad_norm": 0.11544458568096161,
24433
+ "learning_rate": 2.3612438638551835e-05,
24434
+ "loss": 2.0279,
24435
+ "num_input_tokens_seen": 71946876896,
24436
+ "step": 137250
24437
+ },
24438
+ {
24439
+ "epoch": 1.3098512944655782,
24440
+ "grad_norm": 0.11500503867864609,
24441
+ "learning_rate": 2.276812823220964e-05,
24442
+ "loss": 2.0399,
24443
+ "num_input_tokens_seen": 71973091200,
24444
+ "step": 137300
24445
+ },
24446
+ {
24447
+ "epoch": 1.3103282969817664,
24448
+ "grad_norm": 0.11575910449028015,
24449
+ "learning_rate": 2.1938837205424e-05,
24450
+ "loss": 2.0246,
24451
+ "num_input_tokens_seen": 71999300832,
24452
+ "step": 137350
24453
+ },
24454
+ {
24455
+ "epoch": 1.3108052994979549,
24456
+ "grad_norm": 0.1175985336303711,
24457
+ "learning_rate": 2.1124591657534777e-05,
24458
+ "loss": 2.0225,
24459
+ "num_input_tokens_seen": 72025515232,
24460
+ "step": 137400
24461
+ },
24462
+ {
24463
+ "epoch": 1.311282302014143,
24464
+ "grad_norm": 0.11688115447759628,
24465
+ "learning_rate": 2.032541721437209e-05,
24466
+ "loss": 2.024,
24467
+ "num_input_tokens_seen": 72051723040,
24468
+ "step": 137450
24469
+ },
24470
+ {
24471
+ "epoch": 1.3117593045303315,
24472
+ "grad_norm": 0.11419174075126648,
24473
+ "learning_rate": 1.9541339027450256e-05,
24474
+ "loss": 2.0254,
24475
+ "num_input_tokens_seen": 72077935168,
24476
+ "step": 137500
24477
+ },
24478
+ {
24479
+ "epoch": 1.3117593045303315,
24480
+ "eval_loss": 1.951472282409668,
24481
+ "eval_runtime": 83.1149,
24482
+ "eval_samples_per_second": 60.158,
24483
+ "eval_steps_per_second": 15.039,
24484
+ "num_input_tokens_seen": 72077935168,
24485
+ "step": 137500
24486
+ },
24487
+ {
24488
+ "epoch": 1.3122363070465197,
24489
+ "grad_norm": 0.11731937527656555,
24490
+ "learning_rate": 1.8772381773176416e-05,
24491
+ "loss": 2.0368,
24492
+ "num_input_tokens_seen": 72104145664,
24493
+ "step": 137550
24494
+ },
24495
+ {
24496
+ "epoch": 1.312713309562708,
24497
+ "grad_norm": 0.11281976848840714,
24498
+ "learning_rate": 1.801856965207338e-05,
24499
+ "loss": 2.0243,
24500
+ "num_input_tokens_seen": 72130351488,
24501
+ "step": 137600
24502
+ },
24503
+ {
24504
+ "epoch": 1.3131903120788961,
24505
+ "grad_norm": 0.12566816806793213,
24506
+ "learning_rate": 1.7279926388018564e-05,
24507
+ "loss": 2.0266,
24508
+ "num_input_tokens_seen": 72156564000,
24509
+ "step": 137650
24510
+ },
24511
+ {
24512
+ "epoch": 1.3136673145950846,
24513
+ "grad_norm": 0.1202327162027359,
24514
+ "learning_rate": 1.6556475227496815e-05,
24515
+ "loss": 2.0344,
24516
+ "num_input_tokens_seen": 72182768800,
24517
+ "step": 137700
24518
+ },
24519
+ {
24520
+ "epoch": 1.3141443171112728,
24521
+ "grad_norm": 0.11209400743246078,
24522
+ "learning_rate": 1.584823893886933e-05,
24523
+ "loss": 2.0307,
24524
+ "num_input_tokens_seen": 72208977472,
24525
+ "step": 137750
24526
+ },
24527
+ {
24528
+ "epoch": 1.314621319627461,
24529
+ "grad_norm": 0.11281031370162964,
24530
+ "learning_rate": 1.5155239811656562e-05,
24531
+ "loss": 2.0285,
24532
+ "num_input_tokens_seen": 72235186752,
24533
+ "step": 137800
24534
+ },
24535
+ {
24536
+ "epoch": 1.3150983221436494,
24537
+ "grad_norm": 0.11977609992027283,
24538
+ "learning_rate": 1.4477499655837278e-05,
24539
+ "loss": 2.0307,
24540
+ "num_input_tokens_seen": 72261390432,
24541
+ "step": 137850
24542
+ },
24543
+ {
24544
+ "epoch": 1.3155753246598376,
24545
+ "grad_norm": 0.11602313071489334,
24546
+ "learning_rate": 1.3815039801161721e-05,
24547
+ "loss": 2.0272,
24548
+ "num_input_tokens_seen": 72287596960,
24549
+ "step": 137900
24550
+ },
24551
+ {
24552
+ "epoch": 1.3160523271760258,
24553
+ "grad_norm": 0.11629103124141693,
24554
+ "learning_rate": 1.3167881096480372e-05,
24555
+ "loss": 2.0426,
24556
+ "num_input_tokens_seen": 72313806912,
24557
+ "step": 137950
24558
+ },
24559
+ {
24560
+ "epoch": 1.316529329692214,
24561
+ "grad_norm": 0.11337430030107498,
24562
+ "learning_rate": 1.2536043909088191e-05,
24563
+ "loss": 2.0286,
24564
+ "num_input_tokens_seen": 72340003200,
24565
+ "step": 138000
24566
+ },
24567
+ {
24568
+ "epoch": 1.316529329692214,
24569
+ "eval_loss": 1.9512444734573364,
24570
+ "eval_runtime": 82.1325,
24571
+ "eval_samples_per_second": 60.877,
24572
+ "eval_steps_per_second": 15.219,
24573
+ "num_input_tokens_seen": 72340003200,
24574
+ "step": 138000
24575
  }
24576
  ],
24577
  "logging_steps": 50,
24578
  "max_steps": 140000,
24579
+ "num_input_tokens_seen": 72340003200,
24580
  "num_train_epochs": 2,
24581
  "save_steps": 1000,
24582
  "stateful_callbacks": {
 
24591
  "attributes": {}
24592
  }
24593
  },
24594
+ "total_flos": 1.2802861084741632e+20,
24595
  "train_batch_size": 32,
24596
  "trial_name": null,
24597
  "trial_params": null