FormlessAI commited on
Commit
802a5f3
·
verified ·
1 Parent(s): 35fc41d

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:386a6380325bc3dff1a7a5f881832a0696cbe9be2672febd8c95a996479adb3e
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80d57dd1f2aa0bde9f3bd55de9a262b8de6b8609d2e7c1343bf3751d42242354
3
  size 1037269336
last-checkpoint/global_step9900/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d04fea83d85a84f793425ad94dd0b5eabd724b841c0109c2f5dfdd72f0429f15
3
+ size 781993445
last-checkpoint/global_step9900/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:724d23b792de6b58f7369f916910aedbdb64d653cae44381266f59e3251da219
3
+ size 781993509
last-checkpoint/global_step9900/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f615a7cbf6fa0b3bc685fe9e2f265dfd87c265b0db5cb00b804400ef20670f8
3
+ size 781993509
last-checkpoint/global_step9900/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5462fbd5dd6265ec39a6a030b1dcfff55166fdbc452d28cdc03c5103b4835fed
3
+ size 781993509
last-checkpoint/global_step9900/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:126e49f2ceba5214fc20df2deca8ca69a6fee6bcb0ef9f50375b18efba3677f0
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step9700
 
1
+ global_step9900
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b56969535e9a8e88cd3829c988a0a37451d46c9a48a232e2bf2ff895e958e53f
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d91dc1dd14f8f32c0f2217452eabdba7d9d5c72d5834c18f2d9a544844a06ea2
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac5d72eb18852fad4db4fcc6f4250d07f49de688916884e0bd15cf332644e3c4
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb9b54face23724bdaca5ec09618f36e5c2b8f499be332a0f9475dbaf3eefc21
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e093e8dca30af25bb4868596fab940bd5b96385b2a5252906d4fb7506ec6e3c
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f28748778c6ae6a9269ab98073eb87225303dfa4aad70ad7fd421f531885ed96
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a02a2ce27f65153b8be850fa84fb66458319a4fbe52b6b4116118eb9d4b7ccda
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae8077b24d1ab7d135f5d3fd1b77e547df789862744ffa297b2d183e7403fce2
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5c6f3cc57d69dd40ef86ebd5faf9e78cc6a0d89512a7f5fd9a4c13cda1f059a
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0bb734018af63817744f06e0b869d778449a4d39f667f516ea0fba502652490
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 1.521620512008667,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4100886756796047,
6
  "eval_steps": 50,
7
- "global_step": 9700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -15140,6 +15140,318 @@
15140
  "eval_samples_per_second": 173.601,
15141
  "eval_steps_per_second": 10.886,
15142
  "step": 9700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15143
  }
15144
  ],
15145
  "logging_steps": 5,
@@ -15168,7 +15480,7 @@
15168
  "attributes": {}
15169
  }
15170
  },
15171
- "total_flos": 2.5325929674917806e+18,
15172
  "train_batch_size": 4,
15173
  "trial_name": null,
15174
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 1.5213963985443115,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.4391626689925863,
6
  "eval_steps": 50,
7
+ "global_step": 9900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
15140
  "eval_samples_per_second": 173.601,
15141
  "eval_steps_per_second": 10.886,
15142
  "step": 9700
15143
+ },
15144
+ {
15145
+ "epoch": 1.410815525512429,
15146
+ "grad_norm": 2.4059064388275146,
15147
+ "learning_rate": 2.20844720632845e-07,
15148
+ "loss": 1.6476,
15149
+ "step": 9705
15150
+ },
15151
+ {
15152
+ "epoch": 1.4115423753452536,
15153
+ "grad_norm": 2.4945123195648193,
15154
+ "learning_rate": 2.134520232126146e-07,
15155
+ "loss": 1.6076,
15156
+ "step": 9710
15157
+ },
15158
+ {
15159
+ "epoch": 1.4122692251780782,
15160
+ "grad_norm": 2.8330612182617188,
15161
+ "learning_rate": 2.0618491100792133e-07,
15162
+ "loss": 1.5924,
15163
+ "step": 9715
15164
+ },
15165
+ {
15166
+ "epoch": 1.4129960750109027,
15167
+ "grad_norm": 2.243333339691162,
15168
+ "learning_rate": 1.990434023137036e-07,
15169
+ "loss": 1.5927,
15170
+ "step": 9720
15171
+ },
15172
+ {
15173
+ "epoch": 1.4137229248437273,
15174
+ "grad_norm": 2.5298187732696533,
15175
+ "learning_rate": 1.9202751510870365e-07,
15176
+ "loss": 1.5937,
15177
+ "step": 9725
15178
+ },
15179
+ {
15180
+ "epoch": 1.4144497746765519,
15181
+ "grad_norm": 2.333787679672241,
15182
+ "learning_rate": 1.851372670554175e-07,
15183
+ "loss": 1.5879,
15184
+ "step": 9730
15185
+ },
15186
+ {
15187
+ "epoch": 1.4151766245093764,
15188
+ "grad_norm": 2.3179080486297607,
15189
+ "learning_rate": 1.7837267550002254e-07,
15190
+ "loss": 1.3676,
15191
+ "step": 9735
15192
+ },
15193
+ {
15194
+ "epoch": 1.4159034743422008,
15195
+ "grad_norm": 2.2587573528289795,
15196
+ "learning_rate": 1.7173375747237766e-07,
15197
+ "loss": 1.6639,
15198
+ "step": 9740
15199
+ },
15200
+ {
15201
+ "epoch": 1.4166303241750255,
15202
+ "grad_norm": 2.4788784980773926,
15203
+ "learning_rate": 1.6522052968595648e-07,
15204
+ "loss": 1.6174,
15205
+ "step": 9745
15206
+ },
15207
+ {
15208
+ "epoch": 1.4173571740078499,
15209
+ "grad_norm": 2.38806414604187,
15210
+ "learning_rate": 1.5883300853778604e-07,
15211
+ "loss": 1.7383,
15212
+ "step": 9750
15213
+ },
15214
+ {
15215
+ "epoch": 1.4173571740078499,
15216
+ "eval_loss": 1.52242112159729,
15217
+ "eval_runtime": 19.1417,
15218
+ "eval_samples_per_second": 172.451,
15219
+ "eval_steps_per_second": 10.814,
15220
+ "step": 9750
15221
+ },
15222
+ {
15223
+ "epoch": 1.4180840238406744,
15224
+ "grad_norm": 2.2191879749298096,
15225
+ "learning_rate": 1.5257121010846365e-07,
15226
+ "loss": 1.5705,
15227
+ "step": 9755
15228
+ },
15229
+ {
15230
+ "epoch": 1.418810873673499,
15231
+ "grad_norm": 2.3617305755615234,
15232
+ "learning_rate": 1.464351501620456e-07,
15233
+ "loss": 1.5469,
15234
+ "step": 9760
15235
+ },
15236
+ {
15237
+ "epoch": 1.4195377235063236,
15238
+ "grad_norm": 2.5067806243896484,
15239
+ "learning_rate": 1.404248441460582e-07,
15240
+ "loss": 1.7018,
15241
+ "step": 9765
15242
+ },
15243
+ {
15244
+ "epoch": 1.4202645733391481,
15245
+ "grad_norm": 2.475242853164673,
15246
+ "learning_rate": 1.3454030719143674e-07,
15247
+ "loss": 1.6947,
15248
+ "step": 9770
15249
+ },
15250
+ {
15251
+ "epoch": 1.4209914231719727,
15252
+ "grad_norm": 2.2841944694519043,
15253
+ "learning_rate": 1.2878155411250307e-07,
15254
+ "loss": 1.6853,
15255
+ "step": 9775
15256
+ },
15257
+ {
15258
+ "epoch": 1.4217182730047973,
15259
+ "grad_norm": 2.1120128631591797,
15260
+ "learning_rate": 1.231485994069046e-07,
15261
+ "loss": 1.6585,
15262
+ "step": 9780
15263
+ },
15264
+ {
15265
+ "epoch": 1.4224451228376218,
15266
+ "grad_norm": 2.586662769317627,
15267
+ "learning_rate": 1.1764145725560866e-07,
15268
+ "loss": 1.699,
15269
+ "step": 9785
15270
+ },
15271
+ {
15272
+ "epoch": 1.4231719726704464,
15273
+ "grad_norm": 2.748775005340576,
15274
+ "learning_rate": 1.1226014152282453e-07,
15275
+ "loss": 1.5495,
15276
+ "step": 9790
15277
+ },
15278
+ {
15279
+ "epoch": 1.4238988225032707,
15280
+ "grad_norm": 2.5237104892730713,
15281
+ "learning_rate": 1.0700466575602029e-07,
15282
+ "loss": 1.5464,
15283
+ "step": 9795
15284
+ },
15285
+ {
15286
+ "epoch": 1.4246256723360955,
15287
+ "grad_norm": 2.8664605617523193,
15288
+ "learning_rate": 1.018750431858393e-07,
15289
+ "loss": 1.6628,
15290
+ "step": 9800
15291
+ },
15292
+ {
15293
+ "epoch": 1.4246256723360955,
15294
+ "eval_loss": 1.5222878456115723,
15295
+ "eval_runtime": 19.036,
15296
+ "eval_samples_per_second": 173.408,
15297
+ "eval_steps_per_second": 10.874,
15298
+ "step": 9800
15299
+ },
15300
+ {
15301
+ "epoch": 1.4253525221689198,
15302
+ "grad_norm": 2.546454668045044,
15303
+ "learning_rate": 9.687128672611134e-08,
15304
+ "loss": 1.7066,
15305
+ "step": 9805
15306
+ },
15307
+ {
15308
+ "epoch": 1.4260793720017444,
15309
+ "grad_norm": 2.584137201309204,
15310
+ "learning_rate": 9.199340897378033e-08,
15311
+ "loss": 1.6069,
15312
+ "step": 9810
15313
+ },
15314
+ {
15315
+ "epoch": 1.426806221834569,
15316
+ "grad_norm": 2.591409683227539,
15317
+ "learning_rate": 8.724142220889871e-08,
15318
+ "loss": 1.5393,
15319
+ "step": 9815
15320
+ },
15321
+ {
15322
+ "epoch": 1.4275330716673935,
15323
+ "grad_norm": 2.2875685691833496,
15324
+ "learning_rate": 8.261533839458856e-08,
15325
+ "loss": 1.4082,
15326
+ "step": 9820
15327
+ },
15328
+ {
15329
+ "epoch": 1.428259921500218,
15330
+ "grad_norm": 2.495056390762329,
15331
+ "learning_rate": 7.811516917700819e-08,
15332
+ "loss": 1.6082,
15333
+ "step": 9825
15334
+ },
15335
+ {
15336
+ "epoch": 1.4289867713330426,
15337
+ "grad_norm": 2.618781328201294,
15338
+ "learning_rate": 7.374092588532993e-08,
15339
+ "loss": 1.7317,
15340
+ "step": 9830
15341
+ },
15342
+ {
15343
+ "epoch": 1.4297136211658672,
15344
+ "grad_norm": 2.6624369621276855,
15345
+ "learning_rate": 6.949261953171231e-08,
15346
+ "loss": 1.6049,
15347
+ "step": 9835
15348
+ },
15349
+ {
15350
+ "epoch": 1.4304404709986915,
15351
+ "grad_norm": 3.055304527282715,
15352
+ "learning_rate": 6.537026081124995e-08,
15353
+ "loss": 1.6846,
15354
+ "step": 9840
15355
+ },
15356
+ {
15357
+ "epoch": 1.4311673208315163,
15358
+ "grad_norm": 2.291666269302368,
15359
+ "learning_rate": 6.137386010197918e-08,
15360
+ "loss": 1.5199,
15361
+ "step": 9845
15362
+ },
15363
+ {
15364
+ "epoch": 1.4318941706643407,
15365
+ "grad_norm": 2.2404119968414307,
15366
+ "learning_rate": 5.75034274648391e-08,
15367
+ "loss": 1.6049,
15368
+ "step": 9850
15369
+ },
15370
+ {
15371
+ "epoch": 1.4318941706643407,
15372
+ "eval_loss": 1.522445797920227,
15373
+ "eval_runtime": 18.961,
15374
+ "eval_samples_per_second": 174.094,
15375
+ "eval_steps_per_second": 10.917,
15376
+ "step": 9850
15377
+ },
15378
+ {
15379
+ "epoch": 1.4326210204971652,
15380
+ "grad_norm": 2.296211004257202,
15381
+ "learning_rate": 5.37589726436382e-08,
15382
+ "loss": 1.6874,
15383
+ "step": 9855
15384
+ },
15385
+ {
15386
+ "epoch": 1.4333478703299898,
15387
+ "grad_norm": 2.5468204021453857,
15388
+ "learning_rate": 5.014050506503209e-08,
15389
+ "loss": 1.6244,
15390
+ "step": 9860
15391
+ },
15392
+ {
15393
+ "epoch": 1.4340747201628143,
15394
+ "grad_norm": 2.6297662258148193,
15395
+ "learning_rate": 4.664803383851241e-08,
15396
+ "loss": 1.574,
15397
+ "step": 9865
15398
+ },
15399
+ {
15400
+ "epoch": 1.434801569995639,
15401
+ "grad_norm": 2.9105236530303955,
15402
+ "learning_rate": 4.328156775637343e-08,
15403
+ "loss": 1.6189,
15404
+ "step": 9870
15405
+ },
15406
+ {
15407
+ "epoch": 1.4355284198284635,
15408
+ "grad_norm": 2.4492199420928955,
15409
+ "learning_rate": 4.004111529368426e-08,
15410
+ "loss": 1.5159,
15411
+ "step": 9875
15412
+ },
15413
+ {
15414
+ "epoch": 1.436255269661288,
15415
+ "grad_norm": 2.902602195739746,
15416
+ "learning_rate": 3.6926684608283267e-08,
15417
+ "loss": 1.6313,
15418
+ "step": 9880
15419
+ },
15420
+ {
15421
+ "epoch": 1.4369821194941126,
15422
+ "grad_norm": 2.4516420364379883,
15423
+ "learning_rate": 3.393828354074474e-08,
15424
+ "loss": 1.6638,
15425
+ "step": 9885
15426
+ },
15427
+ {
15428
+ "epoch": 1.4377089693269371,
15429
+ "grad_norm": 2.564882516860962,
15430
+ "learning_rate": 3.107591961436216e-08,
15431
+ "loss": 1.5475,
15432
+ "step": 9890
15433
+ },
15434
+ {
15435
+ "epoch": 1.4384358191597615,
15436
+ "grad_norm": 2.7581264972686768,
15437
+ "learning_rate": 2.8339600035137093e-08,
15438
+ "loss": 1.6877,
15439
+ "step": 9895
15440
+ },
15441
+ {
15442
+ "epoch": 1.4391626689925863,
15443
+ "grad_norm": 2.6273791790008545,
15444
+ "learning_rate": 2.5729331691756963e-08,
15445
+ "loss": 1.7434,
15446
+ "step": 9900
15447
+ },
15448
+ {
15449
+ "epoch": 1.4391626689925863,
15450
+ "eval_loss": 1.5213963985443115,
15451
+ "eval_runtime": 18.8476,
15452
+ "eval_samples_per_second": 175.142,
15453
+ "eval_steps_per_second": 10.983,
15454
+ "step": 9900
15455
  }
15456
  ],
15457
  "logging_steps": 5,
 
15480
  "attributes": {}
15481
  }
15482
  },
15483
+ "total_flos": 2.5848701774263747e+18,
15484
  "train_batch_size": 4,
15485
  "trial_name": null,
15486
  "trial_params": null