Wilsonwin commited on
Commit
57c325e
·
verified ·
1 Parent(s): 0495ef7

Training in progress, step 10500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9da6829b1edfacc61441699b4ac6d5dc6abb737be9152be8f29e5862abecd54
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5fec734d804f38e6a6185a808dee5da674e72210df9519b259fe7268fc0f656
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d9c41bcb1f7e3d0ff7cf1e9246c52eba5532bd32a5af7bbe5d88c8501561fc3
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0663ccd7ad33c0549c2779e5d083150bba32ec4d7c6fbd8222ef934fa81f78bc
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0f02b717c272316648da49ca6391d63601d6d8e37a3b73ce0655aa44e0b1efd
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0abf895a019505b4c56af8dcadc1605517fe2ff5671df4ee01bebae31eb06b83
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53471871a37f3cc35b4a656a6f0cfda18046c304a91d9bf8b29b14eea2ccc156
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f96c5626b64f285225e7bd0540a942ee4b22f3baba9f0a0f2189b039b8bf46c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6894745734076704,
6
  "eval_steps": 500,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7175,6 +7175,364 @@
7175
  "eval_samples_per_second": 257.563,
7176
  "eval_steps_per_second": 5.409,
7177
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7178
  }
7179
  ],
7180
  "logging_steps": 10,
@@ -7194,7 +7552,7 @@
7194
  "attributes": {}
7195
  }
7196
  },
7197
- "total_flos": 3.344547305037496e+17,
7198
  "train_batch_size": 48,
7199
  "trial_name": null,
7200
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.7739483020780538,
6
  "eval_steps": 500,
7
+ "global_step": 10500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7175
  "eval_samples_per_second": 257.563,
7176
  "eval_steps_per_second": 5.409,
7177
  "step": 10000
7178
+ },
7179
+ {
7180
+ "epoch": 1.691164047981078,
7181
+ "grad_norm": 0.4754472076892853,
7182
+ "learning_rate": 2.4865303937104007e-05,
7183
+ "loss": 4.285601425170898,
7184
+ "step": 10010
7185
+ },
7186
+ {
7187
+ "epoch": 1.6928535225544854,
7188
+ "grad_norm": 0.4589325785636902,
7189
+ "learning_rate": 2.460181551284876e-05,
7190
+ "loss": 4.316444396972656,
7191
+ "step": 10020
7192
+ },
7193
+ {
7194
+ "epoch": 1.6945429971278934,
7195
+ "grad_norm": 0.4686416685581207,
7196
+ "learning_rate": 2.433960581310091e-05,
7197
+ "loss": 4.296805191040039,
7198
+ "step": 10030
7199
+ },
7200
+ {
7201
+ "epoch": 1.6962324717013009,
7202
+ "grad_norm": 0.44992297887802124,
7203
+ "learning_rate": 2.4078677511694776e-05,
7204
+ "loss": 4.326528930664063,
7205
+ "step": 10040
7206
+ },
7207
+ {
7208
+ "epoch": 1.6979219462747086,
7209
+ "grad_norm": 0.460001677274704,
7210
+ "learning_rate": 2.381903326939777e-05,
7211
+ "loss": 4.270325088500977,
7212
+ "step": 10050
7213
+ },
7214
+ {
7215
+ "epoch": 1.6996114208481163,
7216
+ "grad_norm": 0.45742112398147583,
7217
+ "learning_rate": 2.356067573388355e-05,
7218
+ "loss": 4.311310958862305,
7219
+ "step": 10060
7220
+ },
7221
+ {
7222
+ "epoch": 1.7013008954215239,
7223
+ "grad_norm": 0.5051885843276978,
7224
+ "learning_rate": 2.3303607539704628e-05,
7225
+ "loss": 4.305488586425781,
7226
+ "step": 10070
7227
+ },
7228
+ {
7229
+ "epoch": 1.7029903699949316,
7230
+ "grad_norm": 0.460809588432312,
7231
+ "learning_rate": 2.3047831308265845e-05,
7232
+ "loss": 4.284737777709961,
7233
+ "step": 10080
7234
+ },
7235
+ {
7236
+ "epoch": 1.7046798445683393,
7237
+ "grad_norm": 0.48899003863334656,
7238
+ "learning_rate": 2.2793349647797372e-05,
7239
+ "loss": 4.308148956298828,
7240
+ "step": 10090
7241
+ },
7242
+ {
7243
+ "epoch": 1.7063693191417468,
7244
+ "grad_norm": 0.47210270166397095,
7245
+ "learning_rate": 2.2540165153328345e-05,
7246
+ "loss": 4.300167465209961,
7247
+ "step": 10100
7248
+ },
7249
+ {
7250
+ "epoch": 1.7080587937151546,
7251
+ "grad_norm": 0.4561355710029602,
7252
+ "learning_rate": 2.2288280406660237e-05,
7253
+ "loss": 4.295189285278321,
7254
+ "step": 10110
7255
+ },
7256
+ {
7257
+ "epoch": 1.7097482682885623,
7258
+ "grad_norm": 0.4685342013835907,
7259
+ "learning_rate": 2.2037697976340525e-05,
7260
+ "loss": 4.32569465637207,
7261
+ "step": 10120
7262
+ },
7263
+ {
7264
+ "epoch": 1.7114377428619698,
7265
+ "grad_norm": 0.4782038629055023,
7266
+ "learning_rate": 2.1788420417636704e-05,
7267
+ "loss": 4.281776046752929,
7268
+ "step": 10130
7269
+ },
7270
+ {
7271
+ "epoch": 1.7131272174353775,
7272
+ "grad_norm": 0.45496320724487305,
7273
+ "learning_rate": 2.1540450272509986e-05,
7274
+ "loss": 4.289628219604492,
7275
+ "step": 10140
7276
+ },
7277
+ {
7278
+ "epoch": 1.7148166920087853,
7279
+ "grad_norm": 0.4686676263809204,
7280
+ "learning_rate": 2.129379006958944e-05,
7281
+ "loss": 4.304840087890625,
7282
+ "step": 10150
7283
+ },
7284
+ {
7285
+ "epoch": 1.7165061665821928,
7286
+ "grad_norm": 0.45078393816947937,
7287
+ "learning_rate": 2.104844232414634e-05,
7288
+ "loss": 4.333132934570313,
7289
+ "step": 10160
7290
+ },
7291
+ {
7292
+ "epoch": 1.7181956411556008,
7293
+ "grad_norm": 0.4551495313644409,
7294
+ "learning_rate": 2.080440953806844e-05,
7295
+ "loss": 4.313465118408203,
7296
+ "step": 10170
7297
+ },
7298
+ {
7299
+ "epoch": 1.7198851157290083,
7300
+ "grad_norm": 0.46409592032432556,
7301
+ "learning_rate": 2.056169419983432e-05,
7302
+ "loss": 4.303678131103515,
7303
+ "step": 10180
7304
+ },
7305
+ {
7306
+ "epoch": 1.721574590302416,
7307
+ "grad_norm": 0.46051809191703796,
7308
+ "learning_rate": 2.0320298784488177e-05,
7309
+ "loss": 4.297393798828125,
7310
+ "step": 10190
7311
+ },
7312
+ {
7313
+ "epoch": 1.7232640648758237,
7314
+ "grad_norm": 0.541107714176178,
7315
+ "learning_rate": 2.008022575361464e-05,
7316
+ "loss": 4.302070617675781,
7317
+ "step": 10200
7318
+ },
7319
+ {
7320
+ "epoch": 1.7249535394492312,
7321
+ "grad_norm": 0.46840059757232666,
7322
+ "learning_rate": 1.9841477555313428e-05,
7323
+ "loss": 4.290169143676758,
7324
+ "step": 10210
7325
+ },
7326
+ {
7327
+ "epoch": 1.726643014022639,
7328
+ "grad_norm": 0.46939900517463684,
7329
+ "learning_rate": 1.960405662417458e-05,
7330
+ "loss": 4.315706634521485,
7331
+ "step": 10220
7332
+ },
7333
+ {
7334
+ "epoch": 1.7283324885960467,
7335
+ "grad_norm": 0.4771457016468048,
7336
+ "learning_rate": 1.9367965381253632e-05,
7337
+ "loss": 4.289263534545898,
7338
+ "step": 10230
7339
+ },
7340
+ {
7341
+ "epoch": 1.7300219631694542,
7342
+ "grad_norm": 0.48085805773735046,
7343
+ "learning_rate": 1.9133206234046833e-05,
7344
+ "loss": 4.3228507995605465,
7345
+ "step": 10240
7346
+ },
7347
+ {
7348
+ "epoch": 1.731711437742862,
7349
+ "grad_norm": 0.4604587256908417,
7350
+ "learning_rate": 1.8899781576466605e-05,
7351
+ "loss": 4.296081924438477,
7352
+ "step": 10250
7353
+ },
7354
+ {
7355
+ "epoch": 1.7334009123162697,
7356
+ "grad_norm": 0.4865635633468628,
7357
+ "learning_rate": 1.86676937888172e-05,
7358
+ "loss": 4.302744674682617,
7359
+ "step": 10260
7360
+ },
7361
+ {
7362
+ "epoch": 1.7350903868896772,
7363
+ "grad_norm": 0.4594942033290863,
7364
+ "learning_rate": 1.8436945237770347e-05,
7365
+ "loss": 4.3057910919189455,
7366
+ "step": 10270
7367
+ },
7368
+ {
7369
+ "epoch": 1.736779861463085,
7370
+ "grad_norm": 0.4511856734752655,
7371
+ "learning_rate": 1.8207538276341255e-05,
7372
+ "loss": 4.311210632324219,
7373
+ "step": 10280
7374
+ },
7375
+ {
7376
+ "epoch": 1.7384693360364927,
7377
+ "grad_norm": 0.46823564171791077,
7378
+ "learning_rate": 1.7979475243864422e-05,
7379
+ "loss": 4.291423797607422,
7380
+ "step": 10290
7381
+ },
7382
+ {
7383
+ "epoch": 1.7401588106099002,
7384
+ "grad_norm": 0.456841379404068,
7385
+ "learning_rate": 1.7752758465969835e-05,
7386
+ "loss": 4.291481781005859,
7387
+ "step": 10300
7388
+ },
7389
+ {
7390
+ "epoch": 1.7418482851833081,
7391
+ "grad_norm": 0.464433491230011,
7392
+ "learning_rate": 1.7527390254559564e-05,
7393
+ "loss": 4.306121826171875,
7394
+ "step": 10310
7395
+ },
7396
+ {
7397
+ "epoch": 1.7435377597567157,
7398
+ "grad_norm": 0.43991556763648987,
7399
+ "learning_rate": 1.7303372907783646e-05,
7400
+ "loss": 4.288319778442383,
7401
+ "step": 10320
7402
+ },
7403
+ {
7404
+ "epoch": 1.7452272343301232,
7405
+ "grad_norm": 0.4612221121788025,
7406
+ "learning_rate": 1.708070871001704e-05,
7407
+ "loss": 4.296160125732422,
7408
+ "step": 10330
7409
+ },
7410
+ {
7411
+ "epoch": 1.7469167089035311,
7412
+ "grad_norm": 0.4536151587963104,
7413
+ "learning_rate": 1.6859399931836182e-05,
7414
+ "loss": 4.302063751220703,
7415
+ "step": 10340
7416
+ },
7417
+ {
7418
+ "epoch": 1.7486061834769386,
7419
+ "grad_norm": 0.47430509328842163,
7420
+ "learning_rate": 1.663944882999596e-05,
7421
+ "loss": 4.320109176635742,
7422
+ "step": 10350
7423
+ },
7424
+ {
7425
+ "epoch": 1.7502956580503464,
7426
+ "grad_norm": 0.4570671021938324,
7427
+ "learning_rate": 1.6420857647406533e-05,
7428
+ "loss": 4.309846115112305,
7429
+ "step": 10360
7430
+ },
7431
+ {
7432
+ "epoch": 1.751985132623754,
7433
+ "grad_norm": 0.46541541814804077,
7434
+ "learning_rate": 1.6203628613110513e-05,
7435
+ "loss": 4.321808242797852,
7436
+ "step": 10370
7437
+ },
7438
+ {
7439
+ "epoch": 1.7536746071971616,
7440
+ "grad_norm": 0.4689694941043854,
7441
+ "learning_rate": 1.598776394226035e-05,
7442
+ "loss": 4.342444992065429,
7443
+ "step": 10380
7444
+ },
7445
+ {
7446
+ "epoch": 1.7553640817705694,
7447
+ "grad_norm": 0.4656012952327728,
7448
+ "learning_rate": 1.5773265836095615e-05,
7449
+ "loss": 4.284120178222656,
7450
+ "step": 10390
7451
+ },
7452
+ {
7453
+ "epoch": 1.757053556343977,
7454
+ "grad_norm": 0.44993332028388977,
7455
+ "learning_rate": 1.5560136481920583e-05,
7456
+ "loss": 4.305658340454102,
7457
+ "step": 10400
7458
+ },
7459
+ {
7460
+ "epoch": 1.7587430309173846,
7461
+ "grad_norm": 0.45734935998916626,
7462
+ "learning_rate": 1.5348378053081885e-05,
7463
+ "loss": 4.28479232788086,
7464
+ "step": 10410
7465
+ },
7466
+ {
7467
+ "epoch": 1.7604325054907923,
7468
+ "grad_norm": 0.48875826597213745,
7469
+ "learning_rate": 1.5137992708946522e-05,
7470
+ "loss": 4.30067024230957,
7471
+ "step": 10420
7472
+ },
7473
+ {
7474
+ "epoch": 1.7621219800642,
7475
+ "grad_norm": 0.4599165618419647,
7476
+ "learning_rate": 1.4928982594879602e-05,
7477
+ "loss": 4.302487564086914,
7478
+ "step": 10430
7479
+ },
7480
+ {
7481
+ "epoch": 1.7638114546376076,
7482
+ "grad_norm": 0.45845454931259155,
7483
+ "learning_rate": 1.4721349842222623e-05,
7484
+ "loss": 4.285428619384765,
7485
+ "step": 10440
7486
+ },
7487
+ {
7488
+ "epoch": 1.7655009292110155,
7489
+ "grad_norm": 0.4705585539340973,
7490
+ "learning_rate": 1.4515096568271728e-05,
7491
+ "loss": 4.30066032409668,
7492
+ "step": 10450
7493
+ },
7494
+ {
7495
+ "epoch": 1.767190403784423,
7496
+ "grad_norm": 0.47239530086517334,
7497
+ "learning_rate": 1.4310224876256071e-05,
7498
+ "loss": 4.3203174591064455,
7499
+ "step": 10460
7500
+ },
7501
+ {
7502
+ "epoch": 1.7688798783578306,
7503
+ "grad_norm": 0.4652308225631714,
7504
+ "learning_rate": 1.410673685531638e-05,
7505
+ "loss": 4.307133483886719,
7506
+ "step": 10470
7507
+ },
7508
+ {
7509
+ "epoch": 1.7705693529312385,
7510
+ "grad_norm": 0.4624398946762085,
7511
+ "learning_rate": 1.390463458048357e-05,
7512
+ "loss": 4.315113830566406,
7513
+ "step": 10480
7514
+ },
7515
+ {
7516
+ "epoch": 1.772258827504646,
7517
+ "grad_norm": 0.46324899792671204,
7518
+ "learning_rate": 1.3703920112657856e-05,
7519
+ "loss": 4.304290771484375,
7520
+ "step": 10490
7521
+ },
7522
+ {
7523
+ "epoch": 1.7739483020780538,
7524
+ "grad_norm": 0.4670204222202301,
7525
+ "learning_rate": 1.3504595498587378e-05,
7526
+ "loss": 4.301520919799804,
7527
+ "step": 10500
7528
+ },
7529
+ {
7530
+ "epoch": 1.7739483020780538,
7531
+ "eval_loss": 4.273873805999756,
7532
+ "eval_runtime": 4.0508,
7533
+ "eval_samples_per_second": 246.863,
7534
+ "eval_steps_per_second": 5.184,
7535
+ "step": 10500
7536
  }
7537
  ],
7538
  "logging_steps": 10,
 
7552
  "attributes": {}
7553
  }
7554
  },
7555
+ "total_flos": 3.511775715466936e+17,
7556
  "train_batch_size": 48,
7557
  "trial_name": null,
7558
  "trial_params": null