Azrail commited on
Commit
e1700dd
·
verified ·
1 Parent(s): 8debba9

Training in progress, step 132000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18f6247fa697227171786e92b63492b81203ba9ab620eea2a35269c2dc5abc91
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9e57d7de320997016d5d2199393f3c6d5ccbb8649da9e46ae713874cd8a8e24
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a79b728b1351b728e46db09ab4e3bda84220fcf605f8e84a1af65a7e98ccf401
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f36f287a4da99bdaf6e0deca55af9eddec679234fd5785a93d74c5b7275a731
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13afedcbea29e4911157dfdebca89adaca3015ec55fbe8952619bfb77f49f98b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3da6ad8ffd940afd42f47dbccd6a99fedee37b4e239b9c682223ad1635ee1326
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d530307a60624b67b44a38452390579f46394dc6c46c3e7e0b33446906fdcfb9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21aed170a2d0b5ca9750f891383cff878afad1161fd25ef679259c6d8c42258b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.249748977425856,
6
  "eval_steps": 500,
7
- "global_step": 131000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -23326,11 +23326,189 @@
23326
  "eval_steps_per_second": 15.164,
23327
  "num_input_tokens_seen": 68670633664,
23328
  "step": 131000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23329
  }
23330
  ],
23331
  "logging_steps": 50,
23332
  "max_steps": 140000,
23333
- "num_input_tokens_seen": 68670633664,
23334
  "num_train_epochs": 2,
23335
  "save_steps": 1000,
23336
  "stateful_callbacks": {
@@ -23345,7 +23523,7 @@
23345
  "attributes": {}
23346
  }
23347
  },
23348
- "total_flos": 1.2153449606169969e+20,
23349
  "train_batch_size": 32,
23350
  "trial_name": null,
23351
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2592890277496214,
6
  "eval_steps": 500,
7
+ "global_step": 132000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
23326
  "eval_steps_per_second": 15.164,
23327
  "num_input_tokens_seen": 68670633664,
23328
  "step": 131000
23329
+ },
23330
+ {
23331
+ "epoch": 1.2502259799420443,
23332
+ "grad_norm": 0.13041457533836365,
23333
+ "learning_rate": 0.000231613104386454,
23334
+ "loss": 2.0362,
23335
+ "num_input_tokens_seen": 68696842016,
23336
+ "step": 131050
23337
+ },
23338
+ {
23339
+ "epoch": 1.2507029824582325,
23340
+ "grad_norm": 0.1306309849023819,
23341
+ "learning_rate": 0.00022925069366813716,
23342
+ "loss": 2.0593,
23343
+ "num_input_tokens_seen": 68723054176,
23344
+ "step": 131100
23345
+ },
23346
+ {
23347
+ "epoch": 1.2511799849744207,
23348
+ "grad_norm": 0.12761172652244568,
23349
+ "learning_rate": 0.00022689680393686457,
23350
+ "loss": 2.0496,
23351
+ "num_input_tokens_seen": 68749263552,
23352
+ "step": 131150
23353
+ },
23354
+ {
23355
+ "epoch": 1.251656987490609,
23356
+ "grad_norm": 0.12187056988477707,
23357
+ "learning_rate": 0.0002245515092739488,
23358
+ "loss": 2.0417,
23359
+ "num_input_tokens_seen": 68775477952,
23360
+ "step": 131200
23361
+ },
23362
+ {
23363
+ "epoch": 1.2521339900067971,
23364
+ "grad_norm": 0.12770666182041168,
23365
+ "learning_rate": 0.00022221488349019903,
23366
+ "loss": 2.0332,
23367
+ "num_input_tokens_seen": 68801692352,
23368
+ "step": 131250
23369
+ },
23370
+ {
23371
+ "epoch": 1.2526109925229856,
23372
+ "grad_norm": 0.13457396626472473,
23373
+ "learning_rate": 0.00021988700012359863,
23374
+ "loss": 2.0393,
23375
+ "num_input_tokens_seen": 68827900832,
23376
+ "step": 131300
23377
+ },
23378
+ {
23379
+ "epoch": 1.2530879950391738,
23380
+ "grad_norm": 0.12845295667648315,
23381
+ "learning_rate": 0.0002175679324369913,
23382
+ "loss": 2.0507,
23383
+ "num_input_tokens_seen": 68854107328,
23384
+ "step": 131350
23385
+ },
23386
+ {
23387
+ "epoch": 1.2535649975553622,
23388
+ "grad_norm": 0.12990029156208038,
23389
+ "learning_rate": 0.00021525775341577403,
23390
+ "loss": 2.0373,
23391
+ "num_input_tokens_seen": 68880316256,
23392
+ "step": 131400
23393
+ },
23394
+ {
23395
+ "epoch": 1.2540420000715504,
23396
+ "grad_norm": 0.12344187498092651,
23397
+ "learning_rate": 0.00021295653576560165,
23398
+ "loss": 2.0359,
23399
+ "num_input_tokens_seen": 68906521376,
23400
+ "step": 131450
23401
+ },
23402
+ {
23403
+ "epoch": 1.2545190025877386,
23404
+ "grad_norm": 0.12487955391407013,
23405
+ "learning_rate": 0.00021066435191009715,
23406
+ "loss": 2.0432,
23407
+ "num_input_tokens_seen": 68932735776,
23408
+ "step": 131500
23409
+ },
23410
+ {
23411
+ "epoch": 1.2545190025877386,
23412
+ "eval_loss": 1.9613933563232422,
23413
+ "eval_runtime": 82.9225,
23414
+ "eval_samples_per_second": 60.297,
23415
+ "eval_steps_per_second": 15.074,
23416
+ "num_input_tokens_seen": 68932735776,
23417
+ "step": 131500
23418
+ },
23419
+ {
23420
+ "epoch": 1.2549960051039268,
23421
+ "grad_norm": 0.13224980235099792,
23422
+ "learning_rate": 0.00020838127398857382,
23423
+ "loss": 2.0413,
23424
+ "num_input_tokens_seen": 68958946656,
23425
+ "step": 131550
23426
+ },
23427
+ {
23428
+ "epoch": 1.2554730076201153,
23429
+ "grad_norm": 0.12449366599321365,
23430
+ "learning_rate": 0.00020610737385376348,
23431
+ "loss": 2.0503,
23432
+ "num_input_tokens_seen": 68985155520,
23433
+ "step": 131600
23434
+ },
23435
+ {
23436
+ "epoch": 1.2559500101363035,
23437
+ "grad_norm": 0.12943805754184723,
23438
+ "learning_rate": 0.0002038427230695565,
23439
+ "loss": 2.0476,
23440
+ "num_input_tokens_seen": 69011368384,
23441
+ "step": 131650
23442
+ },
23443
+ {
23444
+ "epoch": 1.2564270126524917,
23445
+ "grad_norm": 0.1288331300020218,
23446
+ "learning_rate": 0.00020158739290874821,
23447
+ "loss": 2.0458,
23448
+ "num_input_tokens_seen": 69037580736,
23449
+ "step": 131700
23450
+ },
23451
+ {
23452
+ "epoch": 1.25690401516868,
23453
+ "grad_norm": 0.12655895948410034,
23454
+ "learning_rate": 0.00019934145435079704,
23455
+ "loss": 2.0474,
23456
+ "num_input_tokens_seen": 69063793760,
23457
+ "step": 131750
23458
+ },
23459
+ {
23460
+ "epoch": 1.2573810176848683,
23461
+ "grad_norm": 0.1263783723115921,
23462
+ "learning_rate": 0.0001971049780795901,
23463
+ "loss": 2.0387,
23464
+ "num_input_tokens_seen": 69090002496,
23465
+ "step": 131800
23466
+ },
23467
+ {
23468
+ "epoch": 1.2578580202010565,
23469
+ "grad_norm": 0.13202515244483948,
23470
+ "learning_rate": 0.0001948780344812181,
23471
+ "loss": 2.0531,
23472
+ "num_input_tokens_seen": 69116216896,
23473
+ "step": 131850
23474
+ },
23475
+ {
23476
+ "epoch": 1.2583350227172447,
23477
+ "grad_norm": 0.12061940133571625,
23478
+ "learning_rate": 0.00019266069364176142,
23479
+ "loss": 2.052,
23480
+ "num_input_tokens_seen": 69142427680,
23481
+ "step": 131900
23482
+ },
23483
+ {
23484
+ "epoch": 1.2588120252334332,
23485
+ "grad_norm": 0.1222308874130249,
23486
+ "learning_rate": 0.00019045302534508295,
23487
+ "loss": 2.0409,
23488
+ "num_input_tokens_seen": 69168631136,
23489
+ "step": 131950
23490
+ },
23491
+ {
23492
+ "epoch": 1.2592890277496214,
23493
+ "grad_norm": 0.11664976924657822,
23494
+ "learning_rate": 0.00018825509907063325,
23495
+ "loss": 2.0361,
23496
+ "num_input_tokens_seen": 69194840608,
23497
+ "step": 132000
23498
+ },
23499
+ {
23500
+ "epoch": 1.2592890277496214,
23501
+ "eval_loss": 1.9602855443954468,
23502
+ "eval_runtime": 82.6066,
23503
+ "eval_samples_per_second": 60.528,
23504
+ "eval_steps_per_second": 15.132,
23505
+ "num_input_tokens_seen": 69194840608,
23506
+ "step": 132000
23507
  }
23508
  ],
23509
  "logging_steps": 50,
23510
  "max_steps": 140000,
23511
+ "num_input_tokens_seen": 69194840608,
23512
  "num_train_epochs": 2,
23513
  "save_steps": 1000,
23514
  "stateful_callbacks": {
 
23523
  "attributes": {}
23524
  }
23525
  },
23526
+ "total_flos": 1.224622467372331e+20,
23527
  "train_batch_size": 32,
23528
  "trial_name": null,
23529
  "trial_params": null