moos124 committed on
Commit
0fb857a
·
verified ·
1 Parent(s): 5996656

Training in progress, step 4680, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a93ca27c93bb18118dc8faf2e9e6f8dd528574415ecd725b43e22e5bed162969
3
  size 70430032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a6492ef322d5597fea48d081cb22f028e55d23c7221c1f5d2c0d52b36383977
3
  size 70430032
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1087e683ebd568a145ddfabb7d2efe7d820451cbb4ec74cd3bab8abd48801ea
3
  size 141058579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c9b537d59fc8f54ed6f9bc67a07461d112b9d35e7bdba97c29c8aeae6b9c47d
3
  size 141058579
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3240f0aebc4d6a296fd2d5c3d89b0dfb51f76d5bee9087d1bdcb98f947fcc35d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:627c156134610fd8bc9611a809d7d0e96a5b62384327d5b22ed21fa23dd24cb0
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13a20148b8ca05863768cad01be7d695bc69f08669f517b5f3fd3a6d6e738c47
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a7021d506222316729103157c09f6bde2051538c7c9b802480f486047db26ae
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9365333333333333,
6
  "eval_steps": 500,
7
- "global_step": 4390,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4408,6 +4408,296 @@
4408
  "mean_token_accuracy": 0.764773941040039,
4409
  "num_tokens": 20432372.0,
4410
  "step": 4390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4411
  }
4412
  ],
4413
  "logging_steps": 10,
@@ -4427,7 +4717,7 @@
4427
  "attributes": {}
4428
  }
4429
  },
4430
- "total_flos": 9.673928307549082e+16,
4431
  "train_batch_size": 4,
4432
  "trial_name": null,
4433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9984,
6
  "eval_steps": 500,
7
+ "global_step": 4680,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4408
  "mean_token_accuracy": 0.764773941040039,
4409
  "num_tokens": 20432372.0,
4410
  "step": 4390
4411
+ },
4412
+ {
4413
+ "entropy": 0.9848338901996613,
4414
+ "epoch": 0.9386666666666666,
4415
+ "grad_norm": 0.2765245735645294,
4416
+ "learning_rate": 5.756862957908433e-05,
4417
+ "loss": 1.1192432403564454,
4418
+ "mean_token_accuracy": 0.7547446370124817,
4419
+ "num_tokens": 20481366.0,
4420
+ "step": 4400
4421
+ },
4422
+ {
4423
+ "entropy": 0.9790939651429653,
4424
+ "epoch": 0.9408,
4425
+ "grad_norm": 0.23915551602840424,
4426
+ "learning_rate": 5.739750748666606e-05,
4427
+ "loss": 1.036961555480957,
4428
+ "mean_token_accuracy": 0.7573970347642899,
4429
+ "num_tokens": 20526985.0,
4430
+ "step": 4410
4431
+ },
4432
+ {
4433
+ "entropy": 0.9054527454078197,
4434
+ "epoch": 0.9429333333333333,
4435
+ "grad_norm": 0.24054944515228271,
4436
+ "learning_rate": 5.7226296761186274e-05,
4437
+ "loss": 0.9758554458618164,
4438
+ "mean_token_accuracy": 0.7724366948008538,
4439
+ "num_tokens": 20571815.0,
4440
+ "step": 4420
4441
+ },
4442
+ {
4443
+ "entropy": 0.9364707127213479,
4444
+ "epoch": 0.9450666666666667,
4445
+ "grad_norm": 0.28272607922554016,
4446
+ "learning_rate": 5.705499945400223e-05,
4447
+ "loss": 1.0225676536560058,
4448
+ "mean_token_accuracy": 0.7622330486774445,
4449
+ "num_tokens": 20615072.0,
4450
+ "step": 4430
4451
+ },
4452
+ {
4453
+ "entropy": 1.0657535366714002,
4454
+ "epoch": 0.9472,
4455
+ "grad_norm": 0.23734600841999054,
4456
+ "learning_rate": 5.688361761750861e-05,
4457
+ "loss": 1.1335111618041993,
4458
+ "mean_token_accuracy": 0.7402229458093643,
4459
+ "num_tokens": 20666534.0,
4460
+ "step": 4440
4461
+ },
4462
+ {
4463
+ "entropy": 0.9826746597886086,
4464
+ "epoch": 0.9493333333333334,
4465
+ "grad_norm": 0.28600969910621643,
4466
+ "learning_rate": 5.671215330511283e-05,
4467
+ "loss": 1.066628646850586,
4468
+ "mean_token_accuracy": 0.7560828119516373,
4469
+ "num_tokens": 20715376.0,
4470
+ "step": 4450
4471
+ },
4472
+ {
4473
+ "entropy": 0.9109843887388707,
4474
+ "epoch": 0.9514666666666667,
4475
+ "grad_norm": 0.2514685392379761,
4476
+ "learning_rate": 5.65406085712105e-05,
4477
+ "loss": 1.0114540100097655,
4478
+ "mean_token_accuracy": 0.7724284827709198,
4479
+ "num_tokens": 20758838.0,
4480
+ "step": 4460
4481
+ },
4482
+ {
4483
+ "entropy": 0.8498819716274738,
4484
+ "epoch": 0.9536,
4485
+ "grad_norm": 0.28889158368110657,
4486
+ "learning_rate": 5.6368985471160804e-05,
4487
+ "loss": 0.9062424659729004,
4488
+ "mean_token_accuracy": 0.785689315199852,
4489
+ "num_tokens": 20799444.0,
4490
+ "step": 4470
4491
+ },
4492
+ {
4493
+ "entropy": 0.8840778715908527,
4494
+ "epoch": 0.9557333333333333,
4495
+ "grad_norm": 0.2577449083328247,
4496
+ "learning_rate": 5.6197286061261875e-05,
4497
+ "loss": 0.9439300537109375,
4498
+ "mean_token_accuracy": 0.7696003526449203,
4499
+ "num_tokens": 20843766.0,
4500
+ "step": 4480
4501
+ },
4502
+ {
4503
+ "entropy": 0.8888865426182747,
4504
+ "epoch": 0.9578666666666666,
4505
+ "grad_norm": 0.27302756905555725,
4506
+ "learning_rate": 5.602551239872616e-05,
4507
+ "loss": 0.9372305870056152,
4508
+ "mean_token_accuracy": 0.7730641543865204,
4509
+ "num_tokens": 20888764.0,
4510
+ "step": 4490
4511
+ },
4512
+ {
4513
+ "entropy": 0.9558203481137753,
4514
+ "epoch": 0.96,
4515
+ "grad_norm": 0.3576233386993408,
4516
+ "learning_rate": 5.58536665416557e-05,
4517
+ "loss": 1.0556070327758789,
4518
+ "mean_token_accuracy": 0.762606156617403,
4519
+ "num_tokens": 20936028.0,
4520
+ "step": 4500
4521
+ },
4522
+ {
4523
+ "entropy": 0.9054192140698433,
4524
+ "epoch": 0.9621333333333333,
4525
+ "grad_norm": 0.2521965205669403,
4526
+ "learning_rate": 5.568175054901763e-05,
4527
+ "loss": 0.9705222129821778,
4528
+ "mean_token_accuracy": 0.7672724887728691,
4529
+ "num_tokens": 20985057.0,
4530
+ "step": 4510
4531
+ },
4532
+ {
4533
+ "entropy": 0.9011006608605385,
4534
+ "epoch": 0.9642666666666667,
4535
+ "grad_norm": 0.27024832367897034,
4536
+ "learning_rate": 5.550976648061934e-05,
4537
+ "loss": 0.9830186843872071,
4538
+ "mean_token_accuracy": 0.7754541039466858,
4539
+ "num_tokens": 21028567.0,
4540
+ "step": 4520
4541
+ },
4542
+ {
4543
+ "entropy": 0.9991332605481148,
4544
+ "epoch": 0.9664,
4545
+ "grad_norm": 0.2703147828578949,
4546
+ "learning_rate": 5.533771639708388e-05,
4547
+ "loss": 1.1589097023010253,
4548
+ "mean_token_accuracy": 0.7532796613872051,
4549
+ "num_tokens": 21072699.0,
4550
+ "step": 4530
4551
+ },
4552
+ {
4553
+ "entropy": 0.9183724671602249,
4554
+ "epoch": 0.9685333333333334,
4555
+ "grad_norm": 0.2243046760559082,
4556
+ "learning_rate": 5.516560235982527e-05,
4557
+ "loss": 0.9856460571289063,
4558
+ "mean_token_accuracy": 0.771567003428936,
4559
+ "num_tokens": 21121413.0,
4560
+ "step": 4540
4561
+ },
4562
+ {
4563
+ "entropy": 0.8655671834945678,
4564
+ "epoch": 0.9706666666666667,
4565
+ "grad_norm": 0.3306775987148285,
4566
+ "learning_rate": 5.499342643102381e-05,
4567
+ "loss": 0.9172829627990723,
4568
+ "mean_token_accuracy": 0.777653044462204,
4569
+ "num_tokens": 21162927.0,
4570
+ "step": 4550
4571
+ },
4572
+ {
4573
+ "entropy": 0.9436637915670871,
4574
+ "epoch": 0.9728,
4575
+ "grad_norm": 0.2542389929294586,
4576
+ "learning_rate": 5.482119067360132e-05,
4577
+ "loss": 1.0658721923828125,
4578
+ "mean_token_accuracy": 0.767835621535778,
4579
+ "num_tokens": 21206936.0,
4580
+ "step": 4560
4581
+ },
4582
+ {
4583
+ "entropy": 0.7974261797964572,
4584
+ "epoch": 0.9749333333333333,
4585
+ "grad_norm": 0.24307052791118622,
4586
+ "learning_rate": 5.4648897151196455e-05,
4587
+ "loss": 0.8578211784362793,
4588
+ "mean_token_accuracy": 0.7923481151461601,
4589
+ "num_tokens": 21252732.0,
4590
+ "step": 4570
4591
+ },
4592
+ {
4593
+ "entropy": 0.9691430673003196,
4594
+ "epoch": 0.9770666666666666,
4595
+ "grad_norm": 0.2720329165458679,
4596
+ "learning_rate": 5.447654792814e-05,
4597
+ "loss": 1.0459741592407226,
4598
+ "mean_token_accuracy": 0.7617560073733329,
4599
+ "num_tokens": 21298972.0,
4600
+ "step": 4580
4601
+ },
4602
+ {
4603
+ "entropy": 0.9178217075765133,
4604
+ "epoch": 0.9792,
4605
+ "grad_norm": 0.2640475630760193,
4606
+ "learning_rate": 5.4304145069430115e-05,
4607
+ "loss": 1.0324625015258788,
4608
+ "mean_token_accuracy": 0.7745086327195168,
4609
+ "num_tokens": 21348870.0,
4610
+ "step": 4590
4611
+ },
4612
+ {
4613
+ "entropy": 0.8973256818950176,
4614
+ "epoch": 0.9813333333333333,
4615
+ "grad_norm": 0.2828875184059143,
4616
+ "learning_rate": 5.4131690640707574e-05,
4617
+ "loss": 0.9894962310791016,
4618
+ "mean_token_accuracy": 0.7752941563725472,
4619
+ "num_tokens": 21390716.0,
4620
+ "step": 4600
4621
+ },
4622
+ {
4623
+ "entropy": 0.9490196861326694,
4624
+ "epoch": 0.9834666666666667,
4625
+ "grad_norm": 0.27414020895957947,
4626
+ "learning_rate": 5.3959186708231046e-05,
4627
+ "loss": 1.0264591217041015,
4628
+ "mean_token_accuracy": 0.7639399319887161,
4629
+ "num_tokens": 21440700.0,
4630
+ "step": 4610
4631
+ },
4632
+ {
4633
+ "entropy": 0.9219519071280956,
4634
+ "epoch": 0.9856,
4635
+ "grad_norm": 0.2545549273490906,
4636
+ "learning_rate": 5.3786635338852346e-05,
4637
+ "loss": 1.0511361122131349,
4638
+ "mean_token_accuracy": 0.7739394150674344,
4639
+ "num_tokens": 21483867.0,
4640
+ "step": 4620
4641
+ },
4642
+ {
4643
+ "entropy": 0.99324054941535,
4644
+ "epoch": 0.9877333333333334,
4645
+ "grad_norm": 0.272182434797287,
4646
+ "learning_rate": 5.361403859999161e-05,
4647
+ "loss": 1.116584587097168,
4648
+ "mean_token_accuracy": 0.7553175091743469,
4649
+ "num_tokens": 21535354.0,
4650
+ "step": 4630
4651
+ },
4652
+ {
4653
+ "entropy": 0.8828953221440315,
4654
+ "epoch": 0.9898666666666667,
4655
+ "grad_norm": 0.29537713527679443,
4656
+ "learning_rate": 5.344139855961262e-05,
4657
+ "loss": 0.9682372093200684,
4658
+ "mean_token_accuracy": 0.7781552016735077,
4659
+ "num_tokens": 21578265.0,
4660
+ "step": 4640
4661
+ },
4662
+ {
4663
+ "entropy": 0.9005228154361248,
4664
+ "epoch": 0.992,
4665
+ "grad_norm": 0.3032234013080597,
4666
+ "learning_rate": 5.3268717286197945e-05,
4667
+ "loss": 0.9423254013061524,
4668
+ "mean_token_accuracy": 0.7735077708959579,
4669
+ "num_tokens": 21618545.0,
4670
+ "step": 4650
4671
+ },
4672
+ {
4673
+ "entropy": 0.8464630447328091,
4674
+ "epoch": 0.9941333333333333,
4675
+ "grad_norm": 0.32000964879989624,
4676
+ "learning_rate": 5.3095996848724184e-05,
4677
+ "loss": 0.9030919075012207,
4678
+ "mean_token_accuracy": 0.7863337904214859,
4679
+ "num_tokens": 21657735.0,
4680
+ "step": 4660
4681
+ },
4682
+ {
4683
+ "entropy": 0.8923816077411175,
4684
+ "epoch": 0.9962666666666666,
4685
+ "grad_norm": 0.3551577627658844,
4686
+ "learning_rate": 5.292323931663719e-05,
4687
+ "loss": 0.9792759895324707,
4688
+ "mean_token_accuracy": 0.7739578939974308,
4689
+ "num_tokens": 21705183.0,
4690
+ "step": 4670
4691
+ },
4692
+ {
4693
+ "entropy": 0.9760521411895752,
4694
+ "epoch": 0.9984,
4695
+ "grad_norm": 0.2613706886768341,
4696
+ "learning_rate": 5.275044675982724e-05,
4697
+ "loss": 1.055685043334961,
4698
+ "mean_token_accuracy": 0.7623668745160103,
4699
+ "num_tokens": 21747104.0,
4700
+ "step": 4680
4701
  }
4702
  ],
4703
  "logging_steps": 10,
 
4717
  "attributes": {}
4718
  }
4719
  },
4720
+ "total_flos": 1.0298188561140326e+17,
4721
  "train_batch_size": 4,
4722
  "trial_name": null,
4723
  "trial_params": null