AgPerry's picture
Upload num12 10pct midtrain checkpoint-620
4514624 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 620,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016145307769929364,
"grad_norm": 1.2564789264326033,
"learning_rate": 1.4516129032258066e-06,
"loss": 0.7453375816345215,
"step": 10
},
{
"epoch": 0.03229061553985873,
"grad_norm": 0.7821283299054873,
"learning_rate": 3.0645161290322584e-06,
"loss": 0.6848444461822509,
"step": 20
},
{
"epoch": 0.04843592330978809,
"grad_norm": 0.48911008950007523,
"learning_rate": 4.67741935483871e-06,
"loss": 0.6249249935150146,
"step": 30
},
{
"epoch": 0.06458123107971746,
"grad_norm": 0.48038005351170454,
"learning_rate": 6.290322580645162e-06,
"loss": 0.594818115234375,
"step": 40
},
{
"epoch": 0.08072653884964683,
"grad_norm": 0.5691753419661809,
"learning_rate": 7.903225806451613e-06,
"loss": 0.556916618347168,
"step": 50
},
{
"epoch": 0.09687184661957618,
"grad_norm": 0.4427583428156907,
"learning_rate": 9.516129032258065e-06,
"loss": 0.5476222991943359,
"step": 60
},
{
"epoch": 0.11301715438950555,
"grad_norm": 0.6279589261316931,
"learning_rate": 9.99611750215541e-06,
"loss": 0.5416336536407471,
"step": 70
},
{
"epoch": 0.12916246215943492,
"grad_norm": 0.4755087891144107,
"learning_rate": 9.977115699791622e-06,
"loss": 0.5427621841430664,
"step": 80
},
{
"epoch": 0.14530776992936428,
"grad_norm": 0.47753673953077186,
"learning_rate": 9.942341621640558e-06,
"loss": 0.5373227596282959,
"step": 90
},
{
"epoch": 0.16145307769929365,
"grad_norm": 0.45055986756766414,
"learning_rate": 9.89190546533151e-06,
"loss": 0.5228934288024902,
"step": 100
},
{
"epoch": 0.17759838546922302,
"grad_norm": 0.4280290012967081,
"learning_rate": 9.825967060977933e-06,
"loss": 0.5259585380554199,
"step": 110
},
{
"epoch": 0.19374369323915236,
"grad_norm": 1.3885595528460506,
"learning_rate": 9.744735364682347e-06,
"loss": 0.5162020683288574,
"step": 120
},
{
"epoch": 0.20988900100908173,
"grad_norm": 0.46180163742899866,
"learning_rate": 9.648467796363019e-06,
"loss": 0.5223239898681641,
"step": 130
},
{
"epoch": 0.2260343087790111,
"grad_norm": 0.5854886166691778,
"learning_rate": 9.53746942400078e-06,
"loss": 0.5147815227508545,
"step": 140
},
{
"epoch": 0.24217961654894046,
"grad_norm": 0.4453241177281989,
"learning_rate": 9.412091996891097e-06,
"loss": 0.5132325172424317,
"step": 150
},
{
"epoch": 0.25832492431886983,
"grad_norm": 0.5292387781901823,
"learning_rate": 9.272732830964948e-06,
"loss": 0.5157772541046143,
"step": 160
},
{
"epoch": 0.27447023208879917,
"grad_norm": 0.44718182092406705,
"learning_rate": 9.119833549710927e-06,
"loss": 0.5131439685821533,
"step": 170
},
{
"epoch": 0.29061553985872857,
"grad_norm": 0.490885659513409,
"learning_rate": 8.953878684688492e-06,
"loss": 0.5093417644500733,
"step": 180
},
{
"epoch": 0.3067608476286579,
"grad_norm": 0.4132644245198082,
"learning_rate": 8.775394140067299e-06,
"loss": 0.5086756706237793,
"step": 190
},
{
"epoch": 0.3229061553985873,
"grad_norm": 0.4174212395756048,
"learning_rate": 8.584945526058426e-06,
"loss": 0.5185892105102539,
"step": 200
},
{
"epoch": 0.33905146316851664,
"grad_norm": 0.4204703411929031,
"learning_rate": 8.383136366518788e-06,
"loss": 0.5063163757324218,
"step": 210
},
{
"epoch": 0.35519677093844604,
"grad_norm": 0.5602588417199708,
"learning_rate": 8.170606186408756e-06,
"loss": 0.5080778121948242,
"step": 220
},
{
"epoch": 0.3713420787083754,
"grad_norm": 0.722448363378425,
"learning_rate": 7.948028485163744e-06,
"loss": 0.5074011802673339,
"step": 230
},
{
"epoch": 0.3874873864783047,
"grad_norm": 0.4692237397451838,
"learning_rate": 7.716108602402094e-06,
"loss": 0.4988128662109375,
"step": 240
},
{
"epoch": 0.4036326942482341,
"grad_norm": 0.4264064356657283,
"learning_rate": 7.475581482732717e-06,
"loss": 0.5008936405181885,
"step": 250
},
{
"epoch": 0.41977800201816345,
"grad_norm": 0.44503110431079446,
"learning_rate": 7.2272093467457226e-06,
"loss": 0.5001473426818848,
"step": 260
},
{
"epoch": 0.43592330978809285,
"grad_norm": 0.4451748109485531,
"learning_rate": 6.971779275566593e-06,
"loss": 0.5012603759765625,
"step": 270
},
{
"epoch": 0.4520686175580222,
"grad_norm": 0.4559698908675189,
"learning_rate": 6.710100716628345e-06,
"loss": 0.4997749328613281,
"step": 280
},
{
"epoch": 0.4682139253279516,
"grad_norm": 0.6917328470735025,
"learning_rate": 6.443002918565754e-06,
"loss": 0.5029205322265625,
"step": 290
},
{
"epoch": 0.4843592330978809,
"grad_norm": 0.4424890527067699,
"learning_rate": 6.171332303360411e-06,
"loss": 0.4997657299041748,
"step": 300
},
{
"epoch": 0.5005045408678103,
"grad_norm": 0.4720083796406712,
"learning_rate": 5.895949784064126e-06,
"loss": 0.49444894790649413,
"step": 310
},
{
"epoch": 0.5166498486377397,
"grad_norm": 0.39089223138014384,
"learning_rate": 5.617728036600734e-06,
"loss": 0.4930765151977539,
"step": 320
},
{
"epoch": 0.5327951564076691,
"grad_norm": 0.47595217764441355,
"learning_rate": 5.337548734291827e-06,
"loss": 0.48953914642333984,
"step": 330
},
{
"epoch": 0.5489404641775983,
"grad_norm": 0.4437674870169117,
"learning_rate": 5.0562997538701295e-06,
"loss": 0.4975291728973389,
"step": 340
},
{
"epoch": 0.5650857719475277,
"grad_norm": 0.44066493166516885,
"learning_rate": 4.7748723618344865e-06,
"loss": 0.4917713165283203,
"step": 350
},
{
"epoch": 0.5812310797174571,
"grad_norm": 0.5314417170919645,
"learning_rate": 4.49415839006284e-06,
"loss": 0.48806447982788087,
"step": 360
},
{
"epoch": 0.5973763874873865,
"grad_norm": 1.4947386266075393,
"learning_rate": 4.2150474096335356e-06,
"loss": 0.49144668579101564,
"step": 370
},
{
"epoch": 0.6135216952573158,
"grad_norm": 0.4350590241078372,
"learning_rate": 3.938423911811021e-06,
"loss": 0.4932772159576416,
"step": 380
},
{
"epoch": 0.6296670030272452,
"grad_norm": 0.4903885897812025,
"learning_rate": 3.6651645051292415e-06,
"loss": 0.48783349990844727,
"step": 390
},
{
"epoch": 0.6458123107971746,
"grad_norm": 0.6878893085423103,
"learning_rate": 3.3961351374551234e-06,
"loss": 0.49228334426879883,
"step": 400
},
{
"epoch": 0.6619576185671039,
"grad_norm": 0.41010549903336163,
"learning_rate": 3.132188351835232e-06,
"loss": 0.49144487380981444,
"step": 410
},
{
"epoch": 0.6781029263370333,
"grad_norm": 0.4191620209508489,
"learning_rate": 2.874160584821798e-06,
"loss": 0.493665075302124,
"step": 420
},
{
"epoch": 0.6942482341069627,
"grad_norm": 0.4637660042911083,
"learning_rate": 2.622869515839524e-06,
"loss": 0.487306547164917,
"step": 430
},
{
"epoch": 0.7103935418768921,
"grad_norm": 0.4032504407350088,
"learning_rate": 2.3791114759930013e-06,
"loss": 0.4938949108123779,
"step": 440
},
{
"epoch": 0.7265388496468214,
"grad_norm": 0.463632173922839,
"learning_rate": 2.1436589245260375e-06,
"loss": 0.482679557800293,
"step": 450
},
{
"epoch": 0.7426841574167508,
"grad_norm": 0.4455245146514806,
"learning_rate": 1.9172580009299735e-06,
"loss": 0.4972172737121582,
"step": 460
},
{
"epoch": 0.7588294651866802,
"grad_norm": 0.4313461560156946,
"learning_rate": 1.7006261604581725e-06,
"loss": 0.483304500579834,
"step": 470
},
{
"epoch": 0.7749747729566094,
"grad_norm": 0.40802500432486366,
"learning_rate": 1.4944499005397372e-06,
"loss": 0.4877010822296143,
"step": 480
},
{
"epoch": 0.7911200807265388,
"grad_norm": 0.48272456919743567,
"learning_rate": 1.2993825852972559e-06,
"loss": 0.487977123260498,
"step": 490
},
{
"epoch": 0.8072653884964682,
"grad_norm": 0.44816082724241857,
"learning_rate": 1.1160423750626693e-06,
"loss": 0.4939706325531006,
"step": 500
},
{
"epoch": 0.8234106962663976,
"grad_norm": 0.4192584603019428,
"learning_rate": 9.450102674524952e-07,
"loss": 0.49027338027954104,
"step": 510
},
{
"epoch": 0.8395560040363269,
"grad_norm": 0.5505560128103805,
"learning_rate": 7.868282562101819e-07,
"loss": 0.4772333145141602,
"step": 520
},
{
"epoch": 0.8557013118062563,
"grad_norm": 0.408392546206603,
"learning_rate": 6.419976136501377e-07,
"loss": 0.48678107261657716,
"step": 530
},
{
"epoch": 0.8718466195761857,
"grad_norm": 0.4719879078262765,
"learning_rate": 5.109773021462921e-07,
"loss": 0.4825289726257324,
"step": 540
},
{
"epoch": 0.887991927346115,
"grad_norm": 0.45855836941006267,
"learning_rate": 3.941825196991378e-07,
"loss": 0.4801445007324219,
"step": 550
},
{
"epoch": 0.9041372351160444,
"grad_norm": 0.4384510920334218,
"learning_rate": 2.919833841902714e-07,
"loss": 0.4838963508605957,
"step": 560
},
{
"epoch": 0.9202825428859738,
"grad_norm": 0.4203469347237003,
"learning_rate": 2.0470376049398944e-07,
"loss": 0.4862393379211426,
"step": 570
},
{
"epoch": 0.9364278506559032,
"grad_norm": 0.5986073034196603,
"learning_rate": 1.3262023416276414e-07,
"loss": 0.4833742618560791,
"step": 580
},
{
"epoch": 0.9525731584258325,
"grad_norm": 0.4395767849486276,
"learning_rate": 7.59612349389599e-08,
"loss": 0.48693051338195803,
"step": 590
},
{
"epoch": 0.9687184661957619,
"grad_norm": 0.6521570079176698,
"learning_rate": 3.4906312870331973e-08,
"loss": 0.4918181896209717,
"step": 600
},
{
"epoch": 0.9848637739656912,
"grad_norm": 0.42588128409358406,
"learning_rate": 9.585569323284915e-09,
"loss": 0.483614444732666,
"step": 610
},
{
"epoch": 1.0,
"grad_norm": 0.9356887206854803,
"learning_rate": 7.924469696718451e-11,
"loss": 0.492323112487793,
"step": 620
}
],
"logging_steps": 10,
"max_steps": 620,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 155,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4385102991169946e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}