aaravriyer193's picture
Training in progress, step 337, checkpoint
b0b60ca verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 337,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2208438903093337,
"epoch": 0.01486988847583643,
"grad_norm": 0.55859375,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.118885803222656,
"mean_token_accuracy": 0.5921677611768246,
"num_tokens": 162022.0,
"step": 5
},
{
"entropy": 1.2382806837558746,
"epoch": 0.02973977695167286,
"grad_norm": 0.08056640625,
"learning_rate": 3.6e-05,
"loss": 1.6261102676391601,
"mean_token_accuracy": 0.647309884428978,
"num_tokens": 324206.0,
"step": 10
},
{
"entropy": 1.5387765020132065,
"epoch": 0.04460966542750929,
"grad_norm": 0.07470703125,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.5022705078125,
"mean_token_accuracy": 0.6562040001153946,
"num_tokens": 486182.0,
"step": 15
},
{
"entropy": 1.479974600672722,
"epoch": 0.05947955390334572,
"grad_norm": 0.0703125,
"learning_rate": 7.6e-05,
"loss": 1.3166878700256348,
"mean_token_accuracy": 0.6958272859454155,
"num_tokens": 648008.0,
"step": 20
},
{
"entropy": 1.048148949444294,
"epoch": 0.07434944237918216,
"grad_norm": 0.0751953125,
"learning_rate": 9.6e-05,
"loss": 1.0947124481201171,
"mean_token_accuracy": 0.7425475120544434,
"num_tokens": 809135.0,
"step": 25
},
{
"entropy": 0.8074702247977257,
"epoch": 0.08921933085501858,
"grad_norm": 0.047607421875,
"learning_rate": 0.000116,
"loss": 0.8717484474182129,
"mean_token_accuracy": 0.8009411200881005,
"num_tokens": 971805.0,
"step": 30
},
{
"entropy": 0.7200885951519013,
"epoch": 0.10408921933085502,
"grad_norm": 0.0299072265625,
"learning_rate": 0.00013600000000000003,
"loss": 0.7251162052154541,
"mean_token_accuracy": 0.8336925625801086,
"num_tokens": 1133288.0,
"step": 35
},
{
"entropy": 0.6965990558266639,
"epoch": 0.11895910780669144,
"grad_norm": 0.0279541015625,
"learning_rate": 0.00015600000000000002,
"loss": 0.7003275871276855,
"mean_token_accuracy": 0.8368907004594803,
"num_tokens": 1296330.0,
"step": 40
},
{
"entropy": 0.6499797679483891,
"epoch": 0.13382899628252787,
"grad_norm": 0.0228271484375,
"learning_rate": 0.00017600000000000002,
"loss": 0.6792300701141357,
"mean_token_accuracy": 0.8395274579524994,
"num_tokens": 1458535.0,
"step": 45
},
{
"entropy": 0.6847037307918071,
"epoch": 0.14869888475836432,
"grad_norm": 0.02294921875,
"learning_rate": 0.000196,
"loss": 0.6913325786590576,
"mean_token_accuracy": 0.8368435353040695,
"num_tokens": 1619838.0,
"step": 50
},
{
"entropy": 0.6361287623643875,
"epoch": 0.16356877323420074,
"grad_norm": 0.0263671875,
"learning_rate": 0.00019990415784861047,
"loss": 0.6388590335845947,
"mean_token_accuracy": 0.840661846101284,
"num_tokens": 1781356.0,
"step": 55
},
{
"entropy": 0.6305057637393474,
"epoch": 0.17843866171003717,
"grad_norm": 0.0233154296875,
"learning_rate": 0.00019951511394922507,
"loss": 0.6357249736785888,
"mean_token_accuracy": 0.8429885223507881,
"num_tokens": 1943586.0,
"step": 60
},
{
"entropy": 0.6157322488725185,
"epoch": 0.19330855018587362,
"grad_norm": 0.024658203125,
"learning_rate": 0.00019882804237803488,
"loss": 0.6146057605743408,
"mean_token_accuracy": 0.8475824564695358,
"num_tokens": 2105272.0,
"step": 65
},
{
"entropy": 0.6572050869464874,
"epoch": 0.20817843866171004,
"grad_norm": 0.025390625,
"learning_rate": 0.00019784500077565944,
"loss": 0.6681472778320312,
"mean_token_accuracy": 0.8372571259737015,
"num_tokens": 2268167.0,
"step": 70
},
{
"entropy": 0.5824843347072601,
"epoch": 0.22304832713754646,
"grad_norm": 0.02294921875,
"learning_rate": 0.00019656893315319837,
"loss": 0.5855841636657715,
"mean_token_accuracy": 0.8480771005153656,
"num_tokens": 2430543.0,
"step": 75
},
{
"entropy": 0.5482360351830721,
"epoch": 0.2379182156133829,
"grad_norm": 0.018310546875,
"learning_rate": 0.00019500366107551252,
"loss": 0.5611765861511231,
"mean_token_accuracy": 0.8610954374074936,
"num_tokens": 2592695.0,
"step": 80
},
{
"entropy": 0.5686104819178581,
"epoch": 0.2527881040892193,
"grad_norm": 0.0272216796875,
"learning_rate": 0.00019315387221640874,
"loss": 0.581544017791748,
"mean_token_accuracy": 0.8532564893364907,
"num_tokens": 2753981.0,
"step": 85
},
{
"entropy": 0.49499988108873366,
"epoch": 0.26765799256505574,
"grad_norm": 0.02001953125,
"learning_rate": 0.00019102510632000363,
"loss": 0.4944618225097656,
"mean_token_accuracy": 0.8736693963408471,
"num_tokens": 2915266.0,
"step": 90
},
{
"entropy": 0.5620223179459571,
"epoch": 0.2825278810408922,
"grad_norm": 0.018310546875,
"learning_rate": 0.00018862373861030837,
"loss": 0.5739808082580566,
"mean_token_accuracy": 0.8533669888973237,
"num_tokens": 3077274.0,
"step": 95
},
{
"entropy": 0.5610138960182667,
"epoch": 0.29739776951672864,
"grad_norm": 0.033203125,
"learning_rate": 0.00018595696069872013,
"loss": 0.5716083526611329,
"mean_token_accuracy": 0.8515751019120217,
"num_tokens": 3240540.0,
"step": 100
},
{
"entropy": 0.5545240785926581,
"epoch": 0.31226765799256506,
"grad_norm": 0.0184326171875,
"learning_rate": 0.00018303275904659806,
"loss": 0.5515320301055908,
"mean_token_accuracy": 0.8615253224968911,
"num_tokens": 3403401.0,
"step": 105
},
{
"entropy": 0.5450488172471524,
"epoch": 0.3271375464684015,
"grad_norm": 0.0233154296875,
"learning_rate": 0.00017985989104742434,
"loss": 0.5519495487213135,
"mean_token_accuracy": 0.8612264782190323,
"num_tokens": 3565026.0,
"step": 110
},
{
"entropy": 0.5393761422485113,
"epoch": 0.3420074349442379,
"grad_norm": 0.020263671875,
"learning_rate": 0.00017644785880017874,
"loss": 0.5512795448303223,
"mean_token_accuracy": 0.8637161552906036,
"num_tokens": 3727695.0,
"step": 115
},
{
"entropy": 0.5624090366065502,
"epoch": 0.35687732342007433,
"grad_norm": 0.0234375,
"learning_rate": 0.00017280688065247118,
"loss": 0.5684682846069335,
"mean_token_accuracy": 0.8566416442394257,
"num_tokens": 3889998.0,
"step": 120
},
{
"entropy": 0.5313072741031647,
"epoch": 0.37174721189591076,
"grad_norm": 0.0201416015625,
"learning_rate": 0.00016894786059865383,
"loss": 0.5380096435546875,
"mean_token_accuracy": 0.8626979544758797,
"num_tokens": 4052483.0,
"step": 125
},
{
"entropy": 0.5576793540269136,
"epoch": 0.38661710037174724,
"grad_norm": 0.0205078125,
"learning_rate": 0.00016488235562455965,
"loss": 0.5612647533416748,
"mean_token_accuracy": 0.8576673969626427,
"num_tokens": 4215207.0,
"step": 130
},
{
"entropy": 0.5940531171858311,
"epoch": 0.40148698884758366,
"grad_norm": 2.765625,
"learning_rate": 0.0001606225410966638,
"loss": 0.6199069499969483,
"mean_token_accuracy": 0.8474377766251564,
"num_tokens": 4375245.0,
"step": 135
},
{
"entropy": 0.6086725879460573,
"epoch": 0.4163568773234201,
"grad_norm": 0.7421875,
"learning_rate": 0.00015618117429931926,
"loss": 0.7376153469085693,
"mean_token_accuracy": 0.8377696231007576,
"num_tokens": 4536593.0,
"step": 140
},
{
"entropy": 0.6506265237927437,
"epoch": 0.4312267657992565,
"grad_norm": 0.04150390625,
"learning_rate": 0.0001515715562292662,
"loss": 0.6828119277954101,
"mean_token_accuracy": 0.835132221877575,
"num_tokens": 4697609.0,
"step": 145
},
{
"entropy": 0.5346488334238529,
"epoch": 0.44609665427509293,
"grad_norm": 0.056884765625,
"learning_rate": 0.00014680749176183274,
"loss": 0.5391588687896729,
"mean_token_accuracy": 0.8637033969163894,
"num_tokens": 4859476.0,
"step": 150
},
{
"entropy": 0.5779553644359112,
"epoch": 0.46096654275092935,
"grad_norm": 0.0198974609375,
"learning_rate": 0.00014190324830812067,
"loss": 0.5770033836364746,
"mean_token_accuracy": 0.8554806470870971,
"num_tokens": 5022222.0,
"step": 155
},
{
"entropy": 0.5745491735637188,
"epoch": 0.4758364312267658,
"grad_norm": 0.0218505859375,
"learning_rate": 0.00013687351308699027,
"loss": 0.5682717323303222,
"mean_token_accuracy": 0.8550971180200577,
"num_tokens": 5184389.0,
"step": 160
},
{
"entropy": 0.5769639994949103,
"epoch": 0.49070631970260226,
"grad_norm": 0.10107421875,
"learning_rate": 0.00013173334913980534,
"loss": 0.5720061302185059,
"mean_token_accuracy": 0.857777065038681,
"num_tokens": 5346122.0,
"step": 165
},
{
"entropy": 0.521759420260787,
"epoch": 0.5055762081784386,
"grad_norm": 0.0257568359375,
"learning_rate": 0.0001264981502196662,
"loss": 0.5244236469268799,
"mean_token_accuracy": 0.8673782303929329,
"num_tokens": 5509440.0,
"step": 170
},
{
"entropy": 0.5925083503127098,
"epoch": 0.5204460966542751,
"grad_norm": 0.10400390625,
"learning_rate": 0.00012118359469022712,
"loss": 0.595769739151001,
"mean_token_accuracy": 0.8512872710824013,
"num_tokens": 5672412.0,
"step": 175
},
{
"entropy": 0.6406555585563183,
"epoch": 0.5353159851301115,
"grad_norm": 0.2451171875,
"learning_rate": 0.00011580559857216347,
"loss": 0.645458173751831,
"mean_token_accuracy": 0.8404021769762039,
"num_tokens": 5833737.0,
"step": 180
},
{
"entropy": 0.5327178973704576,
"epoch": 0.550185873605948,
"grad_norm": 0.032958984375,
"learning_rate": 0.0001103802678779032,
"loss": 0.536646842956543,
"mean_token_accuracy": 0.8626102104783058,
"num_tokens": 5995659.0,
"step": 185
},
{
"entropy": 0.606436661630869,
"epoch": 0.5650557620817844,
"grad_norm": 0.0250244140625,
"learning_rate": 0.00010492385037737207,
"loss": 0.5936649322509766,
"mean_token_accuracy": 0.8469037219882012,
"num_tokens": 6157375.0,
"step": 190
},
{
"entropy": 0.5063752841204405,
"epoch": 0.5799256505576208,
"grad_norm": 0.0274658203125,
"learning_rate": 9.945268693920346e-05,
"loss": 0.5121519088745117,
"mean_token_accuracy": 0.8688234716653824,
"num_tokens": 6320049.0,
"step": 195
},
{
"entropy": 0.5400943882763386,
"epoch": 0.5947955390334573,
"grad_norm": 0.0224609375,
"learning_rate": 9.398316259313637e-05,
"loss": 0.5355047225952149,
"mean_token_accuracy": 0.8615404218435287,
"num_tokens": 6482227.0,
"step": 200
},
{
"entropy": 0.5946259450167417,
"epoch": 0.6096654275092936,
"grad_norm": 0.0189208984375,
"learning_rate": 8.853165746015997e-05,
"loss": 0.5926107883453369,
"mean_token_accuracy": 0.8485366463661194,
"num_tokens": 6644489.0,
"step": 205
},
{
"entropy": 0.5018576122820377,
"epoch": 0.6245353159851301,
"grad_norm": 0.0191650390625,
"learning_rate": 8.311449769735873e-05,
"loss": 0.5042452335357666,
"mean_token_accuracy": 0.8692936778068543,
"num_tokens": 6806204.0,
"step": 210
},
{
"entropy": 0.5548792567104102,
"epoch": 0.6394052044609665,
"grad_norm": 0.0223388671875,
"learning_rate": 7.774790660436858e-05,
"loss": 0.563843059539795,
"mean_token_accuracy": 0.8553685575723649,
"num_tokens": 6967855.0,
"step": 215
},
{
"entropy": 0.5388505697250366,
"epoch": 0.654275092936803,
"grad_norm": 0.0174560546875,
"learning_rate": 7.244795603787036e-05,
"loss": 0.5390424728393555,
"mean_token_accuracy": 0.8610016539692879,
"num_tokens": 7128748.0,
"step": 220
},
{
"entropy": 0.5537650570273399,
"epoch": 0.6691449814126395,
"grad_norm": 0.0191650390625,
"learning_rate": 6.723051827962445e-05,
"loss": 0.54982008934021,
"mean_token_accuracy": 0.8591041445732117,
"num_tokens": 7290603.0,
"step": 225
},
{
"entropy": 0.5197319515049458,
"epoch": 0.6840148698884758,
"grad_norm": 0.017822265625,
"learning_rate": 6.211121850219175e-05,
"loss": 0.5279690265655518,
"mean_token_accuracy": 0.8655778139829635,
"num_tokens": 7453375.0,
"step": 230
},
{
"entropy": 0.49672888703644275,
"epoch": 0.6988847583643123,
"grad_norm": 0.0198974609375,
"learning_rate": 5.7105387974697063e-05,
"loss": 0.4927337646484375,
"mean_token_accuracy": 0.8734062314033508,
"num_tokens": 7616243.0,
"step": 235
},
{
"entropy": 0.5233525596559048,
"epoch": 0.7137546468401487,
"grad_norm": 0.0196533203125,
"learning_rate": 5.222801814877369e-05,
"loss": 0.5159758567810059,
"mean_token_accuracy": 0.8658116608858109,
"num_tokens": 7776719.0,
"step": 240
},
{
"entropy": 0.515385128930211,
"epoch": 0.7286245353159851,
"grad_norm": 0.0177001953125,
"learning_rate": 4.749371576219317e-05,
"loss": 0.5128642559051514,
"mean_token_accuracy": 0.8703905552625656,
"num_tokens": 7938195.0,
"step": 245
},
{
"entropy": 0.4934384971857071,
"epoch": 0.7434944237918215,
"grad_norm": 0.0216064453125,
"learning_rate": 4.291665909463477e-05,
"loss": 0.4990520477294922,
"mean_token_accuracy": 0.8704771339893341,
"num_tokens": 8098729.0,
"step": 250
},
{
"entropy": 0.5353568136692047,
"epoch": 0.758364312267658,
"grad_norm": 0.02294921875,
"learning_rate": 3.8510555506600974e-05,
"loss": 0.542482566833496,
"mean_token_accuracy": 0.8630232095718384,
"num_tokens": 8261835.0,
"step": 255
},
{
"entropy": 0.48506755754351616,
"epoch": 0.7732342007434945,
"grad_norm": 0.021484375,
"learning_rate": 3.4288600388640714e-05,
"loss": 0.4917303085327148,
"mean_token_accuracy": 0.8723696529865265,
"num_tokens": 8423616.0,
"step": 260
},
{
"entropy": 0.5245153240859508,
"epoch": 0.7881040892193308,
"grad_norm": 0.019287109375,
"learning_rate": 3.026343764381887e-05,
"loss": 0.5242561340332031,
"mean_token_accuracy": 0.8671647250652313,
"num_tokens": 8585786.0,
"step": 265
},
{
"entropy": 0.525696974992752,
"epoch": 0.8029739776951673,
"grad_norm": 0.0245361328125,
"learning_rate": 2.6447121821779917e-05,
"loss": 0.5204005718231202,
"mean_token_accuracy": 0.8655787914991379,
"num_tokens": 8747495.0,
"step": 270
},
{
"entropy": 0.44850245080888274,
"epoch": 0.8178438661710037,
"grad_norm": 0.0201416015625,
"learning_rate": 2.2851082017805703e-05,
"loss": 0.4419082164764404,
"mean_token_accuracy": 0.8838535219430923,
"num_tokens": 8909848.0,
"step": 275
},
{
"entropy": 0.583437193930149,
"epoch": 0.8327137546468402,
"grad_norm": 0.02099609375,
"learning_rate": 1.9486087644983054e-05,
"loss": 0.5837182521820068,
"mean_token_accuracy": 0.8495006680488586,
"num_tokens": 9069163.0,
"step": 280
},
{
"entropy": 0.5175457876175642,
"epoch": 0.8475836431226765,
"grad_norm": 0.0245361328125,
"learning_rate": 1.6362216181986002e-05,
"loss": 0.5189806461334229,
"mean_token_accuracy": 0.8652941584587097,
"num_tokens": 9230950.0,
"step": 285
},
{
"entropy": 0.5524289276450872,
"epoch": 0.862453531598513,
"grad_norm": 0.0216064453125,
"learning_rate": 1.3488822993062089e-05,
"loss": 0.5572507858276368,
"mean_token_accuracy": 0.8533222541213036,
"num_tokens": 9392141.0,
"step": 290
},
{
"entropy": 0.5627229742705822,
"epoch": 0.8773234200743495,
"grad_norm": 0.0247802734375,
"learning_rate": 1.0874513310605628e-05,
"loss": 0.5729455471038818,
"mean_token_accuracy": 0.8517941504716873,
"num_tokens": 9553843.0,
"step": 295
},
{
"entropy": 0.49910875745117667,
"epoch": 0.8921933085501859,
"grad_norm": 0.0238037109375,
"learning_rate": 8.527116464224127e-06,
"loss": 0.4991349697113037,
"mean_token_accuracy": 0.8706730246543884,
"num_tokens": 9716187.0,
"step": 300
},
{
"entropy": 0.4898031514137983,
"epoch": 0.9070631970260223,
"grad_norm": 0.0205078125,
"learning_rate": 6.453662433477136e-06,
"loss": 0.4925398826599121,
"mean_token_accuracy": 0.8753444463014602,
"num_tokens": 9879023.0,
"step": 305
},
{
"entropy": 0.45501707717776296,
"epoch": 0.9219330855018587,
"grad_norm": 0.0203857421875,
"learning_rate": 4.660360794506946e-06,
"loss": 0.4548198699951172,
"mean_token_accuracy": 0.8821182236075401,
"num_tokens": 10041800.0,
"step": 310
},
{
"entropy": 0.5240208253264427,
"epoch": 0.9368029739776952,
"grad_norm": 0.0233154296875,
"learning_rate": 3.1525821236119577e-06,
"loss": 0.5236988067626953,
"mean_token_accuracy": 0.8641670763492584,
"num_tokens": 10204694.0,
"step": 315
},
{
"entropy": 0.538949977979064,
"epoch": 0.9516728624535316,
"grad_norm": 0.0220947265625,
"learning_rate": 1.934841913455032e-06,
"loss": 0.5439452648162841,
"mean_token_accuracy": 0.8550667986273766,
"num_tokens": 10366660.0,
"step": 320
},
{
"entropy": 0.5102543152868748,
"epoch": 0.966542750929368,
"grad_norm": 0.018310546875,
"learning_rate": 1.010787050074835e-06,
"loss": 0.5104735374450684,
"mean_token_accuracy": 0.8640209168195725,
"num_tokens": 10529304.0,
"step": 325
},
{
"entropy": 0.4799253273755312,
"epoch": 0.9814126394052045,
"grad_norm": 0.019287109375,
"learning_rate": 3.831848911984959e-07,
"loss": 0.47628107070922854,
"mean_token_accuracy": 0.8758200943470001,
"num_tokens": 10692105.0,
"step": 330
},
{
"entropy": 0.45510734170675277,
"epoch": 0.9962825278810409,
"grad_norm": 0.0213623046875,
"learning_rate": 5.391497856399585e-08,
"loss": 0.4581630229949951,
"mean_token_accuracy": 0.8793978497385979,
"num_tokens": 10854094.0,
"step": 335
}
],
"logging_steps": 5,
"max_steps": 337,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.965993164065407e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}