ARZ-LLM-CHECKPOINTS / checkpoint-1024 /trainer_state.json
Marcus888's picture
Add files using upload-large-folder tool
ab79e94 verified
{
"best_global_step": 352,
"best_metric": 1.5587613582611084,
"best_model_checkpoint": "./my_model/checkpoint-352",
"epoch": 64.0,
"eval_steps": 500,
"global_step": 1024,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.611812502145767,
"epoch": 0.06451612903225806,
"grad_norm": 2.948519468307495,
"learning_rate": 0.0,
"loss": 5.2626,
"mean_token_accuracy": 0.2785332165658474,
"num_tokens": 1354.0,
"step": 1
},
{
"entropy": 2.4764878584278955,
"epoch": 0.6451612903225806,
"grad_norm": 2.852177858352661,
"learning_rate": 5.625e-07,
"loss": 5.0479,
"mean_token_accuracy": 0.30352623243298793,
"num_tokens": 13915.0,
"step": 10
},
{
"epoch": 1.0,
"eval_entropy": 2.549605812345232,
"eval_loss": 5.082220554351807,
"eval_mean_token_accuracy": 0.29234256382499424,
"eval_num_tokens": 21527.0,
"eval_runtime": 0.8908,
"eval_samples_per_second": 61.741,
"eval_steps_per_second": 15.716,
"step": 16
},
{
"entropy": 2.4721337352928363,
"epoch": 1.2580645161290323,
"grad_norm": 2.8321847915649414,
"learning_rate": 1.1875e-06,
"loss": 5.033,
"mean_token_accuracy": 0.30711135052536664,
"num_tokens": 27177.0,
"step": 20
},
{
"entropy": 2.514200139045715,
"epoch": 1.903225806451613,
"grad_norm": 3.025498151779175,
"learning_rate": 1.8125e-06,
"loss": 5.0773,
"mean_token_accuracy": 0.2981846956536174,
"num_tokens": 40986.0,
"step": 30
},
{
"epoch": 2.0,
"eval_entropy": 2.5709889445986067,
"eval_loss": 5.047023773193359,
"eval_mean_token_accuracy": 0.2919592655130795,
"eval_num_tokens": 43054.0,
"eval_runtime": 0.8868,
"eval_samples_per_second": 62.018,
"eval_steps_per_second": 15.786,
"step": 32
},
{
"entropy": 2.541774250959095,
"epoch": 2.5161290322580645,
"grad_norm": 3.4038965702056885,
"learning_rate": 2.4375000000000004e-06,
"loss": 5.0438,
"mean_token_accuracy": 0.2972054022707437,
"num_tokens": 54010.0,
"step": 40
},
{
"epoch": 3.0,
"eval_entropy": 2.5724382059914723,
"eval_loss": 4.9238505363464355,
"eval_mean_token_accuracy": 0.2935441700475557,
"eval_num_tokens": 64581.0,
"eval_runtime": 0.997,
"eval_samples_per_second": 55.165,
"eval_steps_per_second": 14.042,
"step": 48
},
{
"entropy": 2.5254996732661597,
"epoch": 3.129032258064516,
"grad_norm": 4.6959710121154785,
"learning_rate": 3.0625e-06,
"loss": 4.8754,
"mean_token_accuracy": 0.3122231556396735,
"num_tokens": 67318.0,
"step": 50
},
{
"entropy": 2.532456985116005,
"epoch": 3.774193548387097,
"grad_norm": 1.8910281658172607,
"learning_rate": 3.6875e-06,
"loss": 4.8683,
"mean_token_accuracy": 0.31472160052508114,
"num_tokens": 81243.0,
"step": 60
},
{
"epoch": 4.0,
"eval_entropy": 2.6716178315026418,
"eval_loss": 4.745687484741211,
"eval_mean_token_accuracy": 0.3186415561607906,
"eval_num_tokens": 86108.0,
"eval_runtime": 0.8831,
"eval_samples_per_second": 62.282,
"eval_steps_per_second": 15.854,
"step": 64
},
{
"entropy": 2.6316533935697457,
"epoch": 4.387096774193548,
"grad_norm": 1.985713243484497,
"learning_rate": 4.3125e-06,
"loss": 4.7158,
"mean_token_accuracy": 0.32339329899925934,
"num_tokens": 94546.0,
"step": 70
},
{
"entropy": 2.6654595481721977,
"epoch": 5.0,
"grad_norm": 1.8635355234146118,
"learning_rate": 4.937500000000001e-06,
"loss": 4.4521,
"mean_token_accuracy": 0.3342058395868854,
"num_tokens": 107635.0,
"step": 80
},
{
"epoch": 5.0,
"eval_entropy": 2.7309968301228116,
"eval_loss": 4.356206893920898,
"eval_mean_token_accuracy": 0.3469075581857136,
"eval_num_tokens": 107635.0,
"eval_runtime": 1.4048,
"eval_samples_per_second": 39.153,
"eval_steps_per_second": 9.966,
"step": 80
},
{
"entropy": 2.6752349376678466,
"epoch": 5.645161290322581,
"grad_norm": 1.7081493139266968,
"learning_rate": 5.5625000000000005e-06,
"loss": 4.1996,
"mean_token_accuracy": 0.3722452763468027,
"num_tokens": 121563.0,
"step": 90
},
{
"epoch": 6.0,
"eval_entropy": 2.802973576954433,
"eval_loss": 3.9146478176116943,
"eval_mean_token_accuracy": 0.38693774597985403,
"eval_num_tokens": 129162.0,
"eval_runtime": 0.9423,
"eval_samples_per_second": 58.371,
"eval_steps_per_second": 14.858,
"step": 96
},
{
"entropy": 2.770364958988993,
"epoch": 6.258064516129032,
"grad_norm": 1.6673827171325684,
"learning_rate": 6.1875000000000005e-06,
"loss": 3.9392,
"mean_token_accuracy": 0.387496774133883,
"num_tokens": 134741.0,
"step": 100
},
{
"entropy": 2.7224088311195374,
"epoch": 6.903225806451613,
"grad_norm": 1.527048110961914,
"learning_rate": 6.8125e-06,
"loss": 3.62,
"mean_token_accuracy": 0.45071578361094,
"num_tokens": 148587.0,
"step": 110
},
{
"epoch": 7.0,
"eval_entropy": 2.6817347833088467,
"eval_loss": 3.47971248626709,
"eval_mean_token_accuracy": 0.487923339009285,
"eval_num_tokens": 150689.0,
"eval_runtime": 0.9499,
"eval_samples_per_second": 57.9,
"eval_steps_per_second": 14.738,
"step": 112
},
{
"entropy": 2.5822339403001884,
"epoch": 7.516129032258064,
"grad_norm": 1.3976553678512573,
"learning_rate": 7.4375e-06,
"loss": 3.3133,
"mean_token_accuracy": 0.5055951774120331,
"num_tokens": 161826.0,
"step": 120
},
{
"epoch": 8.0,
"eval_entropy": 2.662975311279297,
"eval_loss": 3.109734296798706,
"eval_mean_token_accuracy": 0.5159200259617397,
"eval_num_tokens": 172216.0,
"eval_runtime": 0.9281,
"eval_samples_per_second": 59.26,
"eval_steps_per_second": 15.084,
"step": 128
},
{
"entropy": 2.563361422011727,
"epoch": 8.129032258064516,
"grad_norm": 1.7696324586868286,
"learning_rate": 8.062500000000001e-06,
"loss": 3.1155,
"mean_token_accuracy": 0.5187091541133428,
"num_tokens": 175017.0,
"step": 130
},
{
"entropy": 2.588775309920311,
"epoch": 8.774193548387096,
"grad_norm": 2.261418342590332,
"learning_rate": 8.6875e-06,
"loss": 2.8235,
"mean_token_accuracy": 0.5475961033254861,
"num_tokens": 188936.0,
"step": 140
},
{
"epoch": 9.0,
"eval_entropy": 2.5937297514506747,
"eval_loss": 2.6096889972686768,
"eval_mean_token_accuracy": 0.5660783967801503,
"eval_num_tokens": 193743.0,
"eval_runtime": 0.9053,
"eval_samples_per_second": 60.757,
"eval_steps_per_second": 15.465,
"step": 144
},
{
"entropy": 2.5060042989881417,
"epoch": 9.387096774193548,
"grad_norm": 1.5702177286148071,
"learning_rate": 9.312500000000001e-06,
"loss": 2.5439,
"mean_token_accuracy": 0.5805619108049493,
"num_tokens": 202001.0,
"step": 150
},
{
"entropy": 2.201874099279705,
"epoch": 10.0,
"grad_norm": 1.8405438661575317,
"learning_rate": 9.937500000000001e-06,
"loss": 2.3102,
"mean_token_accuracy": 0.6006814374735481,
"num_tokens": 215270.0,
"step": 160
},
{
"epoch": 10.0,
"eval_entropy": 2.1728043726512363,
"eval_loss": 2.3522286415100098,
"eval_mean_token_accuracy": 0.5939501779420036,
"eval_num_tokens": 215270.0,
"eval_runtime": 0.9322,
"eval_samples_per_second": 58.999,
"eval_steps_per_second": 15.018,
"step": 160
},
{
"entropy": 2.1090281650424005,
"epoch": 10.64516129032258,
"grad_norm": 1.3122938871383667,
"learning_rate": 1.0562500000000001e-05,
"loss": 2.2236,
"mean_token_accuracy": 0.605573232471943,
"num_tokens": 229135.0,
"step": 170
},
{
"epoch": 11.0,
"eval_entropy": 2.126331014292581,
"eval_loss": 2.2020554542541504,
"eval_mean_token_accuracy": 0.6053547646318164,
"eval_num_tokens": 236797.0,
"eval_runtime": 0.9054,
"eval_samples_per_second": 60.745,
"eval_steps_per_second": 15.462,
"step": 176
},
{
"entropy": 2.053142737401159,
"epoch": 11.258064516129032,
"grad_norm": 1.5178934335708618,
"learning_rate": 1.1187500000000001e-05,
"loss": 2.0915,
"mean_token_accuracy": 0.61878864937707,
"num_tokens": 242377.0,
"step": 180
},
{
"entropy": 2.013620141148567,
"epoch": 11.903225806451612,
"grad_norm": 1.4182125329971313,
"learning_rate": 1.1812499999999999e-05,
"loss": 2.0478,
"mean_token_accuracy": 0.6269605554640293,
"num_tokens": 256298.0,
"step": 190
},
{
"epoch": 12.0,
"eval_entropy": 2.0988540819713046,
"eval_loss": 2.093829393386841,
"eval_mean_token_accuracy": 0.6164226361683437,
"eval_num_tokens": 258324.0,
"eval_runtime": 0.9948,
"eval_samples_per_second": 55.289,
"eval_steps_per_second": 14.074,
"step": 192
},
{
"entropy": 1.9682148710677498,
"epoch": 12.516129032258064,
"grad_norm": 1.3009895086288452,
"learning_rate": 1.24375e-05,
"loss": 1.9508,
"mean_token_accuracy": 0.6344620632497888,
"num_tokens": 269514.0,
"step": 200
},
{
"epoch": 13.0,
"eval_entropy": 2.022765415055411,
"eval_loss": 1.9986889362335205,
"eval_mean_token_accuracy": 0.6248509841305869,
"eval_num_tokens": 279851.0,
"eval_runtime": 1.3989,
"eval_samples_per_second": 39.316,
"eval_steps_per_second": 10.008,
"step": 208
},
{
"entropy": 1.9475462483732324,
"epoch": 13.129032258064516,
"grad_norm": 1.2914632558822632,
"learning_rate": 1.3062499999999999e-05,
"loss": 1.9446,
"mean_token_accuracy": 0.6389191699655432,
"num_tokens": 282652.0,
"step": 210
},
{
"entropy": 1.9222171217203141,
"epoch": 13.774193548387096,
"grad_norm": 1.631095290184021,
"learning_rate": 1.36875e-05,
"loss": 1.8472,
"mean_token_accuracy": 0.6466637052595615,
"num_tokens": 296522.0,
"step": 220
},
{
"epoch": 14.0,
"eval_entropy": 1.9563691530908858,
"eval_loss": 1.9220991134643555,
"eval_mean_token_accuracy": 0.6309446564742497,
"eval_num_tokens": 301378.0,
"eval_runtime": 1.0668,
"eval_samples_per_second": 51.555,
"eval_steps_per_second": 13.123,
"step": 224
},
{
"entropy": 1.870294423479783,
"epoch": 14.387096774193548,
"grad_norm": 1.3715276718139648,
"learning_rate": 1.43125e-05,
"loss": 1.7688,
"mean_token_accuracy": 0.6547230929136276,
"num_tokens": 309834.0,
"step": 230
},
{
"entropy": 1.8683158645504399,
"epoch": 15.0,
"grad_norm": 1.9479233026504517,
"learning_rate": 1.4937500000000002e-05,
"loss": 1.7425,
"mean_token_accuracy": 0.6565716470542707,
"num_tokens": 322905.0,
"step": 240
},
{
"epoch": 15.0,
"eval_entropy": 1.9176117862973894,
"eval_loss": 1.8483814001083374,
"eval_mean_token_accuracy": 0.6409520549433572,
"eval_num_tokens": 322905.0,
"eval_runtime": 0.9202,
"eval_samples_per_second": 59.77,
"eval_steps_per_second": 15.214,
"step": 240
},
{
"entropy": 1.8178010761737824,
"epoch": 15.64516129032258,
"grad_norm": 1.6151540279388428,
"learning_rate": 1.5562500000000002e-05,
"loss": 1.6598,
"mean_token_accuracy": 0.6694157928228378,
"num_tokens": 336780.0,
"step": 250
},
{
"epoch": 16.0,
"eval_entropy": 1.8427454914365495,
"eval_loss": 1.7848949432373047,
"eval_mean_token_accuracy": 0.6483822464942932,
"eval_num_tokens": 344432.0,
"eval_runtime": 0.8965,
"eval_samples_per_second": 61.351,
"eval_steps_per_second": 15.617,
"step": 256
},
{
"entropy": 1.7828364309511686,
"epoch": 16.258064516129032,
"grad_norm": 1.4990966320037842,
"learning_rate": 1.61875e-05,
"loss": 1.5925,
"mean_token_accuracy": 0.6848364127309698,
"num_tokens": 349950.0,
"step": 260
},
{
"entropy": 1.7749755203723907,
"epoch": 16.903225806451612,
"grad_norm": 1.6659446954727173,
"learning_rate": 1.6812500000000002e-05,
"loss": 1.5674,
"mean_token_accuracy": 0.7018352136015892,
"num_tokens": 363887.0,
"step": 270
},
{
"epoch": 17.0,
"eval_entropy": 1.7996527978352137,
"eval_loss": 1.7261488437652588,
"eval_mean_token_accuracy": 0.6748419829777309,
"eval_num_tokens": 365959.0,
"eval_runtime": 0.956,
"eval_samples_per_second": 57.533,
"eval_steps_per_second": 14.645,
"step": 272
},
{
"entropy": 1.7126218849106838,
"epoch": 17.516129032258064,
"grad_norm": 1.7774029970169067,
"learning_rate": 1.74375e-05,
"loss": 1.4737,
"mean_token_accuracy": 0.7113534455236635,
"num_tokens": 377051.0,
"step": 280
},
{
"epoch": 18.0,
"eval_entropy": 1.7618773494447981,
"eval_loss": 1.6715681552886963,
"eval_mean_token_accuracy": 0.6734440326690674,
"eval_num_tokens": 387486.0,
"eval_runtime": 1.3598,
"eval_samples_per_second": 40.446,
"eval_steps_per_second": 10.295,
"step": 288
},
{
"entropy": 1.684084642874567,
"epoch": 18.129032258064516,
"grad_norm": 1.8488755226135254,
"learning_rate": 1.8062500000000002e-05,
"loss": 1.4353,
"mean_token_accuracy": 0.717556736186931,
"num_tokens": 390288.0,
"step": 290
},
{
"entropy": 1.6123981848359108,
"epoch": 18.774193548387096,
"grad_norm": 1.94077467918396,
"learning_rate": 1.8687500000000004e-05,
"loss": 1.347,
"mean_token_accuracy": 0.7245998069643974,
"num_tokens": 404246.0,
"step": 300
},
{
"epoch": 19.0,
"eval_entropy": 1.6447282092911857,
"eval_loss": 1.6219120025634766,
"eval_mean_token_accuracy": 0.6779105194977352,
"eval_num_tokens": 409013.0,
"eval_runtime": 0.9117,
"eval_samples_per_second": 60.326,
"eval_steps_per_second": 15.356,
"step": 304
},
{
"entropy": 1.5541185391576666,
"epoch": 19.387096774193548,
"grad_norm": 2.0378854274749756,
"learning_rate": 1.93125e-05,
"loss": 1.2785,
"mean_token_accuracy": 0.7347154123218436,
"num_tokens": 417317.0,
"step": 310
},
{
"entropy": 1.403356123911707,
"epoch": 20.0,
"grad_norm": 2.347236394882202,
"learning_rate": 1.99375e-05,
"loss": 1.219,
"mean_token_accuracy": 0.7410024349626742,
"num_tokens": 430540.0,
"step": 320
},
{
"epoch": 20.0,
"eval_entropy": 1.4916774034500122,
"eval_loss": 1.587111473083496,
"eval_mean_token_accuracy": 0.6795690613133567,
"eval_num_tokens": 430540.0,
"eval_runtime": 0.9048,
"eval_samples_per_second": 60.786,
"eval_steps_per_second": 15.473,
"step": 320
},
{
"entropy": 1.338417048752308,
"epoch": 20.64516129032258,
"grad_norm": 2.188633441925049,
"learning_rate": 2.0562500000000002e-05,
"loss": 1.1363,
"mean_token_accuracy": 0.7564118377864361,
"num_tokens": 444500.0,
"step": 330
},
{
"epoch": 21.0,
"eval_entropy": 1.3439774853842599,
"eval_loss": 1.5705065727233887,
"eval_mean_token_accuracy": 0.6734923720359802,
"eval_num_tokens": 452067.0,
"eval_runtime": 1.3705,
"eval_samples_per_second": 40.131,
"eval_steps_per_second": 10.215,
"step": 336
},
{
"entropy": 1.2597325475592362,
"epoch": 21.258064516129032,
"grad_norm": 2.0218992233276367,
"learning_rate": 2.1187500000000003e-05,
"loss": 1.0859,
"mean_token_accuracy": 0.7610587077705484,
"num_tokens": 457711.0,
"step": 340
},
{
"entropy": 1.1984408333897592,
"epoch": 21.903225806451612,
"grad_norm": 2.203165292739868,
"learning_rate": 2.18125e-05,
"loss": 1.0145,
"mean_token_accuracy": 0.7772165350615978,
"num_tokens": 471531.0,
"step": 350
},
{
"epoch": 22.0,
"eval_entropy": 1.2664960197040014,
"eval_loss": 1.5587613582611084,
"eval_mean_token_accuracy": 0.6787797468049186,
"eval_num_tokens": 473594.0,
"eval_runtime": 0.9351,
"eval_samples_per_second": 58.82,
"eval_steps_per_second": 14.972,
"step": 352
},
{
"entropy": 1.1226187560119127,
"epoch": 22.516129032258064,
"grad_norm": 2.4278323650360107,
"learning_rate": 2.24375e-05,
"loss": 0.9255,
"mean_token_accuracy": 0.7932525535947398,
"num_tokens": 484678.0,
"step": 360
},
{
"epoch": 23.0,
"eval_entropy": 1.2237448862620763,
"eval_loss": 1.579514741897583,
"eval_mean_token_accuracy": 0.6716454710279193,
"eval_num_tokens": 495121.0,
"eval_runtime": 1.3763,
"eval_samples_per_second": 39.963,
"eval_steps_per_second": 10.172,
"step": 368
},
{
"entropy": 1.0918044400842566,
"epoch": 23.129032258064516,
"grad_norm": 2.5345664024353027,
"learning_rate": 2.30625e-05,
"loss": 0.8737,
"mean_token_accuracy": 0.7978831976652145,
"num_tokens": 497974.0,
"step": 370
},
{
"entropy": 1.006336173415184,
"epoch": 23.774193548387096,
"grad_norm": 2.779085874557495,
"learning_rate": 2.36875e-05,
"loss": 0.7646,
"mean_token_accuracy": 0.8216063916683197,
"num_tokens": 511856.0,
"step": 380
},
{
"epoch": 24.0,
"eval_entropy": 1.1918106589998518,
"eval_loss": 1.5942119359970093,
"eval_mean_token_accuracy": 0.6677229617323194,
"eval_num_tokens": 516648.0,
"eval_runtime": 0.9097,
"eval_samples_per_second": 60.46,
"eval_steps_per_second": 15.39,
"step": 384
},
{
"entropy": 1.0034341443526118,
"epoch": 24.387096774193548,
"grad_norm": 2.9064760208129883,
"learning_rate": 2.43125e-05,
"loss": 0.7059,
"mean_token_accuracy": 0.8363859167224482,
"num_tokens": 524969.0,
"step": 390
},
{
"entropy": 0.9153783635089272,
"epoch": 25.0,
"grad_norm": 3.5367863178253174,
"learning_rate": 2.4937500000000003e-05,
"loss": 0.6268,
"mean_token_accuracy": 0.8444738458645972,
"num_tokens": 538175.0,
"step": 400
},
{
"epoch": 25.0,
"eval_entropy": 1.1169007931436812,
"eval_loss": 1.666494369506836,
"eval_mean_token_accuracy": 0.6655990694250379,
"eval_num_tokens": 538175.0,
"eval_runtime": 0.9258,
"eval_samples_per_second": 59.408,
"eval_steps_per_second": 15.122,
"step": 400
},
{
"entropy": 0.8221864104270935,
"epoch": 25.64516129032258,
"grad_norm": 2.6190507411956787,
"learning_rate": 2.55625e-05,
"loss": 0.5171,
"mean_token_accuracy": 0.8675303012132645,
"num_tokens": 552002.0,
"step": 410
},
{
"epoch": 26.0,
"eval_entropy": 1.0611292464392525,
"eval_loss": 1.6921919584274292,
"eval_mean_token_accuracy": 0.65951726266316,
"eval_num_tokens": 559702.0,
"eval_runtime": 1.0329,
"eval_samples_per_second": 53.25,
"eval_steps_per_second": 13.555,
"step": 416
},
{
"entropy": 0.8153277470877296,
"epoch": 26.258064516129032,
"grad_norm": 2.4259774684906006,
"learning_rate": 2.6187500000000003e-05,
"loss": 0.4676,
"mean_token_accuracy": 0.8747850750621996,
"num_tokens": 565255.0,
"step": 420
},
{
"entropy": 0.7013958178460598,
"epoch": 26.903225806451612,
"grad_norm": 2.4626195430755615,
"learning_rate": 2.68125e-05,
"loss": 0.3867,
"mean_token_accuracy": 0.8833703070878982,
"num_tokens": 579163.0,
"step": 430
},
{
"epoch": 27.0,
"eval_entropy": 0.9333125693457467,
"eval_loss": 1.7267862558364868,
"eval_mean_token_accuracy": 0.658582159451076,
"eval_num_tokens": 581229.0,
"eval_runtime": 0.9233,
"eval_samples_per_second": 59.571,
"eval_steps_per_second": 15.163,
"step": 432
},
{
"entropy": 0.5856798651971316,
"epoch": 27.516129032258064,
"grad_norm": 2.773681402206421,
"learning_rate": 2.74375e-05,
"loss": 0.3408,
"mean_token_accuracy": 0.8944279308381834,
"num_tokens": 592320.0,
"step": 440
},
{
"epoch": 28.0,
"eval_entropy": 0.831269736800875,
"eval_loss": 1.8276340961456299,
"eval_mean_token_accuracy": 0.6552059480122158,
"eval_num_tokens": 602756.0,
"eval_runtime": 0.9803,
"eval_samples_per_second": 56.107,
"eval_steps_per_second": 14.282,
"step": 448
},
{
"entropy": 0.5249183703409998,
"epoch": 28.129032258064516,
"grad_norm": 1.7010736465454102,
"learning_rate": 2.80625e-05,
"loss": 0.3165,
"mean_token_accuracy": 0.9029051528165215,
"num_tokens": 605474.0,
"step": 450
},
{
"entropy": 0.4536327484995127,
"epoch": 28.774193548387096,
"grad_norm": 2.132962226867676,
"learning_rate": 2.86875e-05,
"loss": 0.2865,
"mean_token_accuracy": 0.9036249771714211,
"num_tokens": 619457.0,
"step": 460
},
{
"epoch": 29.0,
"eval_entropy": 0.7956289521285466,
"eval_loss": 1.9158949851989746,
"eval_mean_token_accuracy": 0.6526114770344326,
"eval_num_tokens": 624283.0,
"eval_runtime": 0.9267,
"eval_samples_per_second": 59.35,
"eval_steps_per_second": 15.107,
"step": 464
},
{
"entropy": 0.4277557734596102,
"epoch": 29.387096774193548,
"grad_norm": 1.9701188802719116,
"learning_rate": 2.9312500000000004e-05,
"loss": 0.2736,
"mean_token_accuracy": 0.9080228907497305,
"num_tokens": 632664.0,
"step": 470
},
{
"entropy": 0.3978642586030458,
"epoch": 30.0,
"grad_norm": 2.9814867973327637,
"learning_rate": 2.9937500000000003e-05,
"loss": 0.2796,
"mean_token_accuracy": 0.9062309014169794,
"num_tokens": 645810.0,
"step": 480
},
{
"epoch": 30.0,
"eval_entropy": 0.7981852037566048,
"eval_loss": 2.003068447113037,
"eval_mean_token_accuracy": 0.6490011853831155,
"eval_num_tokens": 645810.0,
"eval_runtime": 0.9368,
"eval_samples_per_second": 58.708,
"eval_steps_per_second": 14.944,
"step": 480
},
{
"entropy": 0.37885100245475767,
"epoch": 30.64516129032258,
"grad_norm": 2.112239360809326,
"learning_rate": 3.05625e-05,
"loss": 0.2518,
"mean_token_accuracy": 0.9144969284534454,
"num_tokens": 659784.0,
"step": 490
},
{
"epoch": 31.0,
"eval_entropy": 0.7598817561353955,
"eval_loss": 2.0737624168395996,
"eval_mean_token_accuracy": 0.6507022082805634,
"eval_num_tokens": 667337.0,
"eval_runtime": 0.8916,
"eval_samples_per_second": 61.689,
"eval_steps_per_second": 15.703,
"step": 496
},
{
"entropy": 0.3670440645594346,
"epoch": 31.258064516129032,
"grad_norm": 1.9339967966079712,
"learning_rate": 3.1187500000000006e-05,
"loss": 0.2544,
"mean_token_accuracy": 0.9085111869008917,
"num_tokens": 672944.0,
"step": 500
},
{
"entropy": 0.34540521949529646,
"epoch": 31.903225806451612,
"grad_norm": 2.0099146366119385,
"learning_rate": 3.18125e-05,
"loss": 0.2468,
"mean_token_accuracy": 0.9127625226974487,
"num_tokens": 686848.0,
"step": 510
},
{
"epoch": 32.0,
"eval_entropy": 0.6959893958909171,
"eval_loss": 2.104912519454956,
"eval_mean_token_accuracy": 0.6566033831664494,
"eval_num_tokens": 688864.0,
"eval_runtime": 1.2103,
"eval_samples_per_second": 45.445,
"eval_steps_per_second": 11.568,
"step": 512
},
{
"entropy": 0.3452399529908833,
"epoch": 32.516129032258064,
"grad_norm": 1.8694605827331543,
"learning_rate": 3.24375e-05,
"loss": 0.2461,
"mean_token_accuracy": 0.9124428755358646,
"num_tokens": 699838.0,
"step": 520
},
{
"epoch": 33.0,
"eval_entropy": 0.6757666979517255,
"eval_loss": 2.1449453830718994,
"eval_mean_token_accuracy": 0.6545567682811192,
"eval_num_tokens": 710391.0,
"eval_runtime": 0.912,
"eval_samples_per_second": 60.306,
"eval_steps_per_second": 15.351,
"step": 528
},
{
"entropy": 0.3346626260562947,
"epoch": 33.12903225806452,
"grad_norm": 1.2991915941238403,
"learning_rate": 3.3062500000000004e-05,
"loss": 0.2393,
"mean_token_accuracy": 0.9142316736673054,
"num_tokens": 713076.0,
"step": 530
},
{
"entropy": 0.3058926550671458,
"epoch": 33.774193548387096,
"grad_norm": 2.250917911529541,
"learning_rate": 3.36875e-05,
"loss": 0.2366,
"mean_token_accuracy": 0.9110181450843811,
"num_tokens": 726913.0,
"step": 540
},
{
"epoch": 34.0,
"eval_entropy": 0.713920282466071,
"eval_loss": 2.044567823410034,
"eval_mean_token_accuracy": 0.6525738835334778,
"eval_num_tokens": 731918.0,
"eval_runtime": 0.8817,
"eval_samples_per_second": 62.379,
"eval_steps_per_second": 15.878,
"step": 544
},
{
"entropy": 0.3304567150771618,
"epoch": 34.38709677419355,
"grad_norm": 1.6248284578323364,
"learning_rate": 3.43125e-05,
"loss": 0.234,
"mean_token_accuracy": 0.9141417553550318,
"num_tokens": 740119.0,
"step": 550
},
{
"entropy": 0.2964809501641675,
"epoch": 35.0,
"grad_norm": 2.199978828430176,
"learning_rate": 3.49375e-05,
"loss": 0.2354,
"mean_token_accuracy": 0.9153264400206114,
"num_tokens": 753445.0,
"step": 560
},
{
"epoch": 35.0,
"eval_entropy": 0.6718730543340955,
"eval_loss": 2.131523847579956,
"eval_mean_token_accuracy": 0.6514393900121961,
"eval_num_tokens": 753445.0,
"eval_runtime": 0.919,
"eval_samples_per_second": 59.849,
"eval_steps_per_second": 15.234,
"step": 560
},
{
"entropy": 0.29936634581536054,
"epoch": 35.645161290322584,
"grad_norm": 1.9858863353729248,
"learning_rate": 3.5562500000000004e-05,
"loss": 0.2233,
"mean_token_accuracy": 0.9170688688755035,
"num_tokens": 767352.0,
"step": 570
},
{
"epoch": 36.0,
"eval_entropy": 0.6986751215798515,
"eval_loss": 2.072589874267578,
"eval_mean_token_accuracy": 0.655285677739552,
"eval_num_tokens": 774972.0,
"eval_runtime": 0.9265,
"eval_samples_per_second": 59.365,
"eval_steps_per_second": 15.111,
"step": 576
},
{
"entropy": 0.30756315883052976,
"epoch": 36.25806451612903,
"grad_norm": 1.2706995010375977,
"learning_rate": 3.61875e-05,
"loss": 0.2278,
"mean_token_accuracy": 0.9169254663743471,
"num_tokens": 780722.0,
"step": 580
},
{
"entropy": 0.2909585501998663,
"epoch": 36.903225806451616,
"grad_norm": 2.095874786376953,
"learning_rate": 3.68125e-05,
"loss": 0.2266,
"mean_token_accuracy": 0.9121941901743412,
"num_tokens": 794511.0,
"step": 590
},
{
"epoch": 37.0,
"eval_entropy": 0.636478283575603,
"eval_loss": 2.166316270828247,
"eval_mean_token_accuracy": 0.6568594745227269,
"eval_num_tokens": 796499.0,
"eval_runtime": 0.9013,
"eval_samples_per_second": 61.02,
"eval_steps_per_second": 15.532,
"step": 592
},
{
"entropy": 0.28608485017167895,
"epoch": 37.516129032258064,
"grad_norm": 2.4893622398376465,
"learning_rate": 3.74375e-05,
"loss": 0.218,
"mean_token_accuracy": 0.9165164412636506,
"num_tokens": 807681.0,
"step": 600
},
{
"epoch": 38.0,
"eval_entropy": 0.6342436969280243,
"eval_loss": 2.156416177749634,
"eval_mean_token_accuracy": 0.6568004020622799,
"eval_num_tokens": 818026.0,
"eval_runtime": 0.9354,
"eval_samples_per_second": 58.798,
"eval_steps_per_second": 14.967,
"step": 608
},
{
"entropy": 0.2943071307320344,
"epoch": 38.12903225806452,
"grad_norm": 1.960349202156067,
"learning_rate": 3.8062500000000004e-05,
"loss": 0.2245,
"mean_token_accuracy": 0.9143017130462747,
"num_tokens": 820826.0,
"step": 610
},
{
"entropy": 0.26704031582921745,
"epoch": 38.774193548387096,
"grad_norm": 1.1493830680847168,
"learning_rate": 3.8687500000000005e-05,
"loss": 0.2165,
"mean_token_accuracy": 0.9143977962434292,
"num_tokens": 834709.0,
"step": 620
},
{
"epoch": 39.0,
"eval_entropy": 0.6424140781164169,
"eval_loss": 2.2105581760406494,
"eval_mean_token_accuracy": 0.6563994671617236,
"eval_num_tokens": 839553.0,
"eval_runtime": 0.9226,
"eval_samples_per_second": 59.612,
"eval_steps_per_second": 15.174,
"step": 624
},
{
"entropy": 0.27404083448805305,
"epoch": 39.38709677419355,
"grad_norm": 1.7598483562469482,
"learning_rate": 3.93125e-05,
"loss": 0.2157,
"mean_token_accuracy": 0.9152095623706517,
"num_tokens": 847976.0,
"step": 630
},
{
"entropy": 0.2730099213750739,
"epoch": 40.0,
"grad_norm": 1.9577555656433105,
"learning_rate": 3.99375e-05,
"loss": 0.2216,
"mean_token_accuracy": 0.9135262981841439,
"num_tokens": 861080.0,
"step": 640
},
{
"epoch": 40.0,
"eval_entropy": 0.623557556952749,
"eval_loss": 2.177314043045044,
"eval_mean_token_accuracy": 0.6597372846943992,
"eval_num_tokens": 861080.0,
"eval_runtime": 0.9132,
"eval_samples_per_second": 60.227,
"eval_steps_per_second": 15.33,
"step": 640
},
{
"entropy": 0.26116420738399027,
"epoch": 40.645161290322584,
"grad_norm": 1.4962230920791626,
"learning_rate": 4.0562500000000003e-05,
"loss": 0.2104,
"mean_token_accuracy": 0.9174227572977542,
"num_tokens": 874945.0,
"step": 650
},
{
"epoch": 41.0,
"eval_entropy": 0.6280922591686249,
"eval_loss": 2.182685136795044,
"eval_mean_token_accuracy": 0.6488148740359715,
"eval_num_tokens": 882607.0,
"eval_runtime": 0.9382,
"eval_samples_per_second": 58.625,
"eval_steps_per_second": 14.923,
"step": 656
},
{
"entropy": 0.27049236548574346,
"epoch": 41.25806451612903,
"grad_norm": 1.5585705041885376,
"learning_rate": 4.11875e-05,
"loss": 0.2172,
"mean_token_accuracy": 0.9116643212343517,
"num_tokens": 888188.0,
"step": 660
},
{
"entropy": 0.25881535150110724,
"epoch": 41.903225806451616,
"grad_norm": 1.7957926988601685,
"learning_rate": 4.181250000000001e-05,
"loss": 0.2171,
"mean_token_accuracy": 0.9126927703619003,
"num_tokens": 902034.0,
"step": 670
},
{
"epoch": 42.0,
"eval_entropy": 0.6354757377079555,
"eval_loss": 2.1949057579040527,
"eval_mean_token_accuracy": 0.651188816343035,
"eval_num_tokens": 904134.0,
"eval_runtime": 0.8942,
"eval_samples_per_second": 61.51,
"eval_steps_per_second": 15.657,
"step": 672
},
{
"entropy": 0.2616837781510855,
"epoch": 42.516129032258064,
"grad_norm": 1.9319422245025635,
"learning_rate": 4.24375e-05,
"loss": 0.2111,
"mean_token_accuracy": 0.9157685621788627,
"num_tokens": 915303.0,
"step": 680
},
{
"epoch": 43.0,
"eval_entropy": 0.615446959223066,
"eval_loss": 2.2043023109436035,
"eval_mean_token_accuracy": 0.6558100581169128,
"eval_num_tokens": 925661.0,
"eval_runtime": 0.9252,
"eval_samples_per_second": 59.445,
"eval_steps_per_second": 15.131,
"step": 688
},
{
"entropy": 0.25998294510339437,
"epoch": 43.12903225806452,
"grad_norm": 2.184018850326538,
"learning_rate": 4.30625e-05,
"loss": 0.2175,
"mean_token_accuracy": 0.914588484324907,
"num_tokens": 928440.0,
"step": 690
},
{
"entropy": 0.25247995406389234,
"epoch": 43.774193548387096,
"grad_norm": 2.9504449367523193,
"learning_rate": 4.3687500000000005e-05,
"loss": 0.216,
"mean_token_accuracy": 0.9170804493129253,
"num_tokens": 942357.0,
"step": 700
},
{
"epoch": 44.0,
"eval_entropy": 0.6387277500970023,
"eval_loss": 2.154686212539673,
"eval_mean_token_accuracy": 0.6561333068779537,
"eval_num_tokens": 947188.0,
"eval_runtime": 0.912,
"eval_samples_per_second": 60.307,
"eval_steps_per_second": 15.351,
"step": 704
},
{
"entropy": 0.26556668744275447,
"epoch": 44.38709677419355,
"grad_norm": 2.3016467094421387,
"learning_rate": 4.43125e-05,
"loss": 0.2129,
"mean_token_accuracy": 0.9151790534195147,
"num_tokens": 955541.0,
"step": 710
},
{
"entropy": 0.25153220680199173,
"epoch": 45.0,
"grad_norm": 1.5553311109542847,
"learning_rate": 4.49375e-05,
"loss": 0.2197,
"mean_token_accuracy": 0.912982240319252,
"num_tokens": 968715.0,
"step": 720
},
{
"epoch": 45.0,
"eval_entropy": 0.6276453903743199,
"eval_loss": 2.186168670654297,
"eval_mean_token_accuracy": 0.6584120520523616,
"eval_num_tokens": 968715.0,
"eval_runtime": 0.9005,
"eval_samples_per_second": 61.074,
"eval_steps_per_second": 15.546,
"step": 720
},
{
"entropy": 0.2620750930160284,
"epoch": 45.645161290322584,
"grad_norm": 2.157158136367798,
"learning_rate": 4.55625e-05,
"loss": 0.2042,
"mean_token_accuracy": 0.9174154184758663,
"num_tokens": 982594.0,
"step": 730
},
{
"epoch": 46.0,
"eval_entropy": 0.6001809579985482,
"eval_loss": 2.269158124923706,
"eval_mean_token_accuracy": 0.6551194148404258,
"eval_num_tokens": 990242.0,
"eval_runtime": 0.9149,
"eval_samples_per_second": 60.114,
"eval_steps_per_second": 15.302,
"step": 736
},
{
"entropy": 0.24646662116834991,
"epoch": 46.25806451612903,
"grad_norm": 0.8333325982093811,
"learning_rate": 4.61875e-05,
"loss": 0.2186,
"mean_token_accuracy": 0.9161424346660313,
"num_tokens": 995750.0,
"step": 740
},
{
"entropy": 0.25320138819515703,
"epoch": 46.903225806451616,
"grad_norm": 1.5136183500289917,
"learning_rate": 4.6812500000000006e-05,
"loss": 0.212,
"mean_token_accuracy": 0.9146950207650661,
"num_tokens": 1009725.0,
"step": 750
},
{
"epoch": 47.0,
"eval_entropy": 0.6296457903725761,
"eval_loss": 2.1639528274536133,
"eval_mean_token_accuracy": 0.6583362604890551,
"eval_num_tokens": 1011769.0,
"eval_runtime": 0.8933,
"eval_samples_per_second": 61.57,
"eval_steps_per_second": 15.672,
"step": 752
},
{
"entropy": 0.25277749547048617,
"epoch": 47.516129032258064,
"grad_norm": 2.703397512435913,
"learning_rate": 4.74375e-05,
"loss": 0.2107,
"mean_token_accuracy": 0.9154670128696843,
"num_tokens": 1022922.0,
"step": 760
},
{
"epoch": 48.0,
"eval_entropy": 0.6431450226477214,
"eval_loss": 2.113975763320923,
"eval_mean_token_accuracy": 0.6571915745735168,
"eval_num_tokens": 1033296.0,
"eval_runtime": 1.4606,
"eval_samples_per_second": 37.656,
"eval_steps_per_second": 9.585,
"step": 768
},
{
"entropy": 0.2505563164227887,
"epoch": 48.12903225806452,
"grad_norm": 1.7070248126983643,
"learning_rate": 4.80625e-05,
"loss": 0.2181,
"mean_token_accuracy": 0.9125990342152747,
"num_tokens": 1036124.0,
"step": 770
},
{
"entropy": 0.2494984647259116,
"epoch": 48.774193548387096,
"grad_norm": 2.354995012283325,
"learning_rate": 4.8687500000000004e-05,
"loss": 0.2069,
"mean_token_accuracy": 0.9165523618459701,
"num_tokens": 1050032.0,
"step": 780
},
{
"epoch": 49.0,
"eval_entropy": 0.603933504649571,
"eval_loss": 2.2200510501861572,
"eval_mean_token_accuracy": 0.6548148649079459,
"eval_num_tokens": 1054823.0,
"eval_runtime": 0.9123,
"eval_samples_per_second": 60.286,
"eval_steps_per_second": 15.346,
"step": 784
},
{
"entropy": 0.24296215019728007,
"epoch": 49.38709677419355,
"grad_norm": 1.5552977323532104,
"learning_rate": 4.93125e-05,
"loss": 0.2083,
"mean_token_accuracy": 0.9156087629104915,
"num_tokens": 1063159.0,
"step": 790
},
{
"entropy": 0.24393935266293978,
"epoch": 50.0,
"grad_norm": 1.544976830482483,
"learning_rate": 4.99375e-05,
"loss": 0.2157,
"mean_token_accuracy": 0.9146833968789954,
"num_tokens": 1076350.0,
"step": 800
},
{
"epoch": 50.0,
"eval_entropy": 0.6158908797161919,
"eval_loss": 2.2002458572387695,
"eval_mean_token_accuracy": 0.6551659618105207,
"eval_num_tokens": 1076350.0,
"eval_runtime": 0.9028,
"eval_samples_per_second": 60.923,
"eval_steps_per_second": 15.508,
"step": 800
},
{
"entropy": 0.24434087462723256,
"epoch": 50.645161290322584,
"grad_norm": 1.5835460424423218,
"learning_rate": 5.05625e-05,
"loss": 0.2053,
"mean_token_accuracy": 0.9157114021480084,
"num_tokens": 1090170.0,
"step": 810
},
{
"epoch": 51.0,
"eval_entropy": 0.6001614396061216,
"eval_loss": 2.134579658508301,
"eval_mean_token_accuracy": 0.6620945462158748,
"eval_num_tokens": 1097877.0,
"eval_runtime": 0.9182,
"eval_samples_per_second": 59.901,
"eval_steps_per_second": 15.248,
"step": 816
},
{
"entropy": 0.2386562437995484,
"epoch": 51.25806451612903,
"grad_norm": 0.8457896709442139,
"learning_rate": 5.11875e-05,
"loss": 0.2112,
"mean_token_accuracy": 0.9135481056414152,
"num_tokens": 1103447.0,
"step": 820
},
{
"entropy": 0.23914105109870434,
"epoch": 51.903225806451616,
"grad_norm": 1.2757948637008667,
"learning_rate": 5.18125e-05,
"loss": 0.2121,
"mean_token_accuracy": 0.917195787280798,
"num_tokens": 1117426.0,
"step": 830
},
{
"epoch": 52.0,
"eval_entropy": 0.6030709551913398,
"eval_loss": 2.1421921253204346,
"eval_mean_token_accuracy": 0.6576450892857143,
"eval_num_tokens": 1119404.0,
"eval_runtime": 0.9157,
"eval_samples_per_second": 60.065,
"eval_steps_per_second": 15.289,
"step": 832
},
{
"entropy": 0.23953668361431674,
"epoch": 52.516129032258064,
"grad_norm": 0.7424585223197937,
"learning_rate": 5.243750000000001e-05,
"loss": 0.206,
"mean_token_accuracy": 0.9173323300323988,
"num_tokens": 1130592.0,
"step": 840
},
{
"epoch": 53.0,
"eval_entropy": 0.6143183495317187,
"eval_loss": 2.136685609817505,
"eval_mean_token_accuracy": 0.661886956010546,
"eval_num_tokens": 1140931.0,
"eval_runtime": 0.9082,
"eval_samples_per_second": 60.559,
"eval_steps_per_second": 15.415,
"step": 848
},
{
"entropy": 0.24182377598787608,
"epoch": 53.12903225806452,
"grad_norm": 1.3732322454452515,
"learning_rate": 5.30625e-05,
"loss": 0.2121,
"mean_token_accuracy": 0.9144565549335981,
"num_tokens": 1143685.0,
"step": 850
},
{
"entropy": 0.22879959754645823,
"epoch": 53.774193548387096,
"grad_norm": 2.147244453430176,
"learning_rate": 5.3687500000000004e-05,
"loss": 0.2054,
"mean_token_accuracy": 0.9151782430708408,
"num_tokens": 1157611.0,
"step": 860
},
{
"epoch": 54.0,
"eval_entropy": 0.6321406619889396,
"eval_loss": 2.204887628555298,
"eval_mean_token_accuracy": 0.6532182906355176,
"eval_num_tokens": 1162458.0,
"eval_runtime": 0.9208,
"eval_samples_per_second": 59.732,
"eval_steps_per_second": 15.205,
"step": 864
},
{
"entropy": 0.23761214551172757,
"epoch": 54.38709677419355,
"grad_norm": 1.3003839254379272,
"learning_rate": 5.43125e-05,
"loss": 0.2035,
"mean_token_accuracy": 0.9193885200902036,
"num_tokens": 1170903.0,
"step": 870
},
{
"entropy": 0.2340365965899668,
"epoch": 55.0,
"grad_norm": 2.1319892406463623,
"learning_rate": 5.49375e-05,
"loss": 0.2094,
"mean_token_accuracy": 0.9123364067391345,
"num_tokens": 1183985.0,
"step": 880
},
{
"epoch": 55.0,
"eval_entropy": 0.5765226589781898,
"eval_loss": 2.259164333343506,
"eval_mean_token_accuracy": 0.6540640251977103,
"eval_num_tokens": 1183985.0,
"eval_runtime": 1.3389,
"eval_samples_per_second": 41.077,
"eval_steps_per_second": 10.456,
"step": 880
},
{
"entropy": 0.22173939775675536,
"epoch": 55.645161290322584,
"grad_norm": 0.9774155616760254,
"learning_rate": 5.556250000000001e-05,
"loss": 0.2028,
"mean_token_accuracy": 0.9162920407950879,
"num_tokens": 1197803.0,
"step": 890
},
{
"epoch": 56.0,
"eval_entropy": 0.6301779236112323,
"eval_loss": 2.164349317550659,
"eval_mean_token_accuracy": 0.6591020779950278,
"eval_num_tokens": 1205512.0,
"eval_runtime": 0.8858,
"eval_samples_per_second": 62.094,
"eval_steps_per_second": 15.806,
"step": 896
},
{
"entropy": 0.24067558975596176,
"epoch": 56.25806451612903,
"grad_norm": 0.6777637004852295,
"learning_rate": 5.6187500000000004e-05,
"loss": 0.2047,
"mean_token_accuracy": 0.9156502038240433,
"num_tokens": 1211040.0,
"step": 900
},
{
"entropy": 0.22286444082856177,
"epoch": 56.903225806451616,
"grad_norm": 1.1683127880096436,
"learning_rate": 5.68125e-05,
"loss": 0.2092,
"mean_token_accuracy": 0.9164877288043499,
"num_tokens": 1224951.0,
"step": 910
},
{
"epoch": 57.0,
"eval_entropy": 0.6137891731091908,
"eval_loss": 2.166926622390747,
"eval_mean_token_accuracy": 0.655881038733891,
"eval_num_tokens": 1227039.0,
"eval_runtime": 0.9144,
"eval_samples_per_second": 60.149,
"eval_steps_per_second": 15.311,
"step": 912
},
{
"entropy": 0.22742715889686033,
"epoch": 57.516129032258064,
"grad_norm": 1.949874997138977,
"learning_rate": 5.74375e-05,
"loss": 0.1963,
"mean_token_accuracy": 0.9188078069373181,
"num_tokens": 1238175.0,
"step": 920
},
{
"epoch": 58.0,
"eval_entropy": 0.6465144370283399,
"eval_loss": 2.1403896808624268,
"eval_mean_token_accuracy": 0.6535507994038718,
"eval_num_tokens": 1248566.0,
"eval_runtime": 0.9857,
"eval_samples_per_second": 55.798,
"eval_steps_per_second": 14.203,
"step": 928
},
{
"entropy": 0.2348696542413611,
"epoch": 58.12903225806452,
"grad_norm": 0.7200958728790283,
"learning_rate": 5.8062499999999995e-05,
"loss": 0.2062,
"mean_token_accuracy": 0.9138364462476027,
"num_tokens": 1251370.0,
"step": 930
},
{
"entropy": 0.23166095688939095,
"epoch": 58.774193548387096,
"grad_norm": 2.3401575088500977,
"learning_rate": 5.8687500000000003e-05,
"loss": 0.2022,
"mean_token_accuracy": 0.9133141487836838,
"num_tokens": 1265252.0,
"step": 940
},
{
"epoch": 59.0,
"eval_entropy": 0.5926203238112586,
"eval_loss": 2.1573755741119385,
"eval_mean_token_accuracy": 0.6642243266105652,
"eval_num_tokens": 1270093.0,
"eval_runtime": 1.3512,
"eval_samples_per_second": 40.706,
"eval_steps_per_second": 10.362,
"step": 944
},
{
"entropy": 0.228854532304563,
"epoch": 59.38709677419355,
"grad_norm": 1.2065060138702393,
"learning_rate": 5.9312500000000005e-05,
"loss": 0.204,
"mean_token_accuracy": 0.915436535289413,
"num_tokens": 1278422.0,
"step": 950
},
{
"entropy": 0.2283171534930405,
"epoch": 60.0,
"grad_norm": 1.6328575611114502,
"learning_rate": 5.99375e-05,
"loss": 0.2106,
"mean_token_accuracy": 0.9148200319001549,
"num_tokens": 1291620.0,
"step": 960
},
{
"epoch": 60.0,
"eval_entropy": 0.6066372990608215,
"eval_loss": 2.2142343521118164,
"eval_mean_token_accuracy": 0.663521830524717,
"eval_num_tokens": 1291620.0,
"eval_runtime": 0.918,
"eval_samples_per_second": 59.916,
"eval_steps_per_second": 15.251,
"step": 960
},
{
"entropy": 0.22587883714586496,
"epoch": 60.645161290322584,
"grad_norm": 1.1729897260665894,
"learning_rate": 6.05625e-05,
"loss": 0.2004,
"mean_token_accuracy": 0.918365728110075,
"num_tokens": 1305660.0,
"step": 970
},
{
"epoch": 61.0,
"eval_entropy": 0.6117608717509678,
"eval_loss": 2.096813678741455,
"eval_mean_token_accuracy": 0.6604258716106415,
"eval_num_tokens": 1313147.0,
"eval_runtime": 0.9077,
"eval_samples_per_second": 60.594,
"eval_steps_per_second": 15.424,
"step": 976
},
{
"entropy": 0.23531277536561615,
"epoch": 61.25806451612903,
"grad_norm": 0.6755152344703674,
"learning_rate": 6.11875e-05,
"loss": 0.2082,
"mean_token_accuracy": 0.9143172777012775,
"num_tokens": 1318734.0,
"step": 980
},
{
"entropy": 0.2311376605182886,
"epoch": 61.903225806451616,
"grad_norm": 1.2641390562057495,
"learning_rate": 6.18125e-05,
"loss": 0.2046,
"mean_token_accuracy": 0.9152424365282059,
"num_tokens": 1332705.0,
"step": 990
},
{
"epoch": 62.0,
"eval_entropy": 0.614239479814257,
"eval_loss": 2.1853811740875244,
"eval_mean_token_accuracy": 0.6579888761043549,
"eval_num_tokens": 1334674.0,
"eval_runtime": 0.912,
"eval_samples_per_second": 60.308,
"eval_steps_per_second": 15.351,
"step": 992
},
{
"entropy": 0.22623609006404877,
"epoch": 62.516129032258064,
"grad_norm": 0.73882657289505,
"learning_rate": 6.24375e-05,
"loss": 0.196,
"mean_token_accuracy": 0.9183140248060226,
"num_tokens": 1345954.0,
"step": 1000
},
{
"epoch": 63.0,
"eval_entropy": 0.5942062480109078,
"eval_loss": 2.2710092067718506,
"eval_mean_token_accuracy": 0.6534528051103864,
"eval_num_tokens": 1356201.0,
"eval_runtime": 0.9816,
"eval_samples_per_second": 56.034,
"eval_steps_per_second": 14.263,
"step": 1008
},
{
"entropy": 0.23058188216466652,
"epoch": 63.12903225806452,
"grad_norm": 1.617550253868103,
"learning_rate": 6.306250000000001e-05,
"loss": 0.2061,
"mean_token_accuracy": 0.9149901968868155,
"num_tokens": 1358961.0,
"step": 1010
},
{
"entropy": 0.2263046816922724,
"epoch": 63.774193548387096,
"grad_norm": 1.7583829164505005,
"learning_rate": 6.36875e-05,
"loss": 0.1997,
"mean_token_accuracy": 0.9183584488928318,
"num_tokens": 1372891.0,
"step": 1020
},
{
"epoch": 64.0,
"eval_entropy": 0.5932143756321498,
"eval_loss": 2.1655139923095703,
"eval_mean_token_accuracy": 0.659838148525783,
"eval_num_tokens": 1377728.0,
"eval_runtime": 0.8899,
"eval_samples_per_second": 61.807,
"eval_steps_per_second": 15.733,
"step": 1024
}
],
"logging_steps": 10,
"max_steps": 16000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1000,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.765547267067904e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}