diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4249 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.18047073354834284, + "eval_steps": 10, + "global_step": 2630, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.862005077883757e-05, + "grad_norm": 1.9216967821121216, + "learning_rate": 0.0, + "loss": 1.6865, + "step": 1 + }, + { + "epoch": 0.0006862005077883758, + "grad_norm": 1.8931514024734497, + "learning_rate": 8.234217749313815e-07, + "loss": 1.7993, + "step": 10 + }, + { + "epoch": 0.0006862005077883758, + "eval_accuracy": 0.5898158335459887, + "eval_loss": 1.8085482120513916, + "eval_runtime": 2190.4949, + "eval_samples_per_second": 13.441, + "eval_steps_per_second": 1.68, + "step": 10 + }, + { + "epoch": 0.0013724010155767516, + "grad_norm": 1.8920422792434692, + "learning_rate": 1.7383348581884722e-06, + "loss": 1.8431, + "step": 20 + }, + { + "epoch": 0.0013724010155767516, + "eval_accuracy": 0.5915482852347239, + "eval_loss": 1.7978498935699463, + "eval_runtime": 2191.0867, + "eval_samples_per_second": 13.437, + "eval_steps_per_second": 1.68, + "step": 20 + }, + { + "epoch": 0.0020586015233651275, + "grad_norm": 1.7306218147277832, + "learning_rate": 2.653247941445563e-06, + "loss": 1.7786, + "step": 30 + }, + { + "epoch": 0.0020586015233651275, + "eval_accuracy": 0.5931407264065678, + "eval_loss": 1.7860997915267944, + "eval_runtime": 2192.9273, + "eval_samples_per_second": 13.426, + "eval_steps_per_second": 1.679, + "step": 30 + }, + { + "epoch": 0.002744802031153503, + "grad_norm": 1.678896427154541, + "learning_rate": 3.5681610247026537e-06, + "loss": 1.7845, + "step": 40 + }, + { + "epoch": 0.002744802031153503, + "eval_accuracy": 0.5940313221156256, + "eval_loss": 1.777963638305664, + "eval_runtime": 2193.6313, + "eval_samples_per_second": 13.422, + "eval_steps_per_second": 1.678, + "step": 40 + }, + { + "epoch": 0.003431002538941879, + "grad_norm": 1.608749270439148, + "learning_rate": 4.483074107959744e-06, + "loss": 1.7561, + "step": 50 + }, + { + "epoch": 0.003431002538941879, + "eval_accuracy": 0.5950869622352757, + "eval_loss": 1.7716691493988037, + "eval_runtime": 2195.0575, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 50 + }, + { + "epoch": 0.004117203046730255, + "grad_norm": 1.5839513540267944, + "learning_rate": 5.397987191216835e-06, + "loss": 1.7809, + "step": 60 + }, + { + "epoch": 0.004117203046730255, + "eval_accuracy": 0.5955689808940925, + "eval_loss": 1.767722725868225, + "eval_runtime": 2194.6298, + "eval_samples_per_second": 13.415, + "eval_steps_per_second": 1.677, + "step": 60 + }, + { + "epoch": 0.00480340355451863, + "grad_norm": 1.6457411050796509, + "learning_rate": 6.312900274473925e-06, + "loss": 1.7554, + "step": 70 + }, + { + "epoch": 0.00480340355451863, + "eval_accuracy": 0.5958804437015288, + "eval_loss": 1.765299916267395, + "eval_runtime": 2194.3885, + "eval_samples_per_second": 13.417, + "eval_steps_per_second": 1.677, + "step": 70 + }, + { + "epoch": 0.005489604062307006, + "grad_norm": 1.5799423456192017, + "learning_rate": 7.227813357731015e-06, + "loss": 1.7361, + "step": 80 + }, + { + "epoch": 0.005489604062307006, + "eval_accuracy": 0.5960319751217547, + "eval_loss": 1.7639434337615967, + "eval_runtime": 2195.254, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 80 + }, + { + "epoch": 0.0061758045700953815, + "grad_norm": 1.7162026166915894, + "learning_rate": 8.142726440988106e-06, + "loss": 1.7427, + "step": 90 + }, + { + "epoch": 0.0061758045700953815, + "eval_accuracy": 0.5961102309406575, + "eval_loss": 1.7636356353759766, + "eval_runtime": 2195.6998, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 90 + }, + { + "epoch": 0.006862005077883758, + "grad_norm": 1.703099012374878, + "learning_rate": 9.057639524245198e-06, + "loss": 1.7479, + "step": 100 + }, + { + "epoch": 0.006862005077883758, + "eval_accuracy": 0.5962347363801508, + "eval_loss": 1.7628475427627563, + "eval_runtime": 2195.4873, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 100 + }, + { + "epoch": 0.007548205585672134, + "grad_norm": 1.5629897117614746, + "learning_rate": 9.972552607502288e-06, + "loss": 1.7511, + "step": 110 + }, + { + "epoch": 0.007548205585672134, + "eval_accuracy": 0.5963783990566007, + "eval_loss": 1.761244535446167, + "eval_runtime": 2194.8861, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 110 + }, + { + "epoch": 0.00823440609346051, + "grad_norm": 1.8800327777862549, + "learning_rate": 1.0887465690759378e-05, + "loss": 1.7918, + "step": 120 + }, + { + "epoch": 0.00823440609346051, + "eval_accuracy": 0.595912250691138, + "eval_loss": 1.7631102800369263, + "eval_runtime": 2195.8634, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 120 + }, + { + "epoch": 0.008920606601248885, + "grad_norm": 1.6483738422393799, + "learning_rate": 1.180237877401647e-05, + "loss": 1.7451, + "step": 130 + }, + { + "epoch": 0.008920606601248885, + "eval_accuracy": 0.5959484734736679, + "eval_loss": 1.7631044387817383, + "eval_runtime": 2194.7206, + "eval_samples_per_second": 13.415, + "eval_steps_per_second": 1.677, + "step": 130 + }, + { + "epoch": 0.00960680710903726, + "grad_norm": 1.5531351566314697, + "learning_rate": 1.2717291857273558e-05, + "loss": 1.8069, + "step": 140 + }, + { + "epoch": 0.00960680710903726, + "eval_accuracy": 0.595846910236492, + "eval_loss": 1.7640652656555176, + "eval_runtime": 2194.5306, + "eval_samples_per_second": 13.416, + "eval_steps_per_second": 1.677, + "step": 140 + }, + { + "epoch": 0.010293007616825637, + "grad_norm": 1.653224229812622, + "learning_rate": 1.3632204940530649e-05, + "loss": 1.774, + "step": 150 + }, + { + "epoch": 0.010293007616825637, + "eval_accuracy": 0.5955285083258945, + "eval_loss": 1.76512610912323, + "eval_runtime": 2194.8618, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 150 + }, + { + "epoch": 0.010979208124614013, + "grad_norm": 1.4969476461410522, + "learning_rate": 1.454711802378774e-05, + "loss": 1.7293, + "step": 160 + }, + { + "epoch": 0.010979208124614013, + "eval_accuracy": 0.5952186723895343, + "eval_loss": 1.7661479711532593, + "eval_runtime": 2195.9262, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 160 + }, + { + "epoch": 0.011665408632402388, + "grad_norm": 1.5510554313659668, + "learning_rate": 1.546203110704483e-05, + "loss": 1.8144, + "step": 170 + }, + { + "epoch": 0.011665408632402388, + "eval_accuracy": 0.5952723923364943, + "eval_loss": 1.7658929824829102, + "eval_runtime": 2195.7289, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 170 + }, + { + "epoch": 0.012351609140190763, + "grad_norm": 1.5824214220046997, + "learning_rate": 1.6376944190301923e-05, + "loss": 1.762, + "step": 180 + }, + { + "epoch": 0.012351609140190763, + "eval_accuracy": 0.5950958270225676, + "eval_loss": 1.7668511867523193, + "eval_runtime": 2196.0319, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 180 + }, + { + "epoch": 0.01303780964797914, + "grad_norm": 1.536055088043213, + "learning_rate": 1.7291857273559013e-05, + "loss": 1.7647, + "step": 190 + }, + { + "epoch": 0.01303780964797914, + "eval_accuracy": 0.5946427600286144, + "eval_loss": 1.7685387134552002, + "eval_runtime": 2194.9602, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 190 + }, + { + "epoch": 0.013724010155767515, + "grad_norm": 1.5057470798492432, + "learning_rate": 1.8206770356816103e-05, + "loss": 1.7745, + "step": 200 + }, + { + "epoch": 0.013724010155767515, + "eval_accuracy": 0.5946046447634041, + "eval_loss": 1.7696959972381592, + "eval_runtime": 2194.9716, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 200 + }, + { + "epoch": 0.01441021066355589, + "grad_norm": 1.3596829175949097, + "learning_rate": 1.9121683440073193e-05, + "loss": 1.7936, + "step": 210 + }, + { + "epoch": 0.01441021066355589, + "eval_accuracy": 0.5943018475345566, + "eval_loss": 1.7707691192626953, + "eval_runtime": 2195.7035, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 210 + }, + { + "epoch": 0.015096411171344267, + "grad_norm": 1.5187965631484985, + "learning_rate": 2.0036596523330283e-05, + "loss": 1.7726, + "step": 220 + }, + { + "epoch": 0.015096411171344267, + "eval_accuracy": 0.5939556892113148, + "eval_loss": 1.7723795175552368, + "eval_runtime": 2196.0461, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 220 + }, + { + "epoch": 0.01578261167913264, + "grad_norm": 1.4719396829605103, + "learning_rate": 2.0951509606587374e-05, + "loss": 1.7646, + "step": 230 + }, + { + "epoch": 0.01578261167913264, + "eval_accuracy": 0.5935164340207827, + "eval_loss": 1.7753576040267944, + "eval_runtime": 2196.1999, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 230 + }, + { + "epoch": 0.01646881218692102, + "grad_norm": 1.4845266342163086, + "learning_rate": 2.1866422689844464e-05, + "loss": 1.7424, + "step": 240 + }, + { + "epoch": 0.01646881218692102, + "eval_accuracy": 0.5930875044813658, + "eval_loss": 1.777539849281311, + "eval_runtime": 2196.2424, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 240 + }, + { + "epoch": 0.017155012694709395, + "grad_norm": 1.3525185585021973, + "learning_rate": 2.2781335773101558e-05, + "loss": 1.7129, + "step": 250 + }, + { + "epoch": 0.017155012694709395, + "eval_accuracy": 0.5927156814368632, + "eval_loss": 1.7804397344589233, + "eval_runtime": 2195.161, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 250 + }, + { + "epoch": 0.01784121320249777, + "grad_norm": 1.3116035461425781, + "learning_rate": 2.3696248856358648e-05, + "loss": 1.7057, + "step": 260 + }, + { + "epoch": 0.01784121320249777, + "eval_accuracy": 0.5927907831179655, + "eval_loss": 1.77951180934906, + "eval_runtime": 2195.0811, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 260 + }, + { + "epoch": 0.018527413710286145, + "grad_norm": 1.427150011062622, + "learning_rate": 2.4611161939615738e-05, + "loss": 1.7832, + "step": 270 + }, + { + "epoch": 0.018527413710286145, + "eval_accuracy": 0.5925692298385685, + "eval_loss": 1.78168523311615, + "eval_runtime": 2195.7308, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 270 + }, + { + "epoch": 0.01921361421807452, + "grad_norm": 1.4132215976715088, + "learning_rate": 2.552607502287283e-05, + "loss": 1.7742, + "step": 280 + }, + { + "epoch": 0.01921361421807452, + "eval_accuracy": 0.5916866688805393, + "eval_loss": 1.7839642763137817, + "eval_runtime": 2194.9088, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 280 + }, + { + "epoch": 0.019899814725862896, + "grad_norm": 1.3926459550857544, + "learning_rate": 2.6440988106129922e-05, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.019899814725862896, + "eval_accuracy": 0.592083725027446, + "eval_loss": 1.784454345703125, + "eval_runtime": 2195.1648, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 290 + }, + { + "epoch": 0.020586015233651275, + "grad_norm": 1.2985360622406006, + "learning_rate": 2.7355901189387012e-05, + "loss": 1.8099, + "step": 300 + }, + { + "epoch": 0.020586015233651275, + "eval_accuracy": 0.5915102363724148, + "eval_loss": 1.7870558500289917, + "eval_runtime": 2196.189, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 300 + }, + { + "epoch": 0.02127221574143965, + "grad_norm": 1.3202718496322632, + "learning_rate": 2.8270814272644102e-05, + "loss": 1.8158, + "step": 310 + }, + { + "epoch": 0.02127221574143965, + "eval_accuracy": 0.5911451532223702, + "eval_loss": 1.789017677307129, + "eval_runtime": 2195.7623, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 310 + }, + { + "epoch": 0.021958416249228025, + "grad_norm": 1.3661465644836426, + "learning_rate": 2.9185727355901192e-05, + "loss": 1.7867, + "step": 320 + }, + { + "epoch": 0.021958416249228025, + "eval_accuracy": 0.5906047664135189, + "eval_loss": 1.7920600175857544, + "eval_runtime": 2196.3235, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 320 + }, + { + "epoch": 0.0226446167570164, + "grad_norm": 1.2746013402938843, + "learning_rate": 3.0100640439158283e-05, + "loss": 1.7356, + "step": 330 + }, + { + "epoch": 0.0226446167570164, + "eval_accuracy": 0.5906917210124609, + "eval_loss": 1.7915929555892944, + "eval_runtime": 2196.2755, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 330 + }, + { + "epoch": 0.023330817264804776, + "grad_norm": 1.3848775625228882, + "learning_rate": 3.101555352241537e-05, + "loss": 1.7845, + "step": 340 + }, + { + "epoch": 0.023330817264804776, + "eval_accuracy": 0.5902880577768986, + "eval_loss": 1.7954319715499878, + "eval_runtime": 2195.6581, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 340 + }, + { + "epoch": 0.02401701777259315, + "grad_norm": 1.3434277772903442, + "learning_rate": 3.193046660567246e-05, + "loss": 1.8831, + "step": 350 + }, + { + "epoch": 0.02401701777259315, + "eval_accuracy": 0.5896593219081829, + "eval_loss": 1.7972683906555176, + "eval_runtime": 2196.3435, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 350 + }, + { + "epoch": 0.024703218280381526, + "grad_norm": 1.266026258468628, + "learning_rate": 3.284537968892955e-05, + "loss": 1.7802, + "step": 360 + }, + { + "epoch": 0.024703218280381526, + "eval_accuracy": 0.5896452112917071, + "eval_loss": 1.7977614402770996, + "eval_runtime": 2194.9145, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 360 + }, + { + "epoch": 0.025389418788169905, + "grad_norm": 1.2292088270187378, + "learning_rate": 3.3760292772186643e-05, + "loss": 1.8164, + "step": 370 + }, + { + "epoch": 0.025389418788169905, + "eval_accuracy": 0.5890495108662703, + "eval_loss": 1.8011078834533691, + "eval_runtime": 2195.3229, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 370 + }, + { + "epoch": 0.02607561929595828, + "grad_norm": 1.2202467918395996, + "learning_rate": 3.467520585544373e-05, + "loss": 1.8117, + "step": 380 + }, + { + "epoch": 0.02607561929595828, + "eval_accuracy": 0.5890773336818158, + "eval_loss": 1.8025044202804565, + "eval_runtime": 2195.1721, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 380 + }, + { + "epoch": 0.026761819803746655, + "grad_norm": 1.2751961946487427, + "learning_rate": 3.5590118938700824e-05, + "loss": 1.8681, + "step": 390 + }, + { + "epoch": 0.026761819803746655, + "eval_accuracy": 0.5889283255718303, + "eval_loss": 1.8029353618621826, + "eval_runtime": 2194.9883, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 390 + }, + { + "epoch": 0.02744802031153503, + "grad_norm": 1.1804041862487793, + "learning_rate": 3.650503202195791e-05, + "loss": 1.7925, + "step": 400 + }, + { + "epoch": 0.02744802031153503, + "eval_accuracy": 0.5888150754240672, + "eval_loss": 1.8044122457504272, + "eval_runtime": 2195.7663, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 400 + }, + { + "epoch": 0.028134220819323406, + "grad_norm": 1.2452822923660278, + "learning_rate": 3.7419945105215004e-05, + "loss": 1.8606, + "step": 410 + }, + { + "epoch": 0.028134220819323406, + "eval_accuracy": 0.5877631206654261, + "eval_loss": 1.8087084293365479, + "eval_runtime": 2195.4238, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 410 + }, + { + "epoch": 0.02882042132711178, + "grad_norm": 1.3155659437179565, + "learning_rate": 3.83348581884721e-05, + "loss": 1.7495, + "step": 420 + }, + { + "epoch": 0.02882042132711178, + "eval_accuracy": 0.5877042876950842, + "eval_loss": 1.8115092515945435, + "eval_runtime": 2195.0116, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 420 + }, + { + "epoch": 0.029506621834900156, + "grad_norm": 1.2945083379745483, + "learning_rate": 3.9249771271729185e-05, + "loss": 1.8207, + "step": 430 + }, + { + "epoch": 0.029506621834900156, + "eval_accuracy": 0.5871699103487793, + "eval_loss": 1.812555193901062, + "eval_runtime": 2196.4551, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.676, + "step": 430 + }, + { + "epoch": 0.030192822342688535, + "grad_norm": 1.2076581716537476, + "learning_rate": 4.016468435498628e-05, + "loss": 1.8009, + "step": 440 + }, + { + "epoch": 0.030192822342688535, + "eval_accuracy": 0.5872769186238423, + "eval_loss": 1.8119513988494873, + "eval_runtime": 2195.9383, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 440 + }, + { + "epoch": 0.03087902285047691, + "grad_norm": 1.2269212007522583, + "learning_rate": 4.1079597438243365e-05, + "loss": 1.8264, + "step": 450 + }, + { + "epoch": 0.03087902285047691, + "eval_accuracy": 0.5867487167473362, + "eval_loss": 1.815908670425415, + "eval_runtime": 2195.2403, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 450 + }, + { + "epoch": 0.03156522335826528, + "grad_norm": 1.2208211421966553, + "learning_rate": 4.199451052150046e-05, + "loss": 1.8048, + "step": 460 + }, + { + "epoch": 0.03156522335826528, + "eval_accuracy": 0.5865989450039885, + "eval_loss": 1.8151869773864746, + "eval_runtime": 2196.2404, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 460 + }, + { + "epoch": 0.03225142386605366, + "grad_norm": 1.1229742765426636, + "learning_rate": 4.2909423604757546e-05, + "loss": 1.8115, + "step": 470 + }, + { + "epoch": 0.03225142386605366, + "eval_accuracy": 0.5854921082476188, + "eval_loss": 1.8196277618408203, + "eval_runtime": 2196.4836, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.676, + "step": 470 + }, + { + "epoch": 0.03293762437384204, + "grad_norm": 1.1489847898483276, + "learning_rate": 4.382433668801464e-05, + "loss": 1.8195, + "step": 480 + }, + { + "epoch": 0.03293762437384204, + "eval_accuracy": 0.5857865387109324, + "eval_loss": 1.8216103315353394, + "eval_runtime": 2195.3397, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 480 + }, + { + "epoch": 0.03362382488163041, + "grad_norm": 1.2842459678649902, + "learning_rate": 4.473924977127173e-05, + "loss": 1.7904, + "step": 490 + }, + { + "epoch": 0.03362382488163041, + "eval_accuracy": 0.5854642190291723, + "eval_loss": 1.823735237121582, + "eval_runtime": 2195.5222, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 490 + }, + { + "epoch": 0.03431002538941879, + "grad_norm": 1.1588455438613892, + "learning_rate": 4.565416285452882e-05, + "loss": 1.8076, + "step": 500 + }, + { + "epoch": 0.03431002538941879, + "eval_accuracy": 0.584858989787433, + "eval_loss": 1.82528555393219, + "eval_runtime": 2196.6782, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 500 + }, + { + "epoch": 0.03499622589720716, + "grad_norm": 1.0610827207565308, + "learning_rate": 4.656907593778591e-05, + "loss": 1.7858, + "step": 510 + }, + { + "epoch": 0.03499622589720716, + "eval_accuracy": 0.5846732608731596, + "eval_loss": 1.8272473812103271, + "eval_runtime": 2195.3692, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 510 + }, + { + "epoch": 0.03568242640499554, + "grad_norm": 1.1871790885925293, + "learning_rate": 4.7483989021043e-05, + "loss": 1.8204, + "step": 520 + }, + { + "epoch": 0.03568242640499554, + "eval_accuracy": 0.5842074113207517, + "eval_loss": 1.829712986946106, + "eval_runtime": 2195.2548, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 520 + }, + { + "epoch": 0.03636862691278391, + "grad_norm": 1.1551142930984497, + "learning_rate": 4.8398902104300094e-05, + "loss": 1.8692, + "step": 530 + }, + { + "epoch": 0.03636862691278391, + "eval_accuracy": 0.5840751699432846, + "eval_loss": 1.8313241004943848, + "eval_runtime": 2196.5372, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.676, + "step": 530 + }, + { + "epoch": 0.03705482742057229, + "grad_norm": 1.1456727981567383, + "learning_rate": 4.931381518755718e-05, + "loss": 1.8427, + "step": 540 + }, + { + "epoch": 0.03705482742057229, + "eval_accuracy": 0.5839454850775084, + "eval_loss": 1.8317539691925049, + "eval_runtime": 2197.0175, + "eval_samples_per_second": 13.401, + "eval_steps_per_second": 1.675, + "step": 540 + }, + { + "epoch": 0.03774102792836067, + "grad_norm": 1.045682668685913, + "learning_rate": 5.0228728270814274e-05, + "loss": 1.8425, + "step": 550 + }, + { + "epoch": 0.03774102792836067, + "eval_accuracy": 0.5831079120849495, + "eval_loss": 1.83717679977417, + "eval_runtime": 2195.5341, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 550 + }, + { + "epoch": 0.03842722843614904, + "grad_norm": 1.0730020999908447, + "learning_rate": 5.114364135407137e-05, + "loss": 1.7764, + "step": 560 + }, + { + "epoch": 0.03842722843614904, + "eval_accuracy": 0.5835356463721473, + "eval_loss": 1.835798978805542, + "eval_runtime": 2195.2526, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 560 + }, + { + "epoch": 0.03911342894393742, + "grad_norm": 1.0931340456008911, + "learning_rate": 5.2058554437328455e-05, + "loss": 1.8409, + "step": 570 + }, + { + "epoch": 0.03911342894393742, + "eval_accuracy": 0.5838007599546415, + "eval_loss": 1.8357644081115723, + "eval_runtime": 2196.814, + "eval_samples_per_second": 13.402, + "eval_steps_per_second": 1.676, + "step": 570 + }, + { + "epoch": 0.03979962945172579, + "grad_norm": 1.0866916179656982, + "learning_rate": 5.297346752058555e-05, + "loss": 1.8691, + "step": 580 + }, + { + "epoch": 0.03979962945172579, + "eval_accuracy": 0.5829323428145388, + "eval_loss": 1.8371539115905762, + "eval_runtime": 2195.2154, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 580 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 1.0397108793258667, + "learning_rate": 5.3888380603842635e-05, + "loss": 1.8297, + "step": 590 + }, + { + "epoch": 0.04048582995951417, + "eval_accuracy": 0.582625661015979, + "eval_loss": 1.8405317068099976, + "eval_runtime": 2196.6833, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 590 + }, + { + "epoch": 0.04117203046730255, + "grad_norm": 1.045896291732788, + "learning_rate": 5.4803293687099735e-05, + "loss": 1.846, + "step": 600 + }, + { + "epoch": 0.04117203046730255, + "eval_accuracy": 0.582842034869093, + "eval_loss": 1.83939790725708, + "eval_runtime": 2195.4627, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 600 + }, + { + "epoch": 0.04185823097509092, + "grad_norm": 1.009037733078003, + "learning_rate": 5.5718206770356815e-05, + "loss": 1.8152, + "step": 610 + }, + { + "epoch": 0.04185823097509092, + "eval_accuracy": 0.5820976251467255, + "eval_loss": 1.8433318138122559, + "eval_runtime": 2195.2269, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 610 + }, + { + "epoch": 0.0425444314828793, + "grad_norm": 1.0480693578720093, + "learning_rate": 5.6633119853613916e-05, + "loss": 1.8718, + "step": 620 + }, + { + "epoch": 0.0425444314828793, + "eval_accuracy": 0.5822570585121779, + "eval_loss": 1.8436963558197021, + "eval_runtime": 2195.2658, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 620 + }, + { + "epoch": 0.04323063199066767, + "grad_norm": 1.1000436544418335, + "learning_rate": 5.7548032936870996e-05, + "loss": 1.8646, + "step": 630 + }, + { + "epoch": 0.04323063199066767, + "eval_accuracy": 0.5819806564365029, + "eval_loss": 1.8459354639053345, + "eval_runtime": 2195.2676, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 630 + }, + { + "epoch": 0.04391683249845605, + "grad_norm": 1.154166579246521, + "learning_rate": 5.8462946020128096e-05, + "loss": 1.8593, + "step": 640 + }, + { + "epoch": 0.04391683249845605, + "eval_accuracy": 0.582069735928279, + "eval_loss": 1.847158670425415, + "eval_runtime": 2196.3163, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 640 + }, + { + "epoch": 0.04460303300624442, + "grad_norm": 1.0694483518600464, + "learning_rate": 5.937785910338518e-05, + "loss": 1.8511, + "step": 650 + }, + { + "epoch": 0.04460303300624442, + "eval_accuracy": 0.5816468822543094, + "eval_loss": 1.8503717184066772, + "eval_runtime": 2195.3338, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 650 + }, + { + "epoch": 0.0452892335140328, + "grad_norm": 1.0069674253463745, + "learning_rate": 6.029277218664228e-05, + "loss": 1.8305, + "step": 660 + }, + { + "epoch": 0.0452892335140328, + "eval_accuracy": 0.5812435178318018, + "eval_loss": 1.851124882698059, + "eval_runtime": 2196.3471, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 660 + }, + { + "epoch": 0.04597543402182118, + "grad_norm": 0.93955397605896, + "learning_rate": 6.120768526989936e-05, + "loss": 1.8284, + "step": 670 + }, + { + "epoch": 0.04597543402182118, + "eval_accuracy": 0.5811864777397887, + "eval_loss": 1.8510762453079224, + "eval_runtime": 2196.11, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 670 + }, + { + "epoch": 0.04666163452960955, + "grad_norm": 0.9924960732460022, + "learning_rate": 6.212259835315646e-05, + "loss": 1.8664, + "step": 680 + }, + { + "epoch": 0.04666163452960955, + "eval_accuracy": 0.5802622489613424, + "eval_loss": 1.854859709739685, + "eval_runtime": 2196.6759, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 680 + }, + { + "epoch": 0.04734783503739793, + "grad_norm": 1.0502827167510986, + "learning_rate": 6.303751143641354e-05, + "loss": 1.8518, + "step": 690 + }, + { + "epoch": 0.04734783503739793, + "eval_accuracy": 0.5797041657793579, + "eval_loss": 1.8587929010391235, + "eval_runtime": 2195.6371, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.677, + "step": 690 + }, + { + "epoch": 0.0480340355451863, + "grad_norm": 0.9641822576522827, + "learning_rate": 6.395242451967064e-05, + "loss": 1.8469, + "step": 700 + }, + { + "epoch": 0.0480340355451863, + "eval_accuracy": 0.5796691714504977, + "eval_loss": 1.859383225440979, + "eval_runtime": 2195.5862, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 700 + }, + { + "epoch": 0.04872023605297468, + "grad_norm": 1.0197440385818481, + "learning_rate": 6.486733760292772e-05, + "loss": 1.871, + "step": 710 + }, + { + "epoch": 0.04872023605297468, + "eval_accuracy": 0.57873611108621, + "eval_loss": 1.8629941940307617, + "eval_runtime": 2195.7373, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 710 + }, + { + "epoch": 0.04940643656076305, + "grad_norm": 1.0116243362426758, + "learning_rate": 6.578225068618482e-05, + "loss": 1.8012, + "step": 720 + }, + { + "epoch": 0.04940643656076305, + "eval_accuracy": 0.5800620774160878, + "eval_loss": 1.8581833839416504, + "eval_runtime": 2195.6346, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.677, + "step": 720 + }, + { + "epoch": 0.05009263706855143, + "grad_norm": 1.015205979347229, + "learning_rate": 6.669716376944191e-05, + "loss": 1.8966, + "step": 730 + }, + { + "epoch": 0.05009263706855143, + "eval_accuracy": 0.5792581042914667, + "eval_loss": 1.8612667322158813, + "eval_runtime": 2195.4078, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 730 + }, + { + "epoch": 0.05077883757633981, + "grad_norm": 1.031814694404602, + "learning_rate": 6.7612076852699e-05, + "loss": 1.8505, + "step": 740 + }, + { + "epoch": 0.05077883757633981, + "eval_accuracy": 0.5786211344630193, + "eval_loss": 1.8662667274475098, + "eval_runtime": 2195.1361, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 740 + }, + { + "epoch": 0.05146503808412818, + "grad_norm": 0.9193501472473145, + "learning_rate": 6.852698993595609e-05, + "loss": 1.8415, + "step": 750 + }, + { + "epoch": 0.05146503808412818, + "eval_accuracy": 0.5790159329112897, + "eval_loss": 1.8643760681152344, + "eval_runtime": 2196.5278, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.676, + "step": 750 + }, + { + "epoch": 0.05215123859191656, + "grad_norm": 0.9670405387878418, + "learning_rate": 6.944190301921317e-05, + "loss": 1.9061, + "step": 760 + }, + { + "epoch": 0.05215123859191656, + "eval_accuracy": 0.5785704026466072, + "eval_loss": 1.8654682636260986, + "eval_runtime": 2196.4101, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 760 + }, + { + "epoch": 0.05283743909970493, + "grad_norm": 0.9446126222610474, + "learning_rate": 7.035681610247027e-05, + "loss": 1.8452, + "step": 770 + }, + { + "epoch": 0.05283743909970493, + "eval_accuracy": 0.5786606109877013, + "eval_loss": 1.8661186695098877, + "eval_runtime": 2196.1052, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 770 + }, + { + "epoch": 0.05352363960749331, + "grad_norm": 0.9109969139099121, + "learning_rate": 7.127172918572736e-05, + "loss": 1.8865, + "step": 780 + }, + { + "epoch": 0.05352363960749331, + "eval_accuracy": 0.5784766749517566, + "eval_loss": 1.8670191764831543, + "eval_runtime": 2196.1816, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 780 + }, + { + "epoch": 0.05420984011528168, + "grad_norm": 1.0019862651824951, + "learning_rate": 7.218664226898445e-05, + "loss": 1.8181, + "step": 790 + }, + { + "epoch": 0.05420984011528168, + "eval_accuracy": 0.5783223214082356, + "eval_loss": 1.8682514429092407, + "eval_runtime": 2196.2402, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 790 + }, + { + "epoch": 0.05489604062307006, + "grad_norm": 0.9845278859138489, + "learning_rate": 7.310155535224153e-05, + "loss": 1.8274, + "step": 800 + }, + { + "epoch": 0.05489604062307006, + "eval_accuracy": 0.5783366312334146, + "eval_loss": 1.8687405586242676, + "eval_runtime": 2197.5837, + "eval_samples_per_second": 13.397, + "eval_steps_per_second": 1.675, + "step": 800 + }, + { + "epoch": 0.05558224113085844, + "grad_norm": 0.9345710873603821, + "learning_rate": 7.401646843549863e-05, + "loss": 1.8175, + "step": 810 + }, + { + "epoch": 0.05558224113085844, + "eval_accuracy": 0.578313921441251, + "eval_loss": 1.870363712310791, + "eval_runtime": 2195.3953, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 810 + }, + { + "epoch": 0.05626844163864681, + "grad_norm": 0.9270123839378357, + "learning_rate": 7.493138151875572e-05, + "loss": 1.8407, + "step": 820 + }, + { + "epoch": 0.05626844163864681, + "eval_accuracy": 0.5773186415586673, + "eval_loss": 1.8754173517227173, + "eval_runtime": 2197.5914, + "eval_samples_per_second": 13.397, + "eval_steps_per_second": 1.675, + "step": 820 + }, + { + "epoch": 0.05695464214643519, + "grad_norm": 0.9666882753372192, + "learning_rate": 7.584629460201281e-05, + "loss": 1.8148, + "step": 830 + }, + { + "epoch": 0.05695464214643519, + "eval_accuracy": 0.5775507196978827, + "eval_loss": 1.8739426136016846, + "eval_runtime": 2198.1195, + "eval_samples_per_second": 13.394, + "eval_steps_per_second": 1.675, + "step": 830 + }, + { + "epoch": 0.05764084265422356, + "grad_norm": 0.9308087229728699, + "learning_rate": 7.67612076852699e-05, + "loss": 1.8673, + "step": 840 + }, + { + "epoch": 0.05764084265422356, + "eval_accuracy": 0.5774692101368278, + "eval_loss": 1.8749210834503174, + "eval_runtime": 2197.1707, + "eval_samples_per_second": 13.4, + "eval_steps_per_second": 1.675, + "step": 840 + }, + { + "epoch": 0.05832704316201194, + "grad_norm": 1.0570358037948608, + "learning_rate": 7.7676120768527e-05, + "loss": 1.8628, + "step": 850 + }, + { + "epoch": 0.05832704316201194, + "eval_accuracy": 0.577259841789776, + "eval_loss": 1.87544846534729, + "eval_runtime": 2196.2567, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 850 + }, + { + "epoch": 0.05901324366980031, + "grad_norm": 0.9226791262626648, + "learning_rate": 7.859103385178408e-05, + "loss": 1.8726, + "step": 860 + }, + { + "epoch": 0.05901324366980031, + "eval_accuracy": 0.5769537908187763, + "eval_loss": 1.8771995306015015, + "eval_runtime": 2196.3678, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 860 + }, + { + "epoch": 0.05969944417758869, + "grad_norm": 0.8840520977973938, + "learning_rate": 7.950594693504118e-05, + "loss": 1.8312, + "step": 870 + }, + { + "epoch": 0.05969944417758869, + "eval_accuracy": 0.576975570970325, + "eval_loss": 1.8782720565795898, + "eval_runtime": 2197.3135, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.675, + "step": 870 + }, + { + "epoch": 0.06038564468537707, + "grad_norm": 0.9905896782875061, + "learning_rate": 8.042086001829825e-05, + "loss": 1.9177, + "step": 880 + }, + { + "epoch": 0.06038564468537707, + "eval_accuracy": 0.5767803464411996, + "eval_loss": 1.877514123916626, + "eval_runtime": 2195.3275, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 880 + }, + { + "epoch": 0.06107184519316544, + "grad_norm": 1.012568712234497, + "learning_rate": 8.133577310155535e-05, + "loss": 1.9192, + "step": 890 + }, + { + "epoch": 0.06107184519316544, + "eval_accuracy": 0.5767326359567858, + "eval_loss": 1.8775851726531982, + "eval_runtime": 2196.0616, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 890 + }, + { + "epoch": 0.06175804570095382, + "grad_norm": 0.9244153499603271, + "learning_rate": 8.225068618481244e-05, + "loss": 1.8696, + "step": 900 + }, + { + "epoch": 0.06175804570095382, + "eval_accuracy": 0.5768811460450133, + "eval_loss": 1.8783646821975708, + "eval_runtime": 2195.2053, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 900 + }, + { + "epoch": 0.06244424620874219, + "grad_norm": 0.8586702346801758, + "learning_rate": 8.316559926806954e-05, + "loss": 1.902, + "step": 910 + }, + { + "epoch": 0.06244424620874219, + "eval_accuracy": 0.5765674255389409, + "eval_loss": 1.8812183141708374, + "eval_runtime": 2196.1137, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 910 + }, + { + "epoch": 0.06313044671653056, + "grad_norm": 0.9296178221702576, + "learning_rate": 8.408051235132663e-05, + "loss": 1.8953, + "step": 920 + }, + { + "epoch": 0.06313044671653056, + "eval_accuracy": 0.5759858689314306, + "eval_loss": 1.882770299911499, + "eval_runtime": 2196.7044, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 920 + }, + { + "epoch": 0.06381664722431894, + "grad_norm": 0.8966830968856812, + "learning_rate": 8.499542543458372e-05, + "loss": 1.8931, + "step": 930 + }, + { + "epoch": 0.06381664722431894, + "eval_accuracy": 0.5755770926724864, + "eval_loss": 1.8848005533218384, + "eval_runtime": 2195.3041, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 930 + }, + { + "epoch": 0.06450284773210732, + "grad_norm": 1.0561726093292236, + "learning_rate": 8.59103385178408e-05, + "loss": 1.8726, + "step": 940 + }, + { + "epoch": 0.06450284773210732, + "eval_accuracy": 0.5759230185855744, + "eval_loss": 1.8830986022949219, + "eval_runtime": 2196.3236, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 940 + }, + { + "epoch": 0.0651890482398957, + "grad_norm": 0.8631952404975891, + "learning_rate": 8.68252516010979e-05, + "loss": 1.8391, + "step": 950 + }, + { + "epoch": 0.0651890482398957, + "eval_accuracy": 0.5761208660292918, + "eval_loss": 1.8824081420898438, + "eval_runtime": 2195.5043, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 950 + }, + { + "epoch": 0.06587524874768408, + "grad_norm": 0.8912535905838013, + "learning_rate": 8.774016468435499e-05, + "loss": 1.8492, + "step": 960 + }, + { + "epoch": 0.06587524874768408, + "eval_accuracy": 0.575300856604064, + "eval_loss": 1.8885464668273926, + "eval_runtime": 2195.4665, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 960 + }, + { + "epoch": 0.06656144925547244, + "grad_norm": 0.941037654876709, + "learning_rate": 8.865507776761208e-05, + "loss": 1.9497, + "step": 970 + }, + { + "epoch": 0.06656144925547244, + "eval_accuracy": 0.5762686456856077, + "eval_loss": 1.8846741914749146, + "eval_runtime": 2195.3271, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 970 + }, + { + "epoch": 0.06724764976326082, + "grad_norm": 0.9157822728157043, + "learning_rate": 8.956999085086916e-05, + "loss": 1.8173, + "step": 980 + }, + { + "epoch": 0.06724764976326082, + "eval_accuracy": 0.5756494718346451, + "eval_loss": 1.8873111009597778, + "eval_runtime": 2195.0363, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 980 + }, + { + "epoch": 0.0679338502710492, + "grad_norm": 0.7956060767173767, + "learning_rate": 9.048490393412626e-05, + "loss": 1.8227, + "step": 990 + }, + { + "epoch": 0.0679338502710492, + "eval_accuracy": 0.5754663658349637, + "eval_loss": 1.8905543088912964, + "eval_runtime": 2196.6945, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 990 + }, + { + "epoch": 0.06862005077883758, + "grad_norm": 0.8911742568016052, + "learning_rate": 9.139981701738335e-05, + "loss": 1.8767, + "step": 1000 + }, + { + "epoch": 0.06862005077883758, + "eval_accuracy": 0.5748883285812098, + "eval_loss": 1.8892529010772705, + "eval_runtime": 2195.3255, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1000 + }, + { + "epoch": 0.06930625128662596, + "grad_norm": 0.8083513379096985, + "learning_rate": 9.231473010064045e-05, + "loss": 1.9234, + "step": 1010 + }, + { + "epoch": 0.06930625128662596, + "eval_accuracy": 0.5748815222838508, + "eval_loss": 1.8903882503509521, + "eval_runtime": 2195.4835, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1010 + }, + { + "epoch": 0.06999245179441432, + "grad_norm": 0.8500548005104065, + "learning_rate": 9.322964318389752e-05, + "loss": 1.8992, + "step": 1020 + }, + { + "epoch": 0.06999245179441432, + "eval_accuracy": 0.5743459496853266, + "eval_loss": 1.8903559446334839, + "eval_runtime": 2195.1026, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 1020 + }, + { + "epoch": 0.0706786523022027, + "grad_norm": 0.8529781699180603, + "learning_rate": 9.414455626715462e-05, + "loss": 1.9097, + "step": 1030 + }, + { + "epoch": 0.0706786523022027, + "eval_accuracy": 0.574473210845214, + "eval_loss": 1.8921775817871094, + "eval_runtime": 2196.6371, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 1030 + }, + { + "epoch": 0.07136485280999108, + "grad_norm": 0.985618531703949, + "learning_rate": 9.505946935041171e-05, + "loss": 1.8252, + "step": 1040 + }, + { + "epoch": 0.07136485280999108, + "eval_accuracy": 0.5746204260768708, + "eval_loss": 1.8922775983810425, + "eval_runtime": 2195.2982, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1040 + }, + { + "epoch": 0.07205105331777946, + "grad_norm": 0.8113195896148682, + "learning_rate": 9.597438243366881e-05, + "loss": 1.8873, + "step": 1050 + }, + { + "epoch": 0.07205105331777946, + "eval_accuracy": 0.5742100893497516, + "eval_loss": 1.8929826021194458, + "eval_runtime": 2195.2211, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 1050 + }, + { + "epoch": 0.07273725382556782, + "grad_norm": 0.848109245300293, + "learning_rate": 9.68892955169259e-05, + "loss": 1.8595, + "step": 1060 + }, + { + "epoch": 0.07273725382556782, + "eval_accuracy": 0.5738744890877788, + "eval_loss": 1.896013855934143, + "eval_runtime": 2195.349, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1060 + }, + { + "epoch": 0.0734234543333562, + "grad_norm": 0.8576720356941223, + "learning_rate": 9.780420860018299e-05, + "loss": 1.8415, + "step": 1070 + }, + { + "epoch": 0.0734234543333562, + "eval_accuracy": 0.573847263898343, + "eval_loss": 1.896903395652771, + "eval_runtime": 2195.4292, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1070 + }, + { + "epoch": 0.07410965484114458, + "grad_norm": 0.8684922456741333, + "learning_rate": 9.871912168344007e-05, + "loss": 1.8104, + "step": 1080 + }, + { + "epoch": 0.07410965484114458, + "eval_accuracy": 0.5738449397968058, + "eval_loss": 1.895707607269287, + "eval_runtime": 2195.3809, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1080 + }, + { + "epoch": 0.07479585534893296, + "grad_norm": 0.8526385426521301, + "learning_rate": 9.963403476669717e-05, + "loss": 1.8955, + "step": 1090 + }, + { + "epoch": 0.07479585534893296, + "eval_accuracy": 0.5738943767566472, + "eval_loss": 1.895605444908142, + "eval_runtime": 2195.5152, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1090 + }, + { + "epoch": 0.07548205585672134, + "grad_norm": 0.7912049889564514, + "learning_rate": 0.00010054894784995426, + "loss": 1.8411, + "step": 1100 + }, + { + "epoch": 0.07548205585672134, + "eval_accuracy": 0.5737731582607566, + "eval_loss": 1.8975555896759033, + "eval_runtime": 2196.2407, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 1100 + }, + { + "epoch": 0.0761682563645097, + "grad_norm": 0.8417245149612427, + "learning_rate": 0.00010146386093321135, + "loss": 1.8934, + "step": 1110 + }, + { + "epoch": 0.0761682563645097, + "eval_accuracy": 0.5735296588225584, + "eval_loss": 1.8995074033737183, + "eval_runtime": 2195.4727, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1110 + }, + { + "epoch": 0.07685445687229808, + "grad_norm": 0.8422715663909912, + "learning_rate": 0.00010237877401646845, + "loss": 1.872, + "step": 1120 + }, + { + "epoch": 0.07685445687229808, + "eval_accuracy": 0.5736483208067581, + "eval_loss": 1.8992631435394287, + "eval_runtime": 2195.5393, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1120 + }, + { + "epoch": 0.07754065738008646, + "grad_norm": 0.924197793006897, + "learning_rate": 0.00010329368709972553, + "loss": 1.8788, + "step": 1130 + }, + { + "epoch": 0.07754065738008646, + "eval_accuracy": 0.573535568680753, + "eval_loss": 1.8997005224227905, + "eval_runtime": 2195.1245, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 1130 + }, + { + "epoch": 0.07822685788787484, + "grad_norm": 0.8739652037620544, + "learning_rate": 0.00010420860018298262, + "loss": 1.8611, + "step": 1140 + }, + { + "epoch": 0.07822685788787484, + "eval_accuracy": 0.5737993874066766, + "eval_loss": 1.897829532623291, + "eval_runtime": 2194.8865, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 1140 + }, + { + "epoch": 0.07891305839566322, + "grad_norm": 0.9717639684677124, + "learning_rate": 0.00010512351326623971, + "loss": 1.9323, + "step": 1150 + }, + { + "epoch": 0.07891305839566322, + "eval_accuracy": 0.5726149588604147, + "eval_loss": 1.9035004377365112, + "eval_runtime": 2195.2968, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1150 + }, + { + "epoch": 0.07959925890345158, + "grad_norm": 0.9229555130004883, + "learning_rate": 0.00010603842634949681, + "loss": 1.892, + "step": 1160 + }, + { + "epoch": 0.07959925890345158, + "eval_accuracy": 0.5725945399683378, + "eval_loss": 1.903469204902649, + "eval_runtime": 2195.7477, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1160 + }, + { + "epoch": 0.08028545941123996, + "grad_norm": 0.8387541174888611, + "learning_rate": 0.0001069533394327539, + "loss": 1.8585, + "step": 1170 + }, + { + "epoch": 0.08028545941123996, + "eval_accuracy": 0.5729163616283399, + "eval_loss": 1.9036465883255005, + "eval_runtime": 2195.4236, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1170 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 0.8355444073677063, + "learning_rate": 0.00010786825251601098, + "loss": 1.8718, + "step": 1180 + }, + { + "epoch": 0.08097165991902834, + "eval_accuracy": 0.5726727293843396, + "eval_loss": 1.9039238691329956, + "eval_runtime": 2195.5829, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1180 + }, + { + "epoch": 0.08165786042681672, + "grad_norm": 0.8180125951766968, + "learning_rate": 0.00010878316559926807, + "loss": 1.9032, + "step": 1190 + }, + { + "epoch": 0.08165786042681672, + "eval_accuracy": 0.5721860957238989, + "eval_loss": 1.906183123588562, + "eval_runtime": 2195.35, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1190 + }, + { + "epoch": 0.0823440609346051, + "grad_norm": 0.7993365526199341, + "learning_rate": 0.00010969807868252517, + "loss": 1.8526, + "step": 1200 + }, + { + "epoch": 0.0823440609346051, + "eval_accuracy": 0.5722786613679808, + "eval_loss": 1.9059474468231201, + "eval_runtime": 2195.3016, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1200 + }, + { + "epoch": 0.08303026144239346, + "grad_norm": 1.0333908796310425, + "learning_rate": 0.00011061299176578226, + "loss": 1.8649, + "step": 1210 + }, + { + "epoch": 0.08303026144239346, + "eval_accuracy": 0.570262403680102, + "eval_loss": 1.922554850578308, + "eval_runtime": 2195.4851, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1210 + }, + { + "epoch": 0.08371646195018184, + "grad_norm": 1.1051403284072876, + "learning_rate": 0.00011143641354071363, + "loss": 1.9549, + "step": 1220 + }, + { + "epoch": 0.08371646195018184, + "eval_accuracy": 0.5712613025207935, + "eval_loss": 1.9148966073989868, + "eval_runtime": 2195.4835, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1220 + }, + { + "epoch": 0.08440266245797022, + "grad_norm": 0.8354151248931885, + "learning_rate": 0.00011235132662397072, + "loss": 1.9199, + "step": 1230 + }, + { + "epoch": 0.08440266245797022, + "eval_accuracy": 0.5723187355187723, + "eval_loss": 1.9058703184127808, + "eval_runtime": 2195.1846, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 1230 + }, + { + "epoch": 0.0850888629657586, + "grad_norm": 0.7311375737190247, + "learning_rate": 0.00011317474839890212, + "loss": 1.8916, + "step": 1240 + }, + { + "epoch": 0.0850888629657586, + "eval_accuracy": 0.572127262753557, + "eval_loss": 1.9079400300979614, + "eval_runtime": 2195.7032, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1240 + }, + { + "epoch": 0.08577506347354696, + "grad_norm": 0.8696947693824768, + "learning_rate": 0.00011408966148215921, + "loss": 1.9645, + "step": 1250 + }, + { + "epoch": 0.08577506347354696, + "eval_accuracy": 0.5722400480810126, + "eval_loss": 1.9090999364852905, + "eval_runtime": 2195.3879, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1250 + }, + { + "epoch": 0.08646126398133534, + "grad_norm": 0.9006031155586243, + "learning_rate": 0.0001150045745654163, + "loss": 1.919, + "step": 1260 + }, + { + "epoch": 0.08646126398133534, + "eval_accuracy": 0.5701970632254558, + "eval_loss": 1.9196306467056274, + "eval_runtime": 2195.6859, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1260 + }, + { + "epoch": 0.08714746448912372, + "grad_norm": 1.0175645351409912, + "learning_rate": 0.00011591948764867338, + "loss": 1.8906, + "step": 1270 + }, + { + "epoch": 0.08714746448912372, + "eval_accuracy": 0.571257019533675, + "eval_loss": 1.9163960218429565, + "eval_runtime": 2195.6701, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1270 + }, + { + "epoch": 0.0878336649969121, + "grad_norm": 0.8517598509788513, + "learning_rate": 0.00011683440073193048, + "loss": 1.9111, + "step": 1280 + }, + { + "epoch": 0.0878336649969121, + "eval_accuracy": 0.5712075161709325, + "eval_loss": 1.9151411056518555, + "eval_runtime": 2196.2166, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 1280 + }, + { + "epoch": 0.08851986550470048, + "grad_norm": 0.8764553070068359, + "learning_rate": 0.00011774931381518757, + "loss": 1.9597, + "step": 1290 + }, + { + "epoch": 0.08851986550470048, + "eval_accuracy": 0.5705109497387809, + "eval_loss": 1.9204518795013428, + "eval_runtime": 2196.4778, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.676, + "step": 1290 + }, + { + "epoch": 0.08920606601248884, + "grad_norm": 0.8551300764083862, + "learning_rate": 0.00011866422689844465, + "loss": 1.885, + "step": 1300 + }, + { + "epoch": 0.08920606601248884, + "eval_accuracy": 0.5686897173713243, + "eval_loss": 1.9303290843963623, + "eval_runtime": 2195.3977, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1300 + }, + { + "epoch": 0.08989226652027722, + "grad_norm": 0.8597418069839478, + "learning_rate": 0.00011957913998170174, + "loss": 1.9086, + "step": 1310 + }, + { + "epoch": 0.08989226652027722, + "eval_accuracy": 0.5694771229721302, + "eval_loss": 1.9275456666946411, + "eval_runtime": 2195.6823, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1310 + }, + { + "epoch": 0.0905784670280656, + "grad_norm": 1.0071135759353638, + "learning_rate": 0.00012049405306495884, + "loss": 1.9019, + "step": 1320 + }, + { + "epoch": 0.0905784670280656, + "eval_accuracy": 0.5700387587093215, + "eval_loss": 1.92341947555542, + "eval_runtime": 2195.6668, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1320 + }, + { + "epoch": 0.09126466753585398, + "grad_norm": 0.7982167601585388, + "learning_rate": 0.00012140896614821593, + "loss": 1.9072, + "step": 1330 + }, + { + "epoch": 0.09126466753585398, + "eval_accuracy": 0.5695152714387909, + "eval_loss": 1.9324274063110352, + "eval_runtime": 2196.4255, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 1330 + }, + { + "epoch": 0.09195086804364236, + "grad_norm": 0.8429548144340515, + "learning_rate": 0.000122323879231473, + "loss": 1.9473, + "step": 1340 + }, + { + "epoch": 0.09195086804364236, + "eval_accuracy": 0.5696431634262383, + "eval_loss": 1.923658013343811, + "eval_runtime": 2196.8488, + "eval_samples_per_second": 13.402, + "eval_steps_per_second": 1.676, + "step": 1340 + }, + { + "epoch": 0.09263706855143072, + "grad_norm": 0.8137506246566772, + "learning_rate": 0.0001232387923147301, + "loss": 1.9171, + "step": 1350 + }, + { + "epoch": 0.09263706855143072, + "eval_accuracy": 0.5698193635242091, + "eval_loss": 1.9207695722579956, + "eval_runtime": 2195.6978, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1350 + }, + { + "epoch": 0.0933232690592191, + "grad_norm": 0.7542211413383484, + "learning_rate": 0.0001241537053979872, + "loss": 1.9515, + "step": 1360 + }, + { + "epoch": 0.0933232690592191, + "eval_accuracy": 0.5703601819519173, + "eval_loss": 1.9201329946517944, + "eval_runtime": 2196.079, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 1360 + }, + { + "epoch": 0.09400946956700748, + "grad_norm": 0.7821772694587708, + "learning_rate": 0.0001250686184812443, + "loss": 1.9078, + "step": 1370 + }, + { + "epoch": 0.09400946956700748, + "eval_accuracy": 0.5695221109376003, + "eval_loss": 1.9211959838867188, + "eval_runtime": 2195.4801, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1370 + }, + { + "epoch": 0.09469567007479586, + "grad_norm": 0.7843822240829468, + "learning_rate": 0.00012598353156450138, + "loss": 1.9052, + "step": 1380 + }, + { + "epoch": 0.09469567007479586, + "eval_accuracy": 0.5696008647782611, + "eval_loss": 1.9253789186477661, + "eval_runtime": 2195.2622, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 1380 + }, + { + "epoch": 0.09538187058258422, + "grad_norm": 0.7415681481361389, + "learning_rate": 0.00012689844464775846, + "loss": 1.9027, + "step": 1390 + }, + { + "epoch": 0.09538187058258422, + "eval_accuracy": 0.5689813257113427, + "eval_loss": 1.9291355609893799, + "eval_runtime": 2196.3808, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 1390 + }, + { + "epoch": 0.0960680710903726, + "grad_norm": 0.7906999588012695, + "learning_rate": 0.00012781335773101558, + "loss": 1.9165, + "step": 1400 + }, + { + "epoch": 0.0960680710903726, + "eval_accuracy": 0.5686311168111361, + "eval_loss": 1.929186463356018, + "eval_runtime": 2195.3777, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1400 + }, + { + "epoch": 0.09675427159816098, + "grad_norm": 0.7665316462516785, + "learning_rate": 0.00012872827081427266, + "loss": 1.8921, + "step": 1410 + }, + { + "epoch": 0.09675427159816098, + "eval_accuracy": 0.5691787415362032, + "eval_loss": 1.9266636371612549, + "eval_runtime": 2196.5486, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.676, + "step": 1410 + }, + { + "epoch": 0.09744047210594936, + "grad_norm": 0.7428321242332458, + "learning_rate": 0.00012964318389752975, + "loss": 2.0197, + "step": 1420 + }, + { + "epoch": 0.09744047210594936, + "eval_accuracy": 0.5681328958444599, + "eval_loss": 1.9324249029159546, + "eval_runtime": 2199.1199, + "eval_samples_per_second": 13.388, + "eval_steps_per_second": 1.674, + "step": 1420 + }, + { + "epoch": 0.09812667261373774, + "grad_norm": 0.7564303874969482, + "learning_rate": 0.00013055809698078684, + "loss": 1.9049, + "step": 1430 + }, + { + "epoch": 0.09812667261373774, + "eval_accuracy": 0.5684988754336691, + "eval_loss": 1.9301478862762451, + "eval_runtime": 2196.3543, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 1430 + }, + { + "epoch": 0.0988128731215261, + "grad_norm": 0.9298701286315918, + "learning_rate": 0.00013147301006404392, + "loss": 1.9086, + "step": 1440 + }, + { + "epoch": 0.0988128731215261, + "eval_accuracy": 0.5689413179634523, + "eval_loss": 1.9311435222625732, + "eval_runtime": 2195.5784, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1440 + }, + { + "epoch": 0.09949907362931448, + "grad_norm": 0.8423255681991577, + "learning_rate": 0.000132387923147301, + "loss": 1.902, + "step": 1450 + }, + { + "epoch": 0.09949907362931448, + "eval_accuracy": 0.5683401725001284, + "eval_loss": 1.9306671619415283, + "eval_runtime": 2195.5481, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1450 + }, + { + "epoch": 0.10018527413710286, + "grad_norm": 0.8051897883415222, + "learning_rate": 0.0001333028362305581, + "loss": 1.9863, + "step": 1460 + }, + { + "epoch": 0.10018527413710286, + "eval_accuracy": 0.568742474476219, + "eval_loss": 1.9322450160980225, + "eval_runtime": 2196.6508, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 1460 + }, + { + "epoch": 0.10087147464489124, + "grad_norm": 0.826680600643158, + "learning_rate": 0.00013421774931381518, + "loss": 1.8864, + "step": 1470 + }, + { + "epoch": 0.10087147464489124, + "eval_accuracy": 0.5676466606014257, + "eval_loss": 1.9348758459091187, + "eval_runtime": 2196.1461, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 1470 + }, + { + "epoch": 0.10155767515267962, + "grad_norm": 0.8538419604301453, + "learning_rate": 0.00013513266239707227, + "loss": 1.8823, + "step": 1480 + }, + { + "epoch": 0.10155767515267962, + "eval_accuracy": 0.5668430858942111, + "eval_loss": 1.942825436592102, + "eval_runtime": 2195.3646, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1480 + }, + { + "epoch": 0.10224387566046798, + "grad_norm": 0.7539196014404297, + "learning_rate": 0.00013604757548032939, + "loss": 1.9582, + "step": 1490 + }, + { + "epoch": 0.10224387566046798, + "eval_accuracy": 0.5680551380473151, + "eval_loss": 1.9357633590698242, + "eval_runtime": 2195.6893, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 1490 + }, + { + "epoch": 0.10293007616825636, + "grad_norm": 0.8606800436973572, + "learning_rate": 0.00013696248856358647, + "loss": 1.9724, + "step": 1500 + }, + { + "epoch": 0.10293007616825636, + "eval_accuracy": 0.5682361191541625, + "eval_loss": 1.9344924688339233, + "eval_runtime": 2196.3487, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 1500 + }, + { + "epoch": 0.10361627667604474, + "grad_norm": 1.327166199684143, + "learning_rate": 0.00013787740164684356, + "loss": 1.8999, + "step": 1510 + }, + { + "epoch": 0.10361627667604474, + "eval_accuracy": 0.5677454681182075, + "eval_loss": 1.9374973773956299, + "eval_runtime": 2196.3215, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 1510 + }, + { + "epoch": 0.10430247718383312, + "grad_norm": 0.7882415056228638, + "learning_rate": 0.00013879231473010065, + "loss": 1.939, + "step": 1520 + }, + { + "epoch": 0.10430247718383312, + "eval_accuracy": 0.5680506226500428, + "eval_loss": 1.9378398656845093, + "eval_runtime": 2195.9406, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 1520 + }, + { + "epoch": 0.1049886776916215, + "grad_norm": 0.8347277045249939, + "learning_rate": 0.00013970722781335773, + "loss": 1.9144, + "step": 1530 + }, + { + "epoch": 0.1049886776916215, + "eval_accuracy": 0.566825787938484, + "eval_loss": 1.9458295106887817, + "eval_runtime": 2196.2125, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 1530 + }, + { + "epoch": 0.10567487819940986, + "grad_norm": 0.9527395963668823, + "learning_rate": 0.00014062214089661482, + "loss": 1.9098, + "step": 1540 + }, + { + "epoch": 0.10567487819940986, + "eval_accuracy": 0.5678965015166755, + "eval_loss": 1.940508484840393, + "eval_runtime": 2195.4239, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1540 + }, + { + "epoch": 0.10636107870719824, + "grad_norm": 0.8285815715789795, + "learning_rate": 0.0001415370539798719, + "loss": 1.9036, + "step": 1550 + }, + { + "epoch": 0.10636107870719824, + "eval_accuracy": 0.567786471909614, + "eval_loss": 1.9401248693466187, + "eval_runtime": 2196.2023, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 1550 + }, + { + "epoch": 0.10704727921498662, + "grad_norm": 0.9443630576133728, + "learning_rate": 0.000142451967063129, + "loss": 2.0052, + "step": 1560 + }, + { + "epoch": 0.10704727921498662, + "eval_accuracy": 0.5666123026115664, + "eval_loss": 1.9421296119689941, + "eval_runtime": 2196.6048, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 1560 + }, + { + "epoch": 0.107733479722775, + "grad_norm": 0.8364240527153015, + "learning_rate": 0.0001433668801463861, + "loss": 1.8997, + "step": 1570 + }, + { + "epoch": 0.107733479722775, + "eval_accuracy": 0.5669286460322308, + "eval_loss": 1.941907286643982, + "eval_runtime": 2193.9823, + "eval_samples_per_second": 13.419, + "eval_steps_per_second": 1.678, + "step": 1570 + }, + { + "epoch": 0.10841968023056336, + "grad_norm": 0.7851582765579224, + "learning_rate": 0.0001442817932296432, + "loss": 1.9388, + "step": 1580 + }, + { + "epoch": 0.10841968023056336, + "eval_accuracy": 0.5667885691124382, + "eval_loss": 1.9420804977416992, + "eval_runtime": 2193.8438, + "eval_samples_per_second": 13.42, + "eval_steps_per_second": 1.678, + "step": 1580 + }, + { + "epoch": 0.10910588073835174, + "grad_norm": 1.1334728002548218, + "learning_rate": 0.00014519670631290028, + "loss": 1.9436, + "step": 1590 + }, + { + "epoch": 0.10910588073835174, + "eval_accuracy": 0.5664039303080305, + "eval_loss": 1.9469659328460693, + "eval_runtime": 2194.723, + "eval_samples_per_second": 13.415, + "eval_steps_per_second": 1.677, + "step": 1590 + }, + { + "epoch": 0.10979208124614012, + "grad_norm": 3.6574079990386963, + "learning_rate": 0.00014611161939615737, + "loss": 1.9346, + "step": 1600 + }, + { + "epoch": 0.10979208124614012, + "eval_accuracy": 0.5647517265252299, + "eval_loss": 1.9564319849014282, + "eval_runtime": 2193.9968, + "eval_samples_per_second": 13.419, + "eval_steps_per_second": 1.678, + "step": 1600 + }, + { + "epoch": 0.1104782817539285, + "grad_norm": 0.9866987466812134, + "learning_rate": 0.00014702653247941448, + "loss": 1.9583, + "step": 1610 + }, + { + "epoch": 0.1104782817539285, + "eval_accuracy": 0.5648301815528358, + "eval_loss": 1.9578943252563477, + "eval_runtime": 2195.1234, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 1610 + }, + { + "epoch": 0.11116448226171688, + "grad_norm": 0.7200923562049866, + "learning_rate": 0.00014794144556267154, + "loss": 1.9934, + "step": 1620 + }, + { + "epoch": 0.11116448226171688, + "eval_accuracy": 0.5669039109515848, + "eval_loss": 1.94541335105896, + "eval_runtime": 2195.0872, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 1620 + }, + { + "epoch": 0.11185068276950524, + "grad_norm": 0.8922938108444214, + "learning_rate": 0.00014885635864592863, + "loss": 1.9672, + "step": 1630 + }, + { + "epoch": 0.11185068276950524, + "eval_accuracy": 0.5670405681219726, + "eval_loss": 1.943771243095398, + "eval_runtime": 2195.4513, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1630 + }, + { + "epoch": 0.11253688327729362, + "grad_norm": 0.8861325979232788, + "learning_rate": 0.00014977127172918572, + "loss": 1.9451, + "step": 1640 + }, + { + "epoch": 0.11253688327729362, + "eval_accuracy": 0.5672741403264618, + "eval_loss": 1.9424519538879395, + "eval_runtime": 2195.3731, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1640 + }, + { + "epoch": 0.113223083785082, + "grad_norm": 0.7567145824432373, + "learning_rate": 0.00015068618481244283, + "loss": 1.9691, + "step": 1650 + }, + { + "epoch": 0.113223083785082, + "eval_accuracy": 0.565372195232763, + "eval_loss": 1.9486464262008667, + "eval_runtime": 2195.4507, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1650 + }, + { + "epoch": 0.11390928429287038, + "grad_norm": 0.7195202708244324, + "learning_rate": 0.00015160109789569992, + "loss": 1.9507, + "step": 1660 + }, + { + "epoch": 0.11390928429287038, + "eval_accuracy": 0.5668069627160327, + "eval_loss": 1.94415283203125, + "eval_runtime": 2195.4023, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1660 + }, + { + "epoch": 0.11459548480065876, + "grad_norm": 0.7501398324966431, + "learning_rate": 0.000152516010978957, + "loss": 1.9337, + "step": 1670 + }, + { + "epoch": 0.11459548480065876, + "eval_accuracy": 0.565814803769799, + "eval_loss": 1.9502369165420532, + "eval_runtime": 2195.5008, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 1670 + }, + { + "epoch": 0.11528168530844712, + "grad_norm": 5.4457292556762695, + "learning_rate": 0.0001534309240622141, + "loss": 1.964, + "step": 1680 + }, + { + "epoch": 0.11528168530844712, + "eval_accuracy": 0.5647974781240622, + "eval_loss": 1.9561764001846313, + "eval_runtime": 2195.9745, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 1680 + }, + { + "epoch": 0.1159678858162355, + "grad_norm": 0.7148681282997131, + "learning_rate": 0.0001543458371454712, + "loss": 1.9242, + "step": 1690 + }, + { + "epoch": 0.1159678858162355, + "eval_accuracy": 0.5624542193498984, + "eval_loss": 1.985127329826355, + "eval_runtime": 2192.8067, + "eval_samples_per_second": 13.427, + "eval_steps_per_second": 1.679, + "step": 1690 + }, + { + "epoch": 0.11665408632402388, + "grad_norm": 0.8697217702865601, + "learning_rate": 0.0001552607502287283, + "loss": 1.9982, + "step": 1700 + }, + { + "epoch": 0.11665408632402388, + "eval_accuracy": 0.5627469897406854, + "eval_loss": 1.975067377090454, + "eval_runtime": 2192.9867, + "eval_samples_per_second": 13.426, + "eval_steps_per_second": 1.679, + "step": 1700 + }, + { + "epoch": 0.11734028683181226, + "grad_norm": 0.8448213338851929, + "learning_rate": 0.00015617566331198538, + "loss": 2.0176, + "step": 1710 + }, + { + "epoch": 0.11734028683181226, + "eval_accuracy": 0.5627390877954589, + "eval_loss": 1.9716612100601196, + "eval_runtime": 2193.5512, + "eval_samples_per_second": 13.422, + "eval_steps_per_second": 1.678, + "step": 1710 + }, + { + "epoch": 0.11802648733960062, + "grad_norm": 3.248958110809326, + "learning_rate": 0.00015709057639524244, + "loss": 1.9987, + "step": 1720 + }, + { + "epoch": 0.11802648733960062, + "eval_accuracy": 0.5634107531397118, + "eval_loss": 1.962895154953003, + "eval_runtime": 2193.484, + "eval_samples_per_second": 13.422, + "eval_steps_per_second": 1.678, + "step": 1720 + }, + { + "epoch": 0.118712687847389, + "grad_norm": 0.709474503993988, + "learning_rate": 0.00015800548947849955, + "loss": 1.9584, + "step": 1730 + }, + { + "epoch": 0.118712687847389, + "eval_accuracy": 0.5648748043023502, + "eval_loss": 1.955528736114502, + "eval_runtime": 2192.7873, + "eval_samples_per_second": 13.427, + "eval_steps_per_second": 1.679, + "step": 1730 + }, + { + "epoch": 0.11939888835517738, + "grad_norm": 0.7776058912277222, + "learning_rate": 0.00015892040256175664, + "loss": 1.9028, + "step": 1740 + }, + { + "epoch": 0.11939888835517738, + "eval_accuracy": 0.5636472470718479, + "eval_loss": 1.9644140005111694, + "eval_runtime": 2193.2656, + "eval_samples_per_second": 13.424, + "eval_steps_per_second": 1.678, + "step": 1740 + }, + { + "epoch": 0.12008508886296576, + "grad_norm": 0.9440386891365051, + "learning_rate": 0.00015983531564501372, + "loss": 1.9697, + "step": 1750 + }, + { + "epoch": 0.12008508886296576, + "eval_accuracy": 0.5645730695199196, + "eval_loss": 1.9598476886749268, + "eval_runtime": 2192.0993, + "eval_samples_per_second": 13.431, + "eval_steps_per_second": 1.679, + "step": 1750 + }, + { + "epoch": 0.12077128937075414, + "grad_norm": 0.7116687297821045, + "learning_rate": 0.0001607502287282708, + "loss": 1.971, + "step": 1760 + }, + { + "epoch": 0.12077128937075414, + "eval_accuracy": 0.5648594652322046, + "eval_loss": 1.957185983657837, + "eval_runtime": 2191.728, + "eval_samples_per_second": 13.433, + "eval_steps_per_second": 1.679, + "step": 1760 + }, + { + "epoch": 0.1214574898785425, + "grad_norm": 1.1787147521972656, + "learning_rate": 0.00016166514181152793, + "loss": 1.9428, + "step": 1770 + }, + { + "epoch": 0.1214574898785425, + "eval_accuracy": 0.5637377542259968, + "eval_loss": 1.9630035161972046, + "eval_runtime": 2191.7408, + "eval_samples_per_second": 13.433, + "eval_steps_per_second": 1.679, + "step": 1770 + }, + { + "epoch": 0.12214369038633088, + "grad_norm": 0.7775124907493591, + "learning_rate": 0.000162580054894785, + "loss": 1.9251, + "step": 1780 + }, + { + "epoch": 0.12214369038633088, + "eval_accuracy": 0.5636699568640114, + "eval_loss": 1.962775468826294, + "eval_runtime": 2191.7029, + "eval_samples_per_second": 13.433, + "eval_steps_per_second": 1.68, + "step": 1780 + }, + { + "epoch": 0.12282989089411926, + "grad_norm": 0.7894787788391113, + "learning_rate": 0.0001634949679780421, + "loss": 1.9413, + "step": 1790 + }, + { + "epoch": 0.12282989089411926, + "eval_accuracy": 0.5643811651358474, + "eval_loss": 1.9617409706115723, + "eval_runtime": 2191.672, + "eval_samples_per_second": 13.434, + "eval_steps_per_second": 1.68, + "step": 1790 + }, + { + "epoch": 0.12351609140190764, + "grad_norm": 0.7652885317802429, + "learning_rate": 0.00016440988106129919, + "loss": 1.9768, + "step": 1800 + }, + { + "epoch": 0.12351609140190764, + "eval_accuracy": 0.5625105622114503, + "eval_loss": 1.9699786901474, + "eval_runtime": 2192.2587, + "eval_samples_per_second": 13.43, + "eval_steps_per_second": 1.679, + "step": 1800 + }, + { + "epoch": 0.12420229190969602, + "grad_norm": 0.7860700488090515, + "learning_rate": 0.00016532479414455627, + "loss": 1.9325, + "step": 1810 + }, + { + "epoch": 0.12420229190969602, + "eval_accuracy": 0.5627916456916503, + "eval_loss": 1.96902596950531, + "eval_runtime": 2194.8971, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 1810 + }, + { + "epoch": 0.12488849241748438, + "grad_norm": 0.6445093154907227, + "learning_rate": 0.00016623970722781336, + "loss": 1.8726, + "step": 1820 + }, + { + "epoch": 0.12488849241748438, + "eval_accuracy": 0.5629212641545254, + "eval_loss": 1.9668747186660767, + "eval_runtime": 2195.3596, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1820 + }, + { + "epoch": 0.12557469292527276, + "grad_norm": 0.7446492314338684, + "learning_rate": 0.00016715462031107045, + "loss": 1.9153, + "step": 1830 + }, + { + "epoch": 0.12557469292527276, + "eval_accuracy": 0.5622602564758932, + "eval_loss": 1.9712408781051636, + "eval_runtime": 2195.4086, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1830 + }, + { + "epoch": 0.12626089343306113, + "grad_norm": 0.6903027296066284, + "learning_rate": 0.00016806953339432753, + "loss": 1.9477, + "step": 1840 + }, + { + "epoch": 0.12626089343306113, + "eval_accuracy": 0.5628142226780117, + "eval_loss": 1.9705018997192383, + "eval_runtime": 2195.3561, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1840 + }, + { + "epoch": 0.12694709394084952, + "grad_norm": 0.6859719753265381, + "learning_rate": 0.00016898444647758465, + "loss": 1.956, + "step": 1850 + }, + { + "epoch": 0.12694709394084952, + "eval_accuracy": 0.5628691378771908, + "eval_loss": 1.9665277004241943, + "eval_runtime": 2196.295, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 1850 + }, + { + "epoch": 0.12763329444863789, + "grad_norm": 0.6945796012878418, + "learning_rate": 0.00016989935956084173, + "loss": 1.9526, + "step": 1860 + }, + { + "epoch": 0.12763329444863789, + "eval_accuracy": 0.5613948274663382, + "eval_loss": 1.9775313138961792, + "eval_runtime": 2195.3271, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 1860 + }, + { + "epoch": 0.12831949495642628, + "grad_norm": 0.692680299282074, + "learning_rate": 0.00017081427264409882, + "loss": 1.9693, + "step": 1870 + }, + { + "epoch": 0.12831949495642628, + "eval_accuracy": 0.5631267811333156, + "eval_loss": 1.9690190553665161, + "eval_runtime": 2196.3618, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 1870 + }, + { + "epoch": 0.12900569546421464, + "grad_norm": 0.9683026671409607, + "learning_rate": 0.0001717291857273559, + "loss": 1.9857, + "step": 1880 + }, + { + "epoch": 0.12900569546421464, + "eval_accuracy": 0.5625857302954538, + "eval_loss": 1.9720834493637085, + "eval_runtime": 2194.9732, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 1880 + }, + { + "epoch": 0.129691895972003, + "grad_norm": 0.9663318991661072, + "learning_rate": 0.00017264409881061302, + "loss": 1.982, + "step": 1890 + }, + { + "epoch": 0.129691895972003, + "eval_accuracy": 0.5605411517702714, + "eval_loss": 1.980215311050415, + "eval_runtime": 2195.0705, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 1890 + }, + { + "epoch": 0.1303780964797914, + "grad_norm": 0.9997875690460205, + "learning_rate": 0.00017355901189387008, + "loss": 1.9939, + "step": 1900 + }, + { + "epoch": 0.1303780964797914, + "eval_accuracy": 0.5622947527829953, + "eval_loss": 1.9729753732681274, + "eval_runtime": 2195.0496, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 1900 + }, + { + "epoch": 0.13106429698757976, + "grad_norm": 1.0369981527328491, + "learning_rate": 0.00017447392497712717, + "loss": 1.9709, + "step": 1910 + }, + { + "epoch": 0.13106429698757976, + "eval_accuracy": 0.5614108305654945, + "eval_loss": 1.9774330854415894, + "eval_runtime": 2195.0243, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 1910 + }, + { + "epoch": 0.13175049749536816, + "grad_norm": 0.7212566137313843, + "learning_rate": 0.00017538883806038425, + "loss": 2.0167, + "step": 1920 + }, + { + "epoch": 0.13175049749536816, + "eval_accuracy": 0.5607967697379137, + "eval_loss": 1.984305739402771, + "eval_runtime": 2194.9072, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 1920 + }, + { + "epoch": 0.13243669800315652, + "grad_norm": 0.709562361240387, + "learning_rate": 0.00017630375114364137, + "loss": 1.9576, + "step": 1930 + }, + { + "epoch": 0.13243669800315652, + "eval_accuracy": 0.5579354355296557, + "eval_loss": 2.008889675140381, + "eval_runtime": 2195.0187, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 1930 + }, + { + "epoch": 0.1331228985109449, + "grad_norm": 5.758970260620117, + "learning_rate": 0.00017721866422689846, + "loss": 2.0477, + "step": 1940 + }, + { + "epoch": 0.1331228985109449, + "eval_accuracy": 0.5608241941360528, + "eval_loss": 1.9867639541625977, + "eval_runtime": 2193.1512, + "eval_samples_per_second": 13.425, + "eval_steps_per_second": 1.678, + "step": 1940 + }, + { + "epoch": 0.13380909901873328, + "grad_norm": 0.6865282654762268, + "learning_rate": 0.00017813357731015554, + "loss": 2.0013, + "step": 1950 + }, + { + "epoch": 0.13380909901873328, + "eval_accuracy": 0.5611121171150623, + "eval_loss": 1.9813686609268188, + "eval_runtime": 2191.7839, + "eval_samples_per_second": 13.433, + "eval_steps_per_second": 1.679, + "step": 1950 + }, + { + "epoch": 0.13449529952652164, + "grad_norm": 1.0006988048553467, + "learning_rate": 0.00017904849039341263, + "loss": 1.9151, + "step": 1960 + }, + { + "epoch": 0.13449529952652164, + "eval_accuracy": 0.5620602177364407, + "eval_loss": 1.9793142080307007, + "eval_runtime": 2191.7062, + "eval_samples_per_second": 13.433, + "eval_steps_per_second": 1.68, + "step": 1960 + }, + { + "epoch": 0.13518150003431004, + "grad_norm": 25.0100040435791, + "learning_rate": 0.00017996340347666974, + "loss": 2.0365, + "step": 1970 + }, + { + "epoch": 0.13518150003431004, + "eval_accuracy": 0.5600259980638242, + "eval_loss": 1.9854369163513184, + "eval_runtime": 2191.9684, + "eval_samples_per_second": 13.432, + "eval_steps_per_second": 1.679, + "step": 1970 + }, + { + "epoch": 0.1358677005420984, + "grad_norm": 0.6420261859893799, + "learning_rate": 0.00018087831655992683, + "loss": 2.0143, + "step": 1980 + }, + { + "epoch": 0.1358677005420984, + "eval_accuracy": 0.5613296530189448, + "eval_loss": 1.9770421981811523, + "eval_runtime": 2191.2212, + "eval_samples_per_second": 13.436, + "eval_steps_per_second": 1.68, + "step": 1980 + }, + { + "epoch": 0.13655390104988677, + "grad_norm": 0.7176883816719055, + "learning_rate": 0.00018179322964318392, + "loss": 1.9782, + "step": 1990 + }, + { + "epoch": 0.13655390104988677, + "eval_accuracy": 0.5617598442134819, + "eval_loss": 1.980700135231018, + "eval_runtime": 2191.7861, + "eval_samples_per_second": 13.433, + "eval_steps_per_second": 1.679, + "step": 1990 + }, + { + "epoch": 0.13724010155767516, + "grad_norm": 0.6866204738616943, + "learning_rate": 0.00018270814272644098, + "loss": 1.9449, + "step": 2000 + }, + { + "epoch": 0.13724010155767516, + "eval_accuracy": 0.5615476869445853, + "eval_loss": 1.9827600717544556, + "eval_runtime": 2193.8402, + "eval_samples_per_second": 13.42, + "eval_steps_per_second": 1.678, + "step": 2000 + }, + { + "epoch": 0.13792630206546352, + "grad_norm": 0.6652859449386597, + "learning_rate": 0.0001836230558096981, + "loss": 1.9812, + "step": 2010 + }, + { + "epoch": 0.13792630206546352, + "eval_accuracy": 0.5610583639666517, + "eval_loss": 1.9824488162994385, + "eval_runtime": 2198.5601, + "eval_samples_per_second": 13.391, + "eval_steps_per_second": 1.674, + "step": 2010 + }, + { + "epoch": 0.13861250257325192, + "grad_norm": 0.712452232837677, + "learning_rate": 0.00018453796889295518, + "loss": 2.0223, + "step": 2020 + }, + { + "epoch": 0.13861250257325192, + "eval_accuracy": 0.5598986372995852, + "eval_loss": 1.9870091676712036, + "eval_runtime": 2199.0783, + "eval_samples_per_second": 13.388, + "eval_steps_per_second": 1.674, + "step": 2020 + }, + { + "epoch": 0.13929870308104028, + "grad_norm": 0.6659247875213623, + "learning_rate": 0.00018545288197621226, + "loss": 2.0113, + "step": 2030 + }, + { + "epoch": 0.13929870308104028, + "eval_accuracy": 0.560590721535915, + "eval_loss": 1.983353614807129, + "eval_runtime": 2195.4289, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 2030 + }, + { + "epoch": 0.13998490358882865, + "grad_norm": 0.7091332674026489, + "learning_rate": 0.00018636779505946935, + "loss": 2.0452, + "step": 2040 + }, + { + "epoch": 0.13998490358882865, + "eval_accuracy": 0.5607067938069733, + "eval_loss": 1.9845926761627197, + "eval_runtime": 2196.3562, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 2040 + }, + { + "epoch": 0.14067110409661704, + "grad_norm": 0.6729557514190674, + "learning_rate": 0.00018728270814272646, + "loss": 1.9922, + "step": 2050 + }, + { + "epoch": 0.14067110409661704, + "eval_accuracy": 0.5608056013237551, + "eval_loss": 1.9840885400772095, + "eval_runtime": 2204.0084, + "eval_samples_per_second": 13.358, + "eval_steps_per_second": 1.67, + "step": 2050 + }, + { + "epoch": 0.1413573046044054, + "grad_norm": 0.6952201128005981, + "learning_rate": 0.00018819762122598355, + "loss": 1.9517, + "step": 2060 + }, + { + "epoch": 0.1413573046044054, + "eval_accuracy": 0.5593980922313718, + "eval_loss": 1.9891562461853027, + "eval_runtime": 2195.6228, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.677, + "step": 2060 + }, + { + "epoch": 0.1420435051121938, + "grad_norm": 0.7625614404678345, + "learning_rate": 0.00018911253430924064, + "loss": 1.9531, + "step": 2070 + }, + { + "epoch": 0.1420435051121938, + "eval_accuracy": 0.5598096242107102, + "eval_loss": 1.9889034032821655, + "eval_runtime": 2200.6199, + "eval_samples_per_second": 13.379, + "eval_steps_per_second": 1.673, + "step": 2070 + }, + { + "epoch": 0.14272970561998216, + "grad_norm": 0.7365880608558655, + "learning_rate": 0.00019002744739249773, + "loss": 1.9819, + "step": 2080 + }, + { + "epoch": 0.14272970561998216, + "eval_accuracy": 0.55980836255559, + "eval_loss": 1.9877856969833374, + "eval_runtime": 2198.0873, + "eval_samples_per_second": 13.394, + "eval_steps_per_second": 1.675, + "step": 2080 + }, + { + "epoch": 0.14341590612777053, + "grad_norm": 0.7005255818367004, + "learning_rate": 0.0001909423604757548, + "loss": 1.9977, + "step": 2090 + }, + { + "epoch": 0.14341590612777053, + "eval_accuracy": 0.5598768903494871, + "eval_loss": 1.9907660484313965, + "eval_runtime": 2194.9399, + "eval_samples_per_second": 13.414, + "eval_steps_per_second": 1.677, + "step": 2090 + }, + { + "epoch": 0.14410210663555892, + "grad_norm": 0.6998113393783569, + "learning_rate": 0.0001918572735590119, + "loss": 1.983, + "step": 2100 + }, + { + "epoch": 0.14410210663555892, + "eval_accuracy": 0.5593210980675892, + "eval_loss": 1.9914487600326538, + "eval_runtime": 2195.2308, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2100 + }, + { + "epoch": 0.14478830714334728, + "grad_norm": 1.0305057764053345, + "learning_rate": 0.00019277218664226899, + "loss": 2.0584, + "step": 2110 + }, + { + "epoch": 0.14478830714334728, + "eval_accuracy": 0.5584136028202109, + "eval_loss": 1.9976584911346436, + "eval_runtime": 2195.1184, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2110 + }, + { + "epoch": 0.14547450765113565, + "grad_norm": 0.6542972922325134, + "learning_rate": 0.00019368709972552607, + "loss": 1.9884, + "step": 2120 + }, + { + "epoch": 0.14547450765113565, + "eval_accuracy": 0.5590180684285879, + "eval_loss": 1.9945608377456665, + "eval_runtime": 2196.4694, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.676, + "step": 2120 + }, + { + "epoch": 0.14616070815892404, + "grad_norm": 0.6551515460014343, + "learning_rate": 0.0001946020128087832, + "loss": 1.9907, + "step": 2130 + }, + { + "epoch": 0.14616070815892404, + "eval_accuracy": 0.5585770535611776, + "eval_loss": 1.996436357498169, + "eval_runtime": 2195.8503, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 2130 + }, + { + "epoch": 0.1468469086667124, + "grad_norm": 0.7233726382255554, + "learning_rate": 0.00019551692589204027, + "loss": 1.8951, + "step": 2140 + }, + { + "epoch": 0.1468469086667124, + "eval_accuracy": 0.5590956602184801, + "eval_loss": 1.994140625, + "eval_runtime": 2198.6021, + "eval_samples_per_second": 13.391, + "eval_steps_per_second": 1.674, + "step": 2140 + }, + { + "epoch": 0.1475331091745008, + "grad_norm": 0.6772713661193848, + "learning_rate": 0.00019643183897529736, + "loss": 1.9488, + "step": 2150 + }, + { + "epoch": 0.1475331091745008, + "eval_accuracy": 0.558802757021891, + "eval_loss": 1.9978493452072144, + "eval_runtime": 2198.8115, + "eval_samples_per_second": 13.39, + "eval_steps_per_second": 1.674, + "step": 2150 + }, + { + "epoch": 0.14821930968228916, + "grad_norm": NaN, + "learning_rate": 0.00019734675205855445, + "loss": 2.0301, + "step": 2160 + }, + { + "epoch": 0.14821930968228916, + "eval_accuracy": 0.5583828582770187, + "eval_loss": 2.0025622844696045, + "eval_runtime": 2195.3189, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 2160 + }, + { + "epoch": 0.14890551019007753, + "grad_norm": 0.7715523838996887, + "learning_rate": 0.00019817017383348584, + "loss": 2.0215, + "step": 2170 + }, + { + "epoch": 0.14890551019007753, + "eval_accuracy": 0.5581144909523723, + "eval_loss": 2.007227659225464, + "eval_runtime": 2201.5716, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.672, + "step": 2170 + }, + { + "epoch": 0.14959171069786592, + "grad_norm": 0.7113980650901794, + "learning_rate": 0.00019908508691674292, + "loss": 1.9645, + "step": 2180 + }, + { + "epoch": 0.14959171069786592, + "eval_accuracy": 0.5589439627910016, + "eval_loss": 1.9947305917739868, + "eval_runtime": 2196.025, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 2180 + }, + { + "epoch": 0.15027791120565429, + "grad_norm": 0.6457582712173462, + "learning_rate": 0.0002, + "loss": 1.9758, + "step": 2190 + }, + { + "epoch": 0.15027791120565429, + "eval_accuracy": 0.5590743780886894, + "eval_loss": 1.9934245347976685, + "eval_runtime": 2196.674, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 2190 + }, + { + "epoch": 0.15096411171344268, + "grad_norm": 0.5914057493209839, + "learning_rate": 0.00019995184552042955, + "loss": 1.9647, + "step": 2200 + }, + { + "epoch": 0.15096411171344268, + "eval_accuracy": 0.5594601789438658, + "eval_loss": 1.9928838014602661, + "eval_runtime": 2200.2943, + "eval_samples_per_second": 13.381, + "eval_steps_per_second": 1.673, + "step": 2200 + }, + { + "epoch": 0.15165031222123104, + "grad_norm": 0.6294174194335938, + "learning_rate": 0.00019990369104085908, + "loss": 1.9752, + "step": 2210 + }, + { + "epoch": 0.15165031222123104, + "eval_accuracy": 0.5588302478229311, + "eval_loss": 1.9968582391738892, + "eval_runtime": 2195.8847, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 2210 + }, + { + "epoch": 0.1523365127290194, + "grad_norm": 0.6448546051979065, + "learning_rate": 0.00019985553656128862, + "loss": 1.9605, + "step": 2220 + }, + { + "epoch": 0.1523365127290194, + "eval_accuracy": 0.5588568089833563, + "eval_loss": 1.9973441362380981, + "eval_runtime": 2200.5474, + "eval_samples_per_second": 13.379, + "eval_steps_per_second": 1.673, + "step": 2220 + }, + { + "epoch": 0.1530227132368078, + "grad_norm": 0.8129657506942749, + "learning_rate": 0.00019980738208171816, + "loss": 2.0269, + "step": 2230 + }, + { + "epoch": 0.1530227132368078, + "eval_accuracy": 0.5584346525398479, + "eval_loss": 1.997729778289795, + "eval_runtime": 2196.0138, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 2230 + }, + { + "epoch": 0.15370891374459617, + "grad_norm": 0.5907541513442993, + "learning_rate": 0.0001997592276021477, + "loss": 1.9763, + "step": 2240 + }, + { + "epoch": 0.15370891374459617, + "eval_accuracy": 0.5583808329885362, + "eval_loss": 2.0000393390655518, + "eval_runtime": 2195.0414, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 2240 + }, + { + "epoch": 0.15439511425238456, + "grad_norm": 0.6172444820404053, + "learning_rate": 0.00019971107312257723, + "loss": 1.961, + "step": 2250 + }, + { + "epoch": 0.15439511425238456, + "eval_accuracy": 0.5588485086207234, + "eval_loss": 1.9951170682907104, + "eval_runtime": 2195.2092, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2250 + }, + { + "epoch": 0.15508131476017292, + "grad_norm": 1.708965539932251, + "learning_rate": 0.00019966291864300677, + "loss": 1.9676, + "step": 2260 + }, + { + "epoch": 0.15508131476017292, + "eval_accuracy": 0.5570037696262905, + "eval_loss": 2.0055410861968994, + "eval_runtime": 2195.1746, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2260 + }, + { + "epoch": 0.1557675152679613, + "grad_norm": 0.7572776079177856, + "learning_rate": 0.0001996147641634363, + "loss": 1.9993, + "step": 2270 + }, + { + "epoch": 0.1557675152679613, + "eval_accuracy": 0.5573890724597089, + "eval_loss": 1.9998122453689575, + "eval_runtime": 2195.7893, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 2270 + }, + { + "epoch": 0.15645371577574968, + "grad_norm": 0.6893654465675354, + "learning_rate": 0.00019956660968386585, + "loss": 1.949, + "step": 2280 + }, + { + "epoch": 0.15645371577574968, + "eval_accuracy": 0.5586115498682799, + "eval_loss": 1.9959582090377808, + "eval_runtime": 2195.3478, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 2280 + }, + { + "epoch": 0.15713991628353804, + "grad_norm": 0.6196193695068359, + "learning_rate": 0.00019951845520429538, + "loss": 2.0126, + "step": 2290 + }, + { + "epoch": 0.15713991628353804, + "eval_accuracy": 0.5588276249083391, + "eval_loss": 1.9977123737335205, + "eval_runtime": 2196.787, + "eval_samples_per_second": 13.402, + "eval_steps_per_second": 1.676, + "step": 2290 + }, + { + "epoch": 0.15782611679132644, + "grad_norm": 5.14415979385376, + "learning_rate": 0.00019947030072472492, + "loss": 2.0028, + "step": 2300 + }, + { + "epoch": 0.15782611679132644, + "eval_accuracy": 0.5582278739059375, + "eval_loss": 2.0048232078552246, + "eval_runtime": 2196.1102, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 2300 + }, + { + "epoch": 0.1585123172991148, + "grad_norm": 0.6127263307571411, + "learning_rate": 0.00019942214624515448, + "loss": 2.0287, + "step": 2310 + }, + { + "epoch": 0.1585123172991148, + "eval_accuracy": 0.5586611860368246, + "eval_loss": 1.995181679725647, + "eval_runtime": 2195.907, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 2310 + }, + { + "epoch": 0.15919851780690317, + "grad_norm": 0.6055991649627686, + "learning_rate": 0.000199373991765584, + "loss": 1.9834, + "step": 2320 + }, + { + "epoch": 0.15919851780690317, + "eval_accuracy": 0.5592904863301992, + "eval_loss": 1.9946677684783936, + "eval_runtime": 2195.4572, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 2320 + }, + { + "epoch": 0.15988471831469156, + "grad_norm": 0.6904438734054565, + "learning_rate": 0.00019932583728601356, + "loss": 1.98, + "step": 2330 + }, + { + "epoch": 0.15988471831469156, + "eval_accuracy": 0.5583102135032557, + "eval_loss": 1.9992380142211914, + "eval_runtime": 2195.7129, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 2330 + }, + { + "epoch": 0.16057091882247992, + "grad_norm": 0.7183199524879456, + "learning_rate": 0.00019927768280644307, + "loss": 2.0404, + "step": 2340 + }, + { + "epoch": 0.16057091882247992, + "eval_accuracy": 0.5585802741018792, + "eval_loss": 1.9996939897537231, + "eval_runtime": 2195.2538, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2340 + }, + { + "epoch": 0.16125711933026832, + "grad_norm": 0.6204356551170349, + "learning_rate": 0.00019922952832687263, + "loss": 1.9546, + "step": 2350 + }, + { + "epoch": 0.16125711933026832, + "eval_accuracy": 0.5581378647735465, + "eval_loss": 1.9987818002700806, + "eval_runtime": 2195.0806, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 2350 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 0.6199485063552856, + "learning_rate": 0.00019918137384730214, + "loss": 1.9747, + "step": 2360 + }, + { + "epoch": 0.16194331983805668, + "eval_accuracy": 0.558920688574179, + "eval_loss": 1.9964611530303955, + "eval_runtime": 2195.364, + "eval_samples_per_second": 13.411, + "eval_steps_per_second": 1.677, + "step": 2360 + }, + { + "epoch": 0.16262952034584505, + "grad_norm": 0.6137880086898804, + "learning_rate": 0.00019913321936773168, + "loss": 1.9203, + "step": 2370 + }, + { + "epoch": 0.16262952034584505, + "eval_accuracy": 0.5578602342442018, + "eval_loss": 1.9984673261642456, + "eval_runtime": 2195.4886, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 2370 + }, + { + "epoch": 0.16331572085363344, + "grad_norm": 0.6386215090751648, + "learning_rate": 0.00019908506488816125, + "loss": 2.0383, + "step": 2380 + }, + { + "epoch": 0.16331572085363344, + "eval_accuracy": 0.559068036611638, + "eval_loss": 1.9952830076217651, + "eval_runtime": 2195.2671, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2380 + }, + { + "epoch": 0.1640019213614218, + "grad_norm": 0.6018114686012268, + "learning_rate": 0.00019903691040859076, + "loss": 1.9676, + "step": 2390 + }, + { + "epoch": 0.1640019213614218, + "eval_accuracy": 0.5586635433398123, + "eval_loss": 1.9960980415344238, + "eval_runtime": 2197.0618, + "eval_samples_per_second": 13.401, + "eval_steps_per_second": 1.675, + "step": 2390 + }, + { + "epoch": 0.1646881218692102, + "grad_norm": 0.6331018805503845, + "learning_rate": 0.00019898875592902032, + "loss": 1.9598, + "step": 2400 + }, + { + "epoch": 0.1646881218692102, + "eval_accuracy": 0.5588065419872515, + "eval_loss": 1.9976948499679565, + "eval_runtime": 2195.2367, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2400 + }, + { + "epoch": 0.16537432237699856, + "grad_norm": 0.6709062457084656, + "learning_rate": 0.00019894060144944983, + "loss": 1.9985, + "step": 2410 + }, + { + "epoch": 0.16537432237699856, + "eval_accuracy": 0.5586920633858189, + "eval_loss": 1.9963314533233643, + "eval_runtime": 2195.5736, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 2410 + }, + { + "epoch": 0.16606052288478693, + "grad_norm": 0.6212249398231506, + "learning_rate": 0.0001988924469698794, + "loss": 1.9174, + "step": 2420 + }, + { + "epoch": 0.16606052288478693, + "eval_accuracy": 0.5585765223379692, + "eval_loss": 1.9955825805664062, + "eval_runtime": 2195.9048, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 2420 + }, + { + "epoch": 0.16674672339257532, + "grad_norm": 0.5908493399620056, + "learning_rate": 0.0001988442924903089, + "loss": 1.9768, + "step": 2430 + }, + { + "epoch": 0.16674672339257532, + "eval_accuracy": 0.5599598607743654, + "eval_loss": 1.996363878250122, + "eval_runtime": 2196.1829, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 2430 + }, + { + "epoch": 0.16743292390036368, + "grad_norm": 0.6591713428497314, + "learning_rate": 0.00019879613801073847, + "loss": 1.9607, + "step": 2440 + }, + { + "epoch": 0.16743292390036368, + "eval_accuracy": 0.5579004412007955, + "eval_loss": 1.9992504119873047, + "eval_runtime": 2195.5161, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 2440 + }, + { + "epoch": 0.16811912440815205, + "grad_norm": 0.6437056660652161, + "learning_rate": 0.00019874798353116798, + "loss": 2.0276, + "step": 2450 + }, + { + "epoch": 0.16811912440815205, + "eval_accuracy": 0.5586110186450713, + "eval_loss": 1.9952548742294312, + "eval_runtime": 2195.2496, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2450 + }, + { + "epoch": 0.16880532491594044, + "grad_norm": 0.564586877822876, + "learning_rate": 0.00019869982905159754, + "loss": 1.9414, + "step": 2460 + }, + { + "epoch": 0.16880532491594044, + "eval_accuracy": 0.5588329039389736, + "eval_loss": 1.9931731224060059, + "eval_runtime": 2195.1623, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2460 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 0.6013181805610657, + "learning_rate": 0.00019865167457202708, + "loss": 1.9564, + "step": 2470 + }, + { + "epoch": 0.1694915254237288, + "eval_accuracy": 0.5598632777547692, + "eval_loss": 1.9917162656784058, + "eval_runtime": 2196.6864, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 2470 + }, + { + "epoch": 0.1701777259315172, + "grad_norm": 0.6139594912528992, + "learning_rate": 0.00019860352009245662, + "loss": 1.9678, + "step": 2480 + }, + { + "epoch": 0.1701777259315172, + "eval_accuracy": 0.5594104431709697, + "eval_loss": 1.9919596910476685, + "eval_runtime": 2195.2781, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2480 + }, + { + "epoch": 0.17086392643930556, + "grad_norm": 0.6118595600128174, + "learning_rate": 0.00019855536561288616, + "loss": 1.9903, + "step": 2490 + }, + { + "epoch": 0.17086392643930556, + "eval_accuracy": 0.5595184142880981, + "eval_loss": 1.9930351972579956, + "eval_runtime": 2195.0637, + "eval_samples_per_second": 13.413, + "eval_steps_per_second": 1.677, + "step": 2490 + }, + { + "epoch": 0.17155012694709393, + "grad_norm": 0.6277731657028198, + "learning_rate": 0.0001985072111333157, + "loss": 1.9994, + "step": 2500 + }, + { + "epoch": 0.17155012694709393, + "eval_accuracy": 0.5604368328127014, + "eval_loss": 1.9892007112503052, + "eval_runtime": 2196.3916, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 2500 + }, + { + "epoch": 0.17223632745488232, + "grad_norm": 0.6022769808769226, + "learning_rate": 0.00019845905665374523, + "loss": 1.955, + "step": 2510 + }, + { + "epoch": 0.17223632745488232, + "eval_accuracy": 0.5598575671052778, + "eval_loss": 1.9900598526000977, + "eval_runtime": 2195.1324, + "eval_samples_per_second": 13.412, + "eval_steps_per_second": 1.677, + "step": 2510 + }, + { + "epoch": 0.17292252796267069, + "grad_norm": 0.7676475644111633, + "learning_rate": 0.00019841090217417477, + "loss": 1.9778, + "step": 2520 + }, + { + "epoch": 0.17292252796267069, + "eval_accuracy": 0.5599910369364145, + "eval_loss": 1.9912259578704834, + "eval_runtime": 2195.5014, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 2520 + }, + { + "epoch": 0.17360872847045908, + "grad_norm": 0.6049858331680298, + "learning_rate": 0.0001983627476946043, + "loss": 2.0018, + "step": 2530 + }, + { + "epoch": 0.17360872847045908, + "eval_accuracy": 0.5597790788762212, + "eval_loss": 1.9898487329483032, + "eval_runtime": 2196.3189, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 2530 + }, + { + "epoch": 0.17429492897824744, + "grad_norm": 0.597239077091217, + "learning_rate": 0.00019831459321503382, + "loss": 1.9675, + "step": 2540 + }, + { + "epoch": 0.17429492897824744, + "eval_accuracy": 0.5608272154680511, + "eval_loss": 1.987794041633606, + "eval_runtime": 2196.2383, + "eval_samples_per_second": 13.406, + "eval_steps_per_second": 1.676, + "step": 2540 + }, + { + "epoch": 0.1749811294860358, + "grad_norm": 0.6299741864204407, + "learning_rate": 0.00019826643873546338, + "loss": 2.0204, + "step": 2550 + }, + { + "epoch": 0.1749811294860358, + "eval_accuracy": 0.5596887709307754, + "eval_loss": 1.9905835390090942, + "eval_runtime": 2196.3707, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 2550 + }, + { + "epoch": 0.1756673299938242, + "grad_norm": 0.6488902568817139, + "learning_rate": 0.00019821828425589292, + "loss": 1.9863, + "step": 2560 + }, + { + "epoch": 0.1756673299938242, + "eval_accuracy": 0.5600538872822707, + "eval_loss": 1.9906458854675293, + "eval_runtime": 2195.6961, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 2560 + }, + { + "epoch": 0.17635353050161257, + "grad_norm": 0.6427273154258728, + "learning_rate": 0.00019817012977632246, + "loss": 1.9403, + "step": 2570 + }, + { + "epoch": 0.17635353050161257, + "eval_accuracy": 0.5603546260211854, + "eval_loss": 1.989209532737732, + "eval_runtime": 2196.6688, + "eval_samples_per_second": 13.403, + "eval_steps_per_second": 1.676, + "step": 2570 + }, + { + "epoch": 0.17703973100940096, + "grad_norm": 0.5979894399642944, + "learning_rate": 0.000198121975296752, + "loss": 1.9437, + "step": 2580 + }, + { + "epoch": 0.17703973100940096, + "eval_accuracy": 0.5593561587993505, + "eval_loss": 1.9925955533981323, + "eval_runtime": 2195.5236, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 2580 + }, + { + "epoch": 0.17772593151718932, + "grad_norm": 0.6633705496788025, + "learning_rate": 0.00019807382081718153, + "loss": 1.9804, + "step": 2590 + }, + { + "epoch": 0.17772593151718932, + "eval_accuracy": 0.5602276636743527, + "eval_loss": 1.988720178604126, + "eval_runtime": 2195.5843, + "eval_samples_per_second": 13.41, + "eval_steps_per_second": 1.677, + "step": 2590 + }, + { + "epoch": 0.1784121320249777, + "grad_norm": 0.5960366129875183, + "learning_rate": 0.00019802566633761107, + "loss": 1.9351, + "step": 2600 + }, + { + "epoch": 0.1784121320249777, + "eval_accuracy": 0.559032876275525, + "eval_loss": 1.9919941425323486, + "eval_runtime": 2195.8734, + "eval_samples_per_second": 13.408, + "eval_steps_per_second": 1.676, + "step": 2600 + }, + { + "epoch": 0.17909833253276608, + "grad_norm": 0.6105952858924866, + "learning_rate": 0.0001979775118580406, + "loss": 1.9903, + "step": 2610 + }, + { + "epoch": 0.17909833253276608, + "eval_accuracy": 0.5600523268140957, + "eval_loss": 1.9881607294082642, + "eval_runtime": 2196.3376, + "eval_samples_per_second": 13.405, + "eval_steps_per_second": 1.676, + "step": 2610 + }, + { + "epoch": 0.17978453304055445, + "grad_norm": 0.6086561679840088, + "learning_rate": 0.00019792935737847014, + "loss": 1.9367, + "step": 2620 + }, + { + "epoch": 0.17978453304055445, + "eval_accuracy": 0.5591538955627124, + "eval_loss": 1.991269588470459, + "eval_runtime": 2196.0471, + "eval_samples_per_second": 13.407, + "eval_steps_per_second": 1.676, + "step": 2620 + }, + { + "epoch": 0.18047073354834284, + "grad_norm": 2.8778371810913086, + "learning_rate": 0.00019788120289889968, + "loss": 1.9686, + "step": 2630 + }, + { + "epoch": 0.18047073354834284, + "eval_accuracy": 0.5572145988371657, + "eval_loss": 2.003204107284546, + "eval_runtime": 2195.7568, + "eval_samples_per_second": 13.409, + "eval_steps_per_second": 1.676, + "step": 2630 + } + ], + "logging_steps": 10, + "max_steps": 43719, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.8121688667193344e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}