diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,4249 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.18047073354834284, - "eval_steps": 10, - "global_step": 2630, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 6.862005077883757e-05, - "grad_norm": 1.9216967821121216, - "learning_rate": 0.0, - "loss": 1.6865, - "step": 1 - }, - { - "epoch": 0.0006862005077883758, - "grad_norm": 1.8931514024734497, - "learning_rate": 8.234217749313815e-07, - "loss": 1.7993, - "step": 10 - }, - { - "epoch": 0.0006862005077883758, - "eval_accuracy": 0.5898158335459887, - "eval_loss": 1.8085482120513916, - "eval_runtime": 2190.4949, - "eval_samples_per_second": 13.441, - "eval_steps_per_second": 1.68, - "step": 10 - }, - { - "epoch": 0.0013724010155767516, - "grad_norm": 1.8920422792434692, - "learning_rate": 1.7383348581884722e-06, - "loss": 1.8431, - "step": 20 - }, - { - "epoch": 0.0013724010155767516, - "eval_accuracy": 0.5915482852347239, - "eval_loss": 1.7978498935699463, - "eval_runtime": 2191.0867, - "eval_samples_per_second": 13.437, - "eval_steps_per_second": 1.68, - "step": 20 - }, - { - "epoch": 0.0020586015233651275, - "grad_norm": 1.7306218147277832, - "learning_rate": 2.653247941445563e-06, - "loss": 1.7786, - "step": 30 - }, - { - "epoch": 0.0020586015233651275, - "eval_accuracy": 0.5931407264065678, - "eval_loss": 1.7860997915267944, - "eval_runtime": 2192.9273, - "eval_samples_per_second": 13.426, - "eval_steps_per_second": 1.679, - "step": 30 - }, - { - "epoch": 0.002744802031153503, - "grad_norm": 1.678896427154541, - "learning_rate": 3.5681610247026537e-06, - "loss": 1.7845, - "step": 40 - }, - { - "epoch": 0.002744802031153503, - "eval_accuracy": 0.5940313221156256, - "eval_loss": 1.777963638305664, - "eval_runtime": 2193.6313, - "eval_samples_per_second": 13.422, - "eval_steps_per_second": 1.678, - "step": 40 - }, - { - "epoch": 0.003431002538941879, - "grad_norm": 1.608749270439148, - "learning_rate": 4.483074107959744e-06, - "loss": 1.7561, - "step": 50 - }, - { - "epoch": 0.003431002538941879, - "eval_accuracy": 0.5950869622352757, - "eval_loss": 1.7716691493988037, - "eval_runtime": 2195.0575, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 50 - }, - { - "epoch": 0.004117203046730255, - "grad_norm": 1.5839513540267944, - "learning_rate": 5.397987191216835e-06, - "loss": 1.7809, - "step": 60 - }, - { - "epoch": 0.004117203046730255, - "eval_accuracy": 0.5955689808940925, - "eval_loss": 1.767722725868225, - "eval_runtime": 2194.6298, - "eval_samples_per_second": 13.415, - "eval_steps_per_second": 1.677, - "step": 60 - }, - { - "epoch": 0.00480340355451863, - "grad_norm": 1.6457411050796509, - "learning_rate": 6.312900274473925e-06, - "loss": 1.7554, - "step": 70 - }, - { - "epoch": 0.00480340355451863, - "eval_accuracy": 0.5958804437015288, - "eval_loss": 1.765299916267395, - "eval_runtime": 2194.3885, - "eval_samples_per_second": 13.417, - "eval_steps_per_second": 1.677, - "step": 70 - }, - { - "epoch": 0.005489604062307006, - "grad_norm": 1.5799423456192017, - "learning_rate": 7.227813357731015e-06, - "loss": 1.7361, - "step": 80 - }, - { - "epoch": 0.005489604062307006, - "eval_accuracy": 0.5960319751217547, - "eval_loss": 1.7639434337615967, - "eval_runtime": 2195.254, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 80 - }, - { - "epoch": 0.0061758045700953815, - "grad_norm": 1.7162026166915894, - "learning_rate": 8.142726440988106e-06, - "loss": 1.7427, - "step": 90 - }, - { - "epoch": 0.0061758045700953815, - "eval_accuracy": 0.5961102309406575, - "eval_loss": 1.7636356353759766, - "eval_runtime": 2195.6998, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 90 - }, - { - "epoch": 0.006862005077883758, - "grad_norm": 1.703099012374878, - "learning_rate": 9.057639524245198e-06, - "loss": 1.7479, - "step": 100 - }, - { - "epoch": 0.006862005077883758, - "eval_accuracy": 0.5962347363801508, - "eval_loss": 1.7628475427627563, - "eval_runtime": 2195.4873, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 100 - }, - { - "epoch": 0.007548205585672134, - "grad_norm": 1.5629897117614746, - "learning_rate": 9.972552607502288e-06, - "loss": 1.7511, - "step": 110 - }, - { - "epoch": 0.007548205585672134, - "eval_accuracy": 0.5963783990566007, - "eval_loss": 1.761244535446167, - "eval_runtime": 2194.8861, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 110 - }, - { - "epoch": 0.00823440609346051, - "grad_norm": 1.8800327777862549, - "learning_rate": 1.0887465690759378e-05, - "loss": 1.7918, - "step": 120 - }, - { - "epoch": 0.00823440609346051, - "eval_accuracy": 0.595912250691138, - "eval_loss": 1.7631102800369263, - "eval_runtime": 2195.8634, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 120 - }, - { - "epoch": 0.008920606601248885, - "grad_norm": 1.6483738422393799, - "learning_rate": 1.180237877401647e-05, - "loss": 1.7451, - "step": 130 - }, - { - "epoch": 0.008920606601248885, - "eval_accuracy": 0.5959484734736679, - "eval_loss": 1.7631044387817383, - "eval_runtime": 2194.7206, - "eval_samples_per_second": 13.415, - "eval_steps_per_second": 1.677, - "step": 130 - }, - { - "epoch": 0.00960680710903726, - "grad_norm": 1.5531351566314697, - "learning_rate": 1.2717291857273558e-05, - "loss": 1.8069, - "step": 140 - }, - { - "epoch": 0.00960680710903726, - "eval_accuracy": 0.595846910236492, - "eval_loss": 1.7640652656555176, - "eval_runtime": 2194.5306, - "eval_samples_per_second": 13.416, - "eval_steps_per_second": 1.677, - "step": 140 - }, - { - "epoch": 0.010293007616825637, - "grad_norm": 1.653224229812622, - "learning_rate": 1.3632204940530649e-05, - "loss": 1.774, - "step": 150 - }, - { - "epoch": 0.010293007616825637, - "eval_accuracy": 0.5955285083258945, - "eval_loss": 1.76512610912323, - "eval_runtime": 2194.8618, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 150 - }, - { - "epoch": 0.010979208124614013, - "grad_norm": 1.4969476461410522, - "learning_rate": 1.454711802378774e-05, - "loss": 1.7293, - "step": 160 - }, - { - "epoch": 0.010979208124614013, - "eval_accuracy": 0.5952186723895343, - "eval_loss": 1.7661479711532593, - "eval_runtime": 2195.9262, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 160 - }, - { - "epoch": 0.011665408632402388, - "grad_norm": 1.5510554313659668, - "learning_rate": 1.546203110704483e-05, - "loss": 1.8144, - "step": 170 - }, - { - "epoch": 0.011665408632402388, - "eval_accuracy": 0.5952723923364943, - "eval_loss": 1.7658929824829102, - "eval_runtime": 2195.7289, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 170 - }, - { - "epoch": 0.012351609140190763, - "grad_norm": 1.5824214220046997, - "learning_rate": 1.6376944190301923e-05, - "loss": 1.762, - "step": 180 - }, - { - "epoch": 0.012351609140190763, - "eval_accuracy": 0.5950958270225676, - "eval_loss": 1.7668511867523193, - "eval_runtime": 2196.0319, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 180 - }, - { - "epoch": 0.01303780964797914, - "grad_norm": 1.536055088043213, - "learning_rate": 1.7291857273559013e-05, - "loss": 1.7647, - "step": 190 - }, - { - "epoch": 0.01303780964797914, - "eval_accuracy": 0.5946427600286144, - "eval_loss": 1.7685387134552002, - "eval_runtime": 2194.9602, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 190 - }, - { - "epoch": 0.013724010155767515, - "grad_norm": 1.5057470798492432, - "learning_rate": 1.8206770356816103e-05, - "loss": 1.7745, - "step": 200 - }, - { - "epoch": 0.013724010155767515, - "eval_accuracy": 0.5946046447634041, - "eval_loss": 1.7696959972381592, - "eval_runtime": 2194.9716, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 200 - }, - { - "epoch": 0.01441021066355589, - "grad_norm": 1.3596829175949097, - "learning_rate": 1.9121683440073193e-05, - "loss": 1.7936, - "step": 210 - }, - { - "epoch": 0.01441021066355589, - "eval_accuracy": 0.5943018475345566, - "eval_loss": 1.7707691192626953, - "eval_runtime": 2195.7035, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 210 - }, - { - "epoch": 0.015096411171344267, - "grad_norm": 1.5187965631484985, - "learning_rate": 2.0036596523330283e-05, - "loss": 1.7726, - "step": 220 - }, - { - "epoch": 0.015096411171344267, - "eval_accuracy": 0.5939556892113148, - "eval_loss": 1.7723795175552368, - "eval_runtime": 2196.0461, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 220 - }, - { - "epoch": 0.01578261167913264, - "grad_norm": 1.4719396829605103, - "learning_rate": 2.0951509606587374e-05, - "loss": 1.7646, - "step": 230 - }, - { - "epoch": 0.01578261167913264, - "eval_accuracy": 0.5935164340207827, - "eval_loss": 1.7753576040267944, - "eval_runtime": 2196.1999, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 230 - }, - { - "epoch": 0.01646881218692102, - "grad_norm": 1.4845266342163086, - "learning_rate": 2.1866422689844464e-05, - "loss": 1.7424, - "step": 240 - }, - { - "epoch": 0.01646881218692102, - "eval_accuracy": 0.5930875044813658, - "eval_loss": 1.777539849281311, - "eval_runtime": 2196.2424, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 240 - }, - { - "epoch": 0.017155012694709395, - "grad_norm": 1.3525185585021973, - "learning_rate": 2.2781335773101558e-05, - "loss": 1.7129, - "step": 250 - }, - { - "epoch": 0.017155012694709395, - "eval_accuracy": 0.5927156814368632, - "eval_loss": 1.7804397344589233, - "eval_runtime": 2195.161, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 250 - }, - { - "epoch": 0.01784121320249777, - "grad_norm": 1.3116035461425781, - "learning_rate": 2.3696248856358648e-05, - "loss": 1.7057, - "step": 260 - }, - { - "epoch": 0.01784121320249777, - "eval_accuracy": 0.5927907831179655, - "eval_loss": 1.77951180934906, - "eval_runtime": 2195.0811, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 260 - }, - { - "epoch": 0.018527413710286145, - "grad_norm": 1.427150011062622, - "learning_rate": 2.4611161939615738e-05, - "loss": 1.7832, - "step": 270 - }, - { - "epoch": 0.018527413710286145, - "eval_accuracy": 0.5925692298385685, - "eval_loss": 1.78168523311615, - "eval_runtime": 2195.7308, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 270 - }, - { - "epoch": 0.01921361421807452, - "grad_norm": 1.4132215976715088, - "learning_rate": 2.552607502287283e-05, - "loss": 1.7742, - "step": 280 - }, - { - "epoch": 0.01921361421807452, - "eval_accuracy": 0.5916866688805393, - "eval_loss": 1.7839642763137817, - "eval_runtime": 2194.9088, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 280 - }, - { - "epoch": 0.019899814725862896, - "grad_norm": 1.3926459550857544, - "learning_rate": 2.6440988106129922e-05, - "loss": 1.7713, - "step": 290 - }, - { - "epoch": 0.019899814725862896, - "eval_accuracy": 0.592083725027446, - "eval_loss": 1.784454345703125, - "eval_runtime": 2195.1648, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 290 - }, - { - "epoch": 0.020586015233651275, - "grad_norm": 1.2985360622406006, - "learning_rate": 2.7355901189387012e-05, - "loss": 1.8099, - "step": 300 - }, - { - "epoch": 0.020586015233651275, - "eval_accuracy": 0.5915102363724148, - "eval_loss": 1.7870558500289917, - "eval_runtime": 2196.189, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 300 - }, - { - "epoch": 0.02127221574143965, - "grad_norm": 1.3202718496322632, - "learning_rate": 2.8270814272644102e-05, - "loss": 1.8158, - "step": 310 - }, - { - "epoch": 0.02127221574143965, - "eval_accuracy": 0.5911451532223702, - "eval_loss": 1.789017677307129, - "eval_runtime": 2195.7623, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 310 - }, - { - "epoch": 0.021958416249228025, - "grad_norm": 1.3661465644836426, - "learning_rate": 2.9185727355901192e-05, - "loss": 1.7867, - "step": 320 - }, - { - "epoch": 0.021958416249228025, - "eval_accuracy": 0.5906047664135189, - "eval_loss": 1.7920600175857544, - "eval_runtime": 2196.3235, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 320 - }, - { - "epoch": 0.0226446167570164, - "grad_norm": 1.2746013402938843, - "learning_rate": 3.0100640439158283e-05, - "loss": 1.7356, - "step": 330 - }, - { - "epoch": 0.0226446167570164, - "eval_accuracy": 0.5906917210124609, - "eval_loss": 1.7915929555892944, - "eval_runtime": 2196.2755, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 330 - }, - { - "epoch": 0.023330817264804776, - "grad_norm": 1.3848775625228882, - "learning_rate": 3.101555352241537e-05, - "loss": 1.7845, - "step": 340 - }, - { - "epoch": 0.023330817264804776, - "eval_accuracy": 0.5902880577768986, - "eval_loss": 1.7954319715499878, - "eval_runtime": 2195.6581, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 340 - }, - { - "epoch": 0.02401701777259315, - "grad_norm": 1.3434277772903442, - "learning_rate": 3.193046660567246e-05, - "loss": 1.8831, - "step": 350 - }, - { - "epoch": 0.02401701777259315, - "eval_accuracy": 0.5896593219081829, - "eval_loss": 1.7972683906555176, - "eval_runtime": 2196.3435, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 350 - }, - { - "epoch": 0.024703218280381526, - "grad_norm": 1.266026258468628, - "learning_rate": 3.284537968892955e-05, - "loss": 1.7802, - "step": 360 - }, - { - "epoch": 0.024703218280381526, - "eval_accuracy": 0.5896452112917071, - "eval_loss": 1.7977614402770996, - "eval_runtime": 2194.9145, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 360 - }, - { - "epoch": 0.025389418788169905, - "grad_norm": 1.2292088270187378, - "learning_rate": 3.3760292772186643e-05, - "loss": 1.8164, - "step": 370 - }, - { - "epoch": 0.025389418788169905, - "eval_accuracy": 0.5890495108662703, - "eval_loss": 1.8011078834533691, - "eval_runtime": 2195.3229, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 370 - }, - { - "epoch": 0.02607561929595828, - "grad_norm": 1.2202467918395996, - "learning_rate": 3.467520585544373e-05, - "loss": 1.8117, - "step": 380 - }, - { - "epoch": 0.02607561929595828, - "eval_accuracy": 0.5890773336818158, - "eval_loss": 1.8025044202804565, - "eval_runtime": 2195.1721, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 380 - }, - { - "epoch": 0.026761819803746655, - "grad_norm": 1.2751961946487427, - "learning_rate": 3.5590118938700824e-05, - "loss": 1.8681, - "step": 390 - }, - { - "epoch": 0.026761819803746655, - "eval_accuracy": 0.5889283255718303, - "eval_loss": 1.8029353618621826, - "eval_runtime": 2194.9883, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 390 - }, - { - "epoch": 0.02744802031153503, - "grad_norm": 1.1804041862487793, - "learning_rate": 3.650503202195791e-05, - "loss": 1.7925, - "step": 400 - }, - { - "epoch": 0.02744802031153503, - "eval_accuracy": 0.5888150754240672, - "eval_loss": 1.8044122457504272, - "eval_runtime": 2195.7663, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 400 - }, - { - "epoch": 0.028134220819323406, - "grad_norm": 1.2452822923660278, - "learning_rate": 3.7419945105215004e-05, - "loss": 1.8606, - "step": 410 - }, - { - "epoch": 0.028134220819323406, - "eval_accuracy": 0.5877631206654261, - "eval_loss": 1.8087084293365479, - "eval_runtime": 2195.4238, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 410 - }, - { - "epoch": 0.02882042132711178, - "grad_norm": 1.3155659437179565, - "learning_rate": 3.83348581884721e-05, - "loss": 1.7495, - "step": 420 - }, - { - "epoch": 0.02882042132711178, - "eval_accuracy": 0.5877042876950842, - "eval_loss": 1.8115092515945435, - "eval_runtime": 2195.0116, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 420 - }, - { - "epoch": 0.029506621834900156, - "grad_norm": 1.2945083379745483, - "learning_rate": 3.9249771271729185e-05, - "loss": 1.8207, - "step": 430 - }, - { - "epoch": 0.029506621834900156, - "eval_accuracy": 0.5871699103487793, - "eval_loss": 1.812555193901062, - "eval_runtime": 2196.4551, - "eval_samples_per_second": 13.404, - "eval_steps_per_second": 1.676, - "step": 430 - }, - { - "epoch": 0.030192822342688535, - "grad_norm": 1.2076581716537476, - "learning_rate": 4.016468435498628e-05, - "loss": 1.8009, - "step": 440 - }, - { - "epoch": 0.030192822342688535, - "eval_accuracy": 0.5872769186238423, - "eval_loss": 1.8119513988494873, - "eval_runtime": 2195.9383, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 440 - }, - { - "epoch": 0.03087902285047691, - "grad_norm": 1.2269212007522583, - "learning_rate": 4.1079597438243365e-05, - "loss": 1.8264, - "step": 450 - }, - { - "epoch": 0.03087902285047691, - "eval_accuracy": 0.5867487167473362, - "eval_loss": 1.815908670425415, - "eval_runtime": 2195.2403, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 450 - }, - { - "epoch": 0.03156522335826528, - "grad_norm": 1.2208211421966553, - "learning_rate": 4.199451052150046e-05, - "loss": 1.8048, - "step": 460 - }, - { - "epoch": 0.03156522335826528, - "eval_accuracy": 0.5865989450039885, - "eval_loss": 1.8151869773864746, - "eval_runtime": 2196.2404, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 460 - }, - { - "epoch": 0.03225142386605366, - "grad_norm": 1.1229742765426636, - "learning_rate": 4.2909423604757546e-05, - "loss": 1.8115, - "step": 470 - }, - { - "epoch": 0.03225142386605366, - "eval_accuracy": 0.5854921082476188, - "eval_loss": 1.8196277618408203, - "eval_runtime": 2196.4836, - "eval_samples_per_second": 13.404, - "eval_steps_per_second": 1.676, - "step": 470 - }, - { - "epoch": 0.03293762437384204, - "grad_norm": 1.1489847898483276, - "learning_rate": 4.382433668801464e-05, - "loss": 1.8195, - "step": 480 - }, - { - "epoch": 0.03293762437384204, - "eval_accuracy": 0.5857865387109324, - "eval_loss": 1.8216103315353394, - "eval_runtime": 2195.3397, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 480 - }, - { - "epoch": 0.03362382488163041, - "grad_norm": 1.2842459678649902, - "learning_rate": 4.473924977127173e-05, - "loss": 1.7904, - "step": 490 - }, - { - "epoch": 0.03362382488163041, - "eval_accuracy": 0.5854642190291723, - "eval_loss": 1.823735237121582, - "eval_runtime": 2195.5222, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 490 - }, - { - "epoch": 0.03431002538941879, - "grad_norm": 1.1588455438613892, - "learning_rate": 4.565416285452882e-05, - "loss": 1.8076, - "step": 500 - }, - { - "epoch": 0.03431002538941879, - "eval_accuracy": 0.584858989787433, - "eval_loss": 1.82528555393219, - "eval_runtime": 2196.6782, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 500 - }, - { - "epoch": 0.03499622589720716, - "grad_norm": 1.0610827207565308, - "learning_rate": 4.656907593778591e-05, - "loss": 1.7858, - "step": 510 - }, - { - "epoch": 0.03499622589720716, - "eval_accuracy": 0.5846732608731596, - "eval_loss": 1.8272473812103271, - "eval_runtime": 2195.3692, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 510 - }, - { - "epoch": 0.03568242640499554, - "grad_norm": 1.1871790885925293, - "learning_rate": 4.7483989021043e-05, - "loss": 1.8204, - "step": 520 - }, - { - "epoch": 0.03568242640499554, - "eval_accuracy": 0.5842074113207517, - "eval_loss": 1.829712986946106, - "eval_runtime": 2195.2548, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 520 - }, - { - "epoch": 0.03636862691278391, - "grad_norm": 1.1551142930984497, - "learning_rate": 4.8398902104300094e-05, - "loss": 1.8692, - "step": 530 - }, - { - "epoch": 0.03636862691278391, - "eval_accuracy": 0.5840751699432846, - "eval_loss": 1.8313241004943848, - "eval_runtime": 2196.5372, - "eval_samples_per_second": 13.404, - "eval_steps_per_second": 1.676, - "step": 530 - }, - { - "epoch": 0.03705482742057229, - "grad_norm": 1.1456727981567383, - "learning_rate": 4.931381518755718e-05, - "loss": 1.8427, - "step": 540 - }, - { - "epoch": 0.03705482742057229, - "eval_accuracy": 0.5839454850775084, - "eval_loss": 1.8317539691925049, - "eval_runtime": 2197.0175, - "eval_samples_per_second": 13.401, - "eval_steps_per_second": 1.675, - "step": 540 - }, - { - "epoch": 0.03774102792836067, - "grad_norm": 1.045682668685913, - "learning_rate": 5.0228728270814274e-05, - "loss": 1.8425, - "step": 550 - }, - { - "epoch": 0.03774102792836067, - "eval_accuracy": 0.5831079120849495, - "eval_loss": 1.83717679977417, - "eval_runtime": 2195.5341, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 550 - }, - { - "epoch": 0.03842722843614904, - "grad_norm": 1.0730020999908447, - "learning_rate": 5.114364135407137e-05, - "loss": 1.7764, - "step": 560 - }, - { - "epoch": 0.03842722843614904, - "eval_accuracy": 0.5835356463721473, - "eval_loss": 1.835798978805542, - "eval_runtime": 2195.2526, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 560 - }, - { - "epoch": 0.03911342894393742, - "grad_norm": 1.0931340456008911, - "learning_rate": 5.2058554437328455e-05, - "loss": 1.8409, - "step": 570 - }, - { - "epoch": 0.03911342894393742, - "eval_accuracy": 0.5838007599546415, - "eval_loss": 1.8357644081115723, - "eval_runtime": 2196.814, - "eval_samples_per_second": 13.402, - "eval_steps_per_second": 1.676, - "step": 570 - }, - { - "epoch": 0.03979962945172579, - "grad_norm": 1.0866916179656982, - "learning_rate": 5.297346752058555e-05, - "loss": 1.8691, - "step": 580 - }, - { - "epoch": 0.03979962945172579, - "eval_accuracy": 0.5829323428145388, - "eval_loss": 1.8371539115905762, - "eval_runtime": 2195.2154, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 580 - }, - { - "epoch": 0.04048582995951417, - "grad_norm": 1.0397108793258667, - "learning_rate": 5.3888380603842635e-05, - "loss": 1.8297, - "step": 590 - }, - { - "epoch": 0.04048582995951417, - "eval_accuracy": 0.582625661015979, - "eval_loss": 1.8405317068099976, - "eval_runtime": 2196.6833, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 590 - }, - { - "epoch": 0.04117203046730255, - "grad_norm": 1.045896291732788, - "learning_rate": 5.4803293687099735e-05, - "loss": 1.846, - "step": 600 - }, - { - "epoch": 0.04117203046730255, - "eval_accuracy": 0.582842034869093, - "eval_loss": 1.83939790725708, - "eval_runtime": 2195.4627, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 600 - }, - { - "epoch": 0.04185823097509092, - "grad_norm": 1.009037733078003, - "learning_rate": 5.5718206770356815e-05, - "loss": 1.8152, - "step": 610 - }, - { - "epoch": 0.04185823097509092, - "eval_accuracy": 0.5820976251467255, - "eval_loss": 1.8433318138122559, - "eval_runtime": 2195.2269, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 610 - }, - { - "epoch": 0.0425444314828793, - "grad_norm": 1.0480693578720093, - "learning_rate": 5.6633119853613916e-05, - "loss": 1.8718, - "step": 620 - }, - { - "epoch": 0.0425444314828793, - "eval_accuracy": 0.5822570585121779, - "eval_loss": 1.8436963558197021, - "eval_runtime": 2195.2658, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 620 - }, - { - "epoch": 0.04323063199066767, - "grad_norm": 1.1000436544418335, - "learning_rate": 5.7548032936870996e-05, - "loss": 1.8646, - "step": 630 - }, - { - "epoch": 0.04323063199066767, - "eval_accuracy": 0.5819806564365029, - "eval_loss": 1.8459354639053345, - "eval_runtime": 2195.2676, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 630 - }, - { - "epoch": 0.04391683249845605, - "grad_norm": 1.154166579246521, - "learning_rate": 5.8462946020128096e-05, - "loss": 1.8593, - "step": 640 - }, - { - "epoch": 0.04391683249845605, - "eval_accuracy": 0.582069735928279, - "eval_loss": 1.847158670425415, - "eval_runtime": 2196.3163, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 640 - }, - { - "epoch": 0.04460303300624442, - "grad_norm": 1.0694483518600464, - "learning_rate": 5.937785910338518e-05, - "loss": 1.8511, - "step": 650 - }, - { - "epoch": 0.04460303300624442, - "eval_accuracy": 0.5816468822543094, - "eval_loss": 1.8503717184066772, - "eval_runtime": 2195.3338, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 650 - }, - { - "epoch": 0.0452892335140328, - "grad_norm": 1.0069674253463745, - "learning_rate": 6.029277218664228e-05, - "loss": 1.8305, - "step": 660 - }, - { - "epoch": 0.0452892335140328, - "eval_accuracy": 0.5812435178318018, - "eval_loss": 1.851124882698059, - "eval_runtime": 2196.3471, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 660 - }, - { - "epoch": 0.04597543402182118, - "grad_norm": 0.93955397605896, - "learning_rate": 6.120768526989936e-05, - "loss": 1.8284, - "step": 670 - }, - { - "epoch": 0.04597543402182118, - "eval_accuracy": 0.5811864777397887, - "eval_loss": 1.8510762453079224, - "eval_runtime": 2196.11, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 670 - }, - { - "epoch": 0.04666163452960955, - "grad_norm": 0.9924960732460022, - "learning_rate": 6.212259835315646e-05, - "loss": 1.8664, - "step": 680 - }, - { - "epoch": 0.04666163452960955, - "eval_accuracy": 0.5802622489613424, - "eval_loss": 1.854859709739685, - "eval_runtime": 2196.6759, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 680 - }, - { - "epoch": 0.04734783503739793, - "grad_norm": 1.0502827167510986, - "learning_rate": 6.303751143641354e-05, - "loss": 1.8518, - "step": 690 - }, - { - "epoch": 0.04734783503739793, - "eval_accuracy": 0.5797041657793579, - "eval_loss": 1.8587929010391235, - "eval_runtime": 2195.6371, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.677, - "step": 690 - }, - { - "epoch": 0.0480340355451863, - "grad_norm": 0.9641822576522827, - "learning_rate": 6.395242451967064e-05, - "loss": 1.8469, - "step": 700 - }, - { - "epoch": 0.0480340355451863, - "eval_accuracy": 0.5796691714504977, - "eval_loss": 1.859383225440979, - "eval_runtime": 2195.5862, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 700 - }, - { - "epoch": 0.04872023605297468, - "grad_norm": 1.0197440385818481, - "learning_rate": 6.486733760292772e-05, - "loss": 1.871, - "step": 710 - }, - { - "epoch": 0.04872023605297468, - "eval_accuracy": 0.57873611108621, - "eval_loss": 1.8629941940307617, - "eval_runtime": 2195.7373, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 710 - }, - { - "epoch": 0.04940643656076305, - "grad_norm": 1.0116243362426758, - "learning_rate": 6.578225068618482e-05, - "loss": 1.8012, - "step": 720 - }, - { - "epoch": 0.04940643656076305, - "eval_accuracy": 0.5800620774160878, - "eval_loss": 1.8581833839416504, - "eval_runtime": 2195.6346, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.677, - "step": 720 - }, - { - "epoch": 0.05009263706855143, - "grad_norm": 1.015205979347229, - "learning_rate": 6.669716376944191e-05, - "loss": 1.8966, - "step": 730 - }, - { - "epoch": 0.05009263706855143, - "eval_accuracy": 0.5792581042914667, - "eval_loss": 1.8612667322158813, - "eval_runtime": 2195.4078, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 730 - }, - { - "epoch": 0.05077883757633981, - "grad_norm": 1.031814694404602, - "learning_rate": 6.7612076852699e-05, - "loss": 1.8505, - "step": 740 - }, - { - "epoch": 0.05077883757633981, - "eval_accuracy": 0.5786211344630193, - "eval_loss": 1.8662667274475098, - "eval_runtime": 2195.1361, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 740 - }, - { - "epoch": 0.05146503808412818, - "grad_norm": 0.9193501472473145, - "learning_rate": 6.852698993595609e-05, - "loss": 1.8415, - "step": 750 - }, - { - "epoch": 0.05146503808412818, - "eval_accuracy": 0.5790159329112897, - "eval_loss": 1.8643760681152344, - "eval_runtime": 2196.5278, - "eval_samples_per_second": 13.404, - "eval_steps_per_second": 1.676, - "step": 750 - }, - { - "epoch": 0.05215123859191656, - "grad_norm": 0.9670405387878418, - "learning_rate": 6.944190301921317e-05, - "loss": 1.9061, - "step": 760 - }, - { - "epoch": 0.05215123859191656, - "eval_accuracy": 0.5785704026466072, - "eval_loss": 1.8654682636260986, - "eval_runtime": 2196.4101, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 760 - }, - { - "epoch": 0.05283743909970493, - "grad_norm": 0.9446126222610474, - "learning_rate": 7.035681610247027e-05, - "loss": 1.8452, - "step": 770 - }, - { - "epoch": 0.05283743909970493, - "eval_accuracy": 0.5786606109877013, - "eval_loss": 1.8661186695098877, - "eval_runtime": 2196.1052, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 770 - }, - { - "epoch": 0.05352363960749331, - "grad_norm": 0.9109969139099121, - "learning_rate": 7.127172918572736e-05, - "loss": 1.8865, - "step": 780 - }, - { - "epoch": 0.05352363960749331, - "eval_accuracy": 0.5784766749517566, - "eval_loss": 1.8670191764831543, - "eval_runtime": 2196.1816, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 780 - }, - { - "epoch": 0.05420984011528168, - "grad_norm": 1.0019862651824951, - "learning_rate": 7.218664226898445e-05, - "loss": 1.8181, - "step": 790 - }, - { - "epoch": 0.05420984011528168, - "eval_accuracy": 0.5783223214082356, - "eval_loss": 1.8682514429092407, - "eval_runtime": 2196.2402, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 790 - }, - { - "epoch": 0.05489604062307006, - "grad_norm": 0.9845278859138489, - "learning_rate": 7.310155535224153e-05, - "loss": 1.8274, - "step": 800 - }, - { - "epoch": 0.05489604062307006, - "eval_accuracy": 0.5783366312334146, - "eval_loss": 1.8687405586242676, - "eval_runtime": 2197.5837, - "eval_samples_per_second": 13.397, - "eval_steps_per_second": 1.675, - "step": 800 - }, - { - "epoch": 0.05558224113085844, - "grad_norm": 0.9345710873603821, - "learning_rate": 7.401646843549863e-05, - "loss": 1.8175, - "step": 810 - }, - { - "epoch": 0.05558224113085844, - "eval_accuracy": 0.578313921441251, - "eval_loss": 1.870363712310791, - "eval_runtime": 2195.3953, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 810 - }, - { - "epoch": 0.05626844163864681, - "grad_norm": 0.9270123839378357, - "learning_rate": 7.493138151875572e-05, - "loss": 1.8407, - "step": 820 - }, - { - "epoch": 0.05626844163864681, - "eval_accuracy": 0.5773186415586673, - "eval_loss": 1.8754173517227173, - "eval_runtime": 2197.5914, - "eval_samples_per_second": 13.397, - "eval_steps_per_second": 1.675, - "step": 820 - }, - { - "epoch": 0.05695464214643519, - "grad_norm": 0.9666882753372192, - "learning_rate": 7.584629460201281e-05, - "loss": 1.8148, - "step": 830 - }, - { - "epoch": 0.05695464214643519, - "eval_accuracy": 0.5775507196978827, - "eval_loss": 1.8739426136016846, - "eval_runtime": 2198.1195, - "eval_samples_per_second": 13.394, - "eval_steps_per_second": 1.675, - "step": 830 - }, - { - "epoch": 0.05764084265422356, - "grad_norm": 0.9308087229728699, - "learning_rate": 7.67612076852699e-05, - "loss": 1.8673, - "step": 840 - }, - { - "epoch": 0.05764084265422356, - "eval_accuracy": 0.5774692101368278, - "eval_loss": 1.8749210834503174, - "eval_runtime": 2197.1707, - "eval_samples_per_second": 13.4, - "eval_steps_per_second": 1.675, - "step": 840 - }, - { - "epoch": 0.05832704316201194, - "grad_norm": 1.0570358037948608, - "learning_rate": 7.7676120768527e-05, - "loss": 1.8628, - "step": 850 - }, - { - "epoch": 0.05832704316201194, - "eval_accuracy": 0.577259841789776, - "eval_loss": 1.87544846534729, - "eval_runtime": 2196.2567, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 850 - }, - { - "epoch": 0.05901324366980031, - "grad_norm": 0.9226791262626648, - "learning_rate": 7.859103385178408e-05, - "loss": 1.8726, - "step": 860 - }, - { - "epoch": 0.05901324366980031, - "eval_accuracy": 0.5769537908187763, - "eval_loss": 1.8771995306015015, - "eval_runtime": 2196.3678, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 860 - }, - { - "epoch": 0.05969944417758869, - "grad_norm": 0.8840520977973938, - "learning_rate": 7.950594693504118e-05, - "loss": 1.8312, - "step": 870 - }, - { - "epoch": 0.05969944417758869, - "eval_accuracy": 0.576975570970325, - "eval_loss": 1.8782720565795898, - "eval_runtime": 2197.3135, - "eval_samples_per_second": 13.399, - "eval_steps_per_second": 1.675, - "step": 870 - }, - { - "epoch": 0.06038564468537707, - "grad_norm": 0.9905896782875061, - "learning_rate": 8.042086001829825e-05, - "loss": 1.9177, - "step": 880 - }, - { - "epoch": 0.06038564468537707, - "eval_accuracy": 0.5767803464411996, - "eval_loss": 1.877514123916626, - "eval_runtime": 2195.3275, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 880 - }, - { - "epoch": 0.06107184519316544, - "grad_norm": 1.012568712234497, - "learning_rate": 8.133577310155535e-05, - "loss": 1.9192, - "step": 890 - }, - { - "epoch": 0.06107184519316544, - "eval_accuracy": 0.5767326359567858, - "eval_loss": 1.8775851726531982, - "eval_runtime": 2196.0616, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 890 - }, - { - "epoch": 0.06175804570095382, - "grad_norm": 0.9244153499603271, - "learning_rate": 8.225068618481244e-05, - "loss": 1.8696, - "step": 900 - }, - { - "epoch": 0.06175804570095382, - "eval_accuracy": 0.5768811460450133, - "eval_loss": 1.8783646821975708, - "eval_runtime": 2195.2053, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 900 - }, - { - "epoch": 0.06244424620874219, - "grad_norm": 0.8586702346801758, - "learning_rate": 8.316559926806954e-05, - "loss": 1.902, - "step": 910 - }, - { - "epoch": 0.06244424620874219, - "eval_accuracy": 0.5765674255389409, - "eval_loss": 1.8812183141708374, - "eval_runtime": 2196.1137, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 910 - }, - { - "epoch": 0.06313044671653056, - "grad_norm": 0.9296178221702576, - "learning_rate": 8.408051235132663e-05, - "loss": 1.8953, - "step": 920 - }, - { - "epoch": 0.06313044671653056, - "eval_accuracy": 0.5759858689314306, - "eval_loss": 1.882770299911499, - "eval_runtime": 2196.7044, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 920 - }, - { - "epoch": 0.06381664722431894, - "grad_norm": 0.8966830968856812, - "learning_rate": 8.499542543458372e-05, - "loss": 1.8931, - "step": 930 - }, - { - "epoch": 0.06381664722431894, - "eval_accuracy": 0.5755770926724864, - "eval_loss": 1.8848005533218384, - "eval_runtime": 2195.3041, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 930 - }, - { - "epoch": 0.06450284773210732, - "grad_norm": 1.0561726093292236, - "learning_rate": 8.59103385178408e-05, - "loss": 1.8726, - "step": 940 - }, - { - "epoch": 0.06450284773210732, - "eval_accuracy": 0.5759230185855744, - "eval_loss": 1.8830986022949219, - "eval_runtime": 2196.3236, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 940 - }, - { - "epoch": 0.0651890482398957, - "grad_norm": 0.8631952404975891, - "learning_rate": 8.68252516010979e-05, - "loss": 1.8391, - "step": 950 - }, - { - "epoch": 0.0651890482398957, - "eval_accuracy": 0.5761208660292918, - "eval_loss": 1.8824081420898438, - "eval_runtime": 2195.5043, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 950 - }, - { - "epoch": 0.06587524874768408, - "grad_norm": 0.8912535905838013, - "learning_rate": 8.774016468435499e-05, - "loss": 1.8492, - "step": 960 - }, - { - "epoch": 0.06587524874768408, - "eval_accuracy": 0.575300856604064, - "eval_loss": 1.8885464668273926, - "eval_runtime": 2195.4665, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 960 - }, - { - "epoch": 0.06656144925547244, - "grad_norm": 0.941037654876709, - "learning_rate": 8.865507776761208e-05, - "loss": 1.9497, - "step": 970 - }, - { - "epoch": 0.06656144925547244, - "eval_accuracy": 0.5762686456856077, - "eval_loss": 1.8846741914749146, - "eval_runtime": 2195.3271, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 970 - }, - { - "epoch": 0.06724764976326082, - "grad_norm": 0.9157822728157043, - "learning_rate": 8.956999085086916e-05, - "loss": 1.8173, - "step": 980 - }, - { - "epoch": 0.06724764976326082, - "eval_accuracy": 0.5756494718346451, - "eval_loss": 1.8873111009597778, - "eval_runtime": 2195.0363, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 980 - }, - { - "epoch": 0.0679338502710492, - "grad_norm": 0.7956060767173767, - "learning_rate": 9.048490393412626e-05, - "loss": 1.8227, - "step": 990 - }, - { - "epoch": 0.0679338502710492, - "eval_accuracy": 0.5754663658349637, - "eval_loss": 1.8905543088912964, - "eval_runtime": 2196.6945, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 990 - }, - { - "epoch": 0.06862005077883758, - "grad_norm": 0.8911742568016052, - "learning_rate": 9.139981701738335e-05, - "loss": 1.8767, - "step": 1000 - }, - { - "epoch": 0.06862005077883758, - "eval_accuracy": 0.5748883285812098, - "eval_loss": 1.8892529010772705, - "eval_runtime": 2195.3255, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1000 - }, - { - "epoch": 0.06930625128662596, - "grad_norm": 0.8083513379096985, - "learning_rate": 9.231473010064045e-05, - "loss": 1.9234, - "step": 1010 - }, - { - "epoch": 0.06930625128662596, - "eval_accuracy": 0.5748815222838508, - "eval_loss": 1.8903882503509521, - "eval_runtime": 2195.4835, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1010 - }, - { - "epoch": 0.06999245179441432, - "grad_norm": 0.8500548005104065, - "learning_rate": 9.322964318389752e-05, - "loss": 1.8992, - "step": 1020 - }, - { - "epoch": 0.06999245179441432, - "eval_accuracy": 0.5743459496853266, - "eval_loss": 1.8903559446334839, - "eval_runtime": 2195.1026, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 1020 - }, - { - "epoch": 0.0706786523022027, - "grad_norm": 0.8529781699180603, - "learning_rate": 9.414455626715462e-05, - "loss": 1.9097, - "step": 1030 - }, - { - "epoch": 0.0706786523022027, - "eval_accuracy": 0.574473210845214, - "eval_loss": 1.8921775817871094, - "eval_runtime": 2196.6371, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 1030 - }, - { - "epoch": 0.07136485280999108, - "grad_norm": 0.985618531703949, - "learning_rate": 9.505946935041171e-05, - "loss": 1.8252, - "step": 1040 - }, - { - "epoch": 0.07136485280999108, - "eval_accuracy": 0.5746204260768708, - "eval_loss": 1.8922775983810425, - "eval_runtime": 2195.2982, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1040 - }, - { - "epoch": 0.07205105331777946, - "grad_norm": 0.8113195896148682, - "learning_rate": 9.597438243366881e-05, - "loss": 1.8873, - "step": 1050 - }, - { - "epoch": 0.07205105331777946, - "eval_accuracy": 0.5742100893497516, - "eval_loss": 1.8929826021194458, - "eval_runtime": 2195.2211, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 1050 - }, - { - "epoch": 0.07273725382556782, - "grad_norm": 0.848109245300293, - "learning_rate": 9.68892955169259e-05, - "loss": 1.8595, - "step": 1060 - }, - { - "epoch": 0.07273725382556782, - "eval_accuracy": 0.5738744890877788, - "eval_loss": 1.896013855934143, - "eval_runtime": 2195.349, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1060 - }, - { - "epoch": 0.0734234543333562, - "grad_norm": 0.8576720356941223, - "learning_rate": 9.780420860018299e-05, - "loss": 1.8415, - "step": 1070 - }, - { - "epoch": 0.0734234543333562, - "eval_accuracy": 0.573847263898343, - "eval_loss": 1.896903395652771, - "eval_runtime": 2195.4292, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1070 - }, - { - "epoch": 0.07410965484114458, - "grad_norm": 0.8684922456741333, - "learning_rate": 9.871912168344007e-05, - "loss": 1.8104, - "step": 1080 - }, - { - "epoch": 0.07410965484114458, - "eval_accuracy": 0.5738449397968058, - "eval_loss": 1.895707607269287, - "eval_runtime": 2195.3809, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1080 - }, - { - "epoch": 0.07479585534893296, - "grad_norm": 0.8526385426521301, - "learning_rate": 9.963403476669717e-05, - "loss": 1.8955, - "step": 1090 - }, - { - "epoch": 0.07479585534893296, - "eval_accuracy": 0.5738943767566472, - "eval_loss": 1.895605444908142, - "eval_runtime": 2195.5152, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1090 - }, - { - "epoch": 0.07548205585672134, - "grad_norm": 0.7912049889564514, - "learning_rate": 0.00010054894784995426, - "loss": 1.8411, - "step": 1100 - }, - { - "epoch": 0.07548205585672134, - "eval_accuracy": 0.5737731582607566, - "eval_loss": 1.8975555896759033, - "eval_runtime": 2196.2407, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 1100 - }, - { - "epoch": 0.0761682563645097, - "grad_norm": 0.8417245149612427, - "learning_rate": 0.00010146386093321135, - "loss": 1.8934, - "step": 1110 - }, - { - "epoch": 0.0761682563645097, - "eval_accuracy": 0.5735296588225584, - "eval_loss": 1.8995074033737183, - "eval_runtime": 2195.4727, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1110 - }, - { - "epoch": 0.07685445687229808, - "grad_norm": 0.8422715663909912, - "learning_rate": 0.00010237877401646845, - "loss": 1.872, - "step": 1120 - }, - { - "epoch": 0.07685445687229808, - "eval_accuracy": 0.5736483208067581, - "eval_loss": 1.8992631435394287, - "eval_runtime": 2195.5393, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1120 - }, - { - "epoch": 0.07754065738008646, - "grad_norm": 0.924197793006897, - "learning_rate": 0.00010329368709972553, - "loss": 1.8788, - "step": 1130 - }, - { - "epoch": 0.07754065738008646, - "eval_accuracy": 0.573535568680753, - "eval_loss": 1.8997005224227905, - "eval_runtime": 2195.1245, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 1130 - }, - { - "epoch": 0.07822685788787484, - "grad_norm": 0.8739652037620544, - "learning_rate": 0.00010420860018298262, - "loss": 1.8611, - "step": 1140 - }, - { - "epoch": 0.07822685788787484, - "eval_accuracy": 0.5737993874066766, - "eval_loss": 1.897829532623291, - "eval_runtime": 2194.8865, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 1140 - }, - { - "epoch": 0.07891305839566322, - "grad_norm": 0.9717639684677124, - "learning_rate": 0.00010512351326623971, - "loss": 1.9323, - "step": 1150 - }, - { - "epoch": 0.07891305839566322, - "eval_accuracy": 0.5726149588604147, - "eval_loss": 1.9035004377365112, - "eval_runtime": 2195.2968, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1150 - }, - { - "epoch": 0.07959925890345158, - "grad_norm": 0.9229555130004883, - "learning_rate": 0.00010603842634949681, - "loss": 1.892, - "step": 1160 - }, - { - "epoch": 0.07959925890345158, - "eval_accuracy": 0.5725945399683378, - "eval_loss": 1.903469204902649, - "eval_runtime": 2195.7477, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1160 - }, - { - "epoch": 0.08028545941123996, - "grad_norm": 0.8387541174888611, - "learning_rate": 0.0001069533394327539, - "loss": 1.8585, - "step": 1170 - }, - { - "epoch": 0.08028545941123996, - "eval_accuracy": 0.5729163616283399, - "eval_loss": 1.9036465883255005, - "eval_runtime": 2195.4236, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1170 - }, - { - "epoch": 0.08097165991902834, - "grad_norm": 0.8355444073677063, - "learning_rate": 0.00010786825251601098, - "loss": 1.8718, - "step": 1180 - }, - { - "epoch": 0.08097165991902834, - "eval_accuracy": 0.5726727293843396, - "eval_loss": 1.9039238691329956, - "eval_runtime": 2195.5829, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1180 - }, - { - "epoch": 0.08165786042681672, - "grad_norm": 0.8180125951766968, - "learning_rate": 0.00010878316559926807, - "loss": 1.9032, - "step": 1190 - }, - { - "epoch": 0.08165786042681672, - "eval_accuracy": 0.5721860957238989, - "eval_loss": 1.906183123588562, - "eval_runtime": 2195.35, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1190 - }, - { - "epoch": 0.0823440609346051, - "grad_norm": 0.7993365526199341, - "learning_rate": 0.00010969807868252517, - "loss": 1.8526, - "step": 1200 - }, - { - "epoch": 0.0823440609346051, - "eval_accuracy": 0.5722786613679808, - "eval_loss": 1.9059474468231201, - "eval_runtime": 2195.3016, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1200 - }, - { - "epoch": 0.08303026144239346, - "grad_norm": 1.0333908796310425, - "learning_rate": 0.00011061299176578226, - "loss": 1.8649, - "step": 1210 - }, - { - "epoch": 0.08303026144239346, - "eval_accuracy": 0.570262403680102, - "eval_loss": 1.922554850578308, - "eval_runtime": 2195.4851, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1210 - }, - { - "epoch": 0.08371646195018184, - "grad_norm": 1.1051403284072876, - "learning_rate": 0.00011143641354071363, - "loss": 1.9549, - "step": 1220 - }, - { - "epoch": 0.08371646195018184, - "eval_accuracy": 0.5712613025207935, - "eval_loss": 1.9148966073989868, - "eval_runtime": 2195.4835, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1220 - }, - { - "epoch": 0.08440266245797022, - "grad_norm": 0.8354151248931885, - "learning_rate": 0.00011235132662397072, - "loss": 1.9199, - "step": 1230 - }, - { - "epoch": 0.08440266245797022, - "eval_accuracy": 0.5723187355187723, - "eval_loss": 1.9058703184127808, - "eval_runtime": 2195.1846, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 1230 - }, - { - "epoch": 0.0850888629657586, - "grad_norm": 0.7311375737190247, - "learning_rate": 0.00011317474839890212, - "loss": 1.8916, - "step": 1240 - }, - { - "epoch": 0.0850888629657586, - "eval_accuracy": 0.572127262753557, - "eval_loss": 1.9079400300979614, - "eval_runtime": 2195.7032, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1240 - }, - { - "epoch": 0.08577506347354696, - "grad_norm": 0.8696947693824768, - "learning_rate": 0.00011408966148215921, - "loss": 1.9645, - "step": 1250 - }, - { - "epoch": 0.08577506347354696, - "eval_accuracy": 0.5722400480810126, - "eval_loss": 1.9090999364852905, - "eval_runtime": 2195.3879, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1250 - }, - { - "epoch": 0.08646126398133534, - "grad_norm": 0.9006031155586243, - "learning_rate": 0.0001150045745654163, - "loss": 1.919, - "step": 1260 - }, - { - "epoch": 0.08646126398133534, - "eval_accuracy": 0.5701970632254558, - "eval_loss": 1.9196306467056274, - "eval_runtime": 2195.6859, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1260 - }, - { - "epoch": 0.08714746448912372, - "grad_norm": 1.0175645351409912, - "learning_rate": 0.00011591948764867338, - "loss": 1.8906, - "step": 1270 - }, - { - "epoch": 0.08714746448912372, - "eval_accuracy": 0.571257019533675, - "eval_loss": 1.9163960218429565, - "eval_runtime": 2195.6701, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1270 - }, - { - "epoch": 0.0878336649969121, - "grad_norm": 0.8517598509788513, - "learning_rate": 0.00011683440073193048, - "loss": 1.9111, - "step": 1280 - }, - { - "epoch": 0.0878336649969121, - "eval_accuracy": 0.5712075161709325, - "eval_loss": 1.9151411056518555, - "eval_runtime": 2196.2166, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 1280 - }, - { - "epoch": 0.08851986550470048, - "grad_norm": 0.8764553070068359, - "learning_rate": 0.00011774931381518757, - "loss": 1.9597, - "step": 1290 - }, - { - "epoch": 0.08851986550470048, - "eval_accuracy": 0.5705109497387809, - "eval_loss": 1.9204518795013428, - "eval_runtime": 2196.4778, - "eval_samples_per_second": 13.404, - "eval_steps_per_second": 1.676, - "step": 1290 - }, - { - "epoch": 0.08920606601248884, - "grad_norm": 0.8551300764083862, - "learning_rate": 0.00011866422689844465, - "loss": 1.885, - "step": 1300 - }, - { - "epoch": 0.08920606601248884, - "eval_accuracy": 0.5686897173713243, - "eval_loss": 1.9303290843963623, - "eval_runtime": 2195.3977, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1300 - }, - { - "epoch": 0.08989226652027722, - "grad_norm": 0.8597418069839478, - "learning_rate": 0.00011957913998170174, - "loss": 1.9086, - "step": 1310 - }, - { - "epoch": 0.08989226652027722, - "eval_accuracy": 0.5694771229721302, - "eval_loss": 1.9275456666946411, - "eval_runtime": 2195.6823, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1310 - }, - { - "epoch": 0.0905784670280656, - "grad_norm": 1.0071135759353638, - "learning_rate": 0.00012049405306495884, - "loss": 1.9019, - "step": 1320 - }, - { - "epoch": 0.0905784670280656, - "eval_accuracy": 0.5700387587093215, - "eval_loss": 1.92341947555542, - "eval_runtime": 2195.6668, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1320 - }, - { - "epoch": 0.09126466753585398, - "grad_norm": 0.7982167601585388, - "learning_rate": 0.00012140896614821593, - "loss": 1.9072, - "step": 1330 - }, - { - "epoch": 0.09126466753585398, - "eval_accuracy": 0.5695152714387909, - "eval_loss": 1.9324274063110352, - "eval_runtime": 2196.4255, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 1330 - }, - { - "epoch": 0.09195086804364236, - "grad_norm": 0.8429548144340515, - "learning_rate": 0.000122323879231473, - "loss": 1.9473, - "step": 1340 - }, - { - "epoch": 0.09195086804364236, - "eval_accuracy": 0.5696431634262383, - "eval_loss": 1.923658013343811, - "eval_runtime": 2196.8488, - "eval_samples_per_second": 13.402, - "eval_steps_per_second": 1.676, - "step": 1340 - }, - { - "epoch": 0.09263706855143072, - "grad_norm": 0.8137506246566772, - "learning_rate": 0.0001232387923147301, - "loss": 1.9171, - "step": 1350 - }, - { - "epoch": 0.09263706855143072, - "eval_accuracy": 0.5698193635242091, - "eval_loss": 1.9207695722579956, - "eval_runtime": 2195.6978, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1350 - }, - { - "epoch": 0.0933232690592191, - "grad_norm": 0.7542211413383484, - "learning_rate": 0.0001241537053979872, - "loss": 1.9515, - "step": 1360 - }, - { - "epoch": 0.0933232690592191, - "eval_accuracy": 0.5703601819519173, - "eval_loss": 1.9201329946517944, - "eval_runtime": 2196.079, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 1360 - }, - { - "epoch": 0.09400946956700748, - "grad_norm": 0.7821772694587708, - "learning_rate": 0.0001250686184812443, - "loss": 1.9078, - "step": 1370 - }, - { - "epoch": 0.09400946956700748, - "eval_accuracy": 0.5695221109376003, - "eval_loss": 1.9211959838867188, - "eval_runtime": 2195.4801, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1370 - }, - { - "epoch": 0.09469567007479586, - "grad_norm": 0.7843822240829468, - "learning_rate": 0.00012598353156450138, - "loss": 1.9052, - "step": 1380 - }, - { - "epoch": 0.09469567007479586, - "eval_accuracy": 0.5696008647782611, - "eval_loss": 1.9253789186477661, - "eval_runtime": 2195.2622, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 1380 - }, - { - "epoch": 0.09538187058258422, - "grad_norm": 0.7415681481361389, - "learning_rate": 0.00012689844464775846, - "loss": 1.9027, - "step": 1390 - }, - { - "epoch": 0.09538187058258422, - "eval_accuracy": 0.5689813257113427, - "eval_loss": 1.9291355609893799, - "eval_runtime": 2196.3808, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 1390 - }, - { - "epoch": 0.0960680710903726, - "grad_norm": 0.7906999588012695, - "learning_rate": 0.00012781335773101558, - "loss": 1.9165, - "step": 1400 - }, - { - "epoch": 0.0960680710903726, - "eval_accuracy": 0.5686311168111361, - "eval_loss": 1.929186463356018, - "eval_runtime": 2195.3777, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1400 - }, - { - "epoch": 0.09675427159816098, - "grad_norm": 0.7665316462516785, - "learning_rate": 0.00012872827081427266, - "loss": 1.8921, - "step": 1410 - }, - { - "epoch": 0.09675427159816098, - "eval_accuracy": 0.5691787415362032, - "eval_loss": 1.9266636371612549, - "eval_runtime": 2196.5486, - "eval_samples_per_second": 13.404, - "eval_steps_per_second": 1.676, - "step": 1410 - }, - { - "epoch": 0.09744047210594936, - "grad_norm": 0.7428321242332458, - "learning_rate": 0.00012964318389752975, - "loss": 2.0197, - "step": 1420 - }, - { - "epoch": 0.09744047210594936, - "eval_accuracy": 0.5681328958444599, - "eval_loss": 1.9324249029159546, - "eval_runtime": 2199.1199, - "eval_samples_per_second": 13.388, - "eval_steps_per_second": 1.674, - "step": 1420 - }, - { - "epoch": 0.09812667261373774, - "grad_norm": 0.7564303874969482, - "learning_rate": 0.00013055809698078684, - "loss": 1.9049, - "step": 1430 - }, - { - "epoch": 0.09812667261373774, - "eval_accuracy": 0.5684988754336691, - "eval_loss": 1.9301478862762451, - "eval_runtime": 2196.3543, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 1430 - }, - { - "epoch": 0.0988128731215261, - "grad_norm": 0.9298701286315918, - "learning_rate": 0.00013147301006404392, - "loss": 1.9086, - "step": 1440 - }, - { - "epoch": 0.0988128731215261, - "eval_accuracy": 0.5689413179634523, - "eval_loss": 1.9311435222625732, - "eval_runtime": 2195.5784, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1440 - }, - { - "epoch": 0.09949907362931448, - "grad_norm": 0.8423255681991577, - "learning_rate": 0.000132387923147301, - "loss": 1.902, - "step": 1450 - }, - { - "epoch": 0.09949907362931448, - "eval_accuracy": 0.5683401725001284, - "eval_loss": 1.9306671619415283, - "eval_runtime": 2195.5481, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1450 - }, - { - "epoch": 0.10018527413710286, - "grad_norm": 0.8051897883415222, - "learning_rate": 0.0001333028362305581, - "loss": 1.9863, - "step": 1460 - }, - { - "epoch": 0.10018527413710286, - "eval_accuracy": 0.568742474476219, - "eval_loss": 1.9322450160980225, - "eval_runtime": 2196.6508, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 1460 - }, - { - "epoch": 0.10087147464489124, - "grad_norm": 0.826680600643158, - "learning_rate": 0.00013421774931381518, - "loss": 1.8864, - "step": 1470 - }, - { - "epoch": 0.10087147464489124, - "eval_accuracy": 0.5676466606014257, - "eval_loss": 1.9348758459091187, - "eval_runtime": 2196.1461, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 1470 - }, - { - "epoch": 0.10155767515267962, - "grad_norm": 0.8538419604301453, - "learning_rate": 0.00013513266239707227, - "loss": 1.8823, - "step": 1480 - }, - { - "epoch": 0.10155767515267962, - "eval_accuracy": 0.5668430858942111, - "eval_loss": 1.942825436592102, - "eval_runtime": 2195.3646, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1480 - }, - { - "epoch": 0.10224387566046798, - "grad_norm": 0.7539196014404297, - "learning_rate": 0.00013604757548032939, - "loss": 1.9582, - "step": 1490 - }, - { - "epoch": 0.10224387566046798, - "eval_accuracy": 0.5680551380473151, - "eval_loss": 1.9357633590698242, - "eval_runtime": 2195.6893, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 1490 - }, - { - "epoch": 0.10293007616825636, - "grad_norm": 0.8606800436973572, - "learning_rate": 0.00013696248856358647, - "loss": 1.9724, - "step": 1500 - }, - { - "epoch": 0.10293007616825636, - "eval_accuracy": 0.5682361191541625, - "eval_loss": 1.9344924688339233, - "eval_runtime": 2196.3487, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 1500 - }, - { - "epoch": 0.10361627667604474, - "grad_norm": 1.327166199684143, - "learning_rate": 0.00013787740164684356, - "loss": 1.8999, - "step": 1510 - }, - { - "epoch": 0.10361627667604474, - "eval_accuracy": 0.5677454681182075, - "eval_loss": 1.9374973773956299, - "eval_runtime": 2196.3215, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 1510 - }, - { - "epoch": 0.10430247718383312, - "grad_norm": 0.7882415056228638, - "learning_rate": 0.00013879231473010065, - "loss": 1.939, - "step": 1520 - }, - { - "epoch": 0.10430247718383312, - "eval_accuracy": 0.5680506226500428, - "eval_loss": 1.9378398656845093, - "eval_runtime": 2195.9406, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 1520 - }, - { - "epoch": 0.1049886776916215, - "grad_norm": 0.8347277045249939, - "learning_rate": 0.00013970722781335773, - "loss": 1.9144, - "step": 1530 - }, - { - "epoch": 0.1049886776916215, - "eval_accuracy": 0.566825787938484, - "eval_loss": 1.9458295106887817, - "eval_runtime": 2196.2125, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 1530 - }, - { - "epoch": 0.10567487819940986, - "grad_norm": 0.9527395963668823, - "learning_rate": 0.00014062214089661482, - "loss": 1.9098, - "step": 1540 - }, - { - "epoch": 0.10567487819940986, - "eval_accuracy": 0.5678965015166755, - "eval_loss": 1.940508484840393, - "eval_runtime": 2195.4239, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1540 - }, - { - "epoch": 0.10636107870719824, - "grad_norm": 0.8285815715789795, - "learning_rate": 0.0001415370539798719, - "loss": 1.9036, - "step": 1550 - }, - { - "epoch": 0.10636107870719824, - "eval_accuracy": 0.567786471909614, - "eval_loss": 1.9401248693466187, - "eval_runtime": 2196.2023, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 1550 - }, - { - "epoch": 0.10704727921498662, - "grad_norm": 0.9443630576133728, - "learning_rate": 0.000142451967063129, - "loss": 2.0052, - "step": 1560 - }, - { - "epoch": 0.10704727921498662, - "eval_accuracy": 0.5666123026115664, - "eval_loss": 1.9421296119689941, - "eval_runtime": 2196.6048, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 1560 - }, - { - "epoch": 0.107733479722775, - "grad_norm": 0.8364240527153015, - "learning_rate": 0.0001433668801463861, - "loss": 1.8997, - "step": 1570 - }, - { - "epoch": 0.107733479722775, - "eval_accuracy": 0.5669286460322308, - "eval_loss": 1.941907286643982, - "eval_runtime": 2193.9823, - "eval_samples_per_second": 13.419, - "eval_steps_per_second": 1.678, - "step": 1570 - }, - { - "epoch": 0.10841968023056336, - "grad_norm": 0.7851582765579224, - "learning_rate": 0.0001442817932296432, - "loss": 1.9388, - "step": 1580 - }, - { - "epoch": 0.10841968023056336, - "eval_accuracy": 0.5667885691124382, - "eval_loss": 1.9420804977416992, - "eval_runtime": 2193.8438, - "eval_samples_per_second": 13.42, - "eval_steps_per_second": 1.678, - "step": 1580 - }, - { - "epoch": 0.10910588073835174, - "grad_norm": 1.1334728002548218, - "learning_rate": 0.00014519670631290028, - "loss": 1.9436, - "step": 1590 - }, - { - "epoch": 0.10910588073835174, - "eval_accuracy": 0.5664039303080305, - "eval_loss": 1.9469659328460693, - "eval_runtime": 2194.723, - "eval_samples_per_second": 13.415, - "eval_steps_per_second": 1.677, - "step": 1590 - }, - { - "epoch": 0.10979208124614012, - "grad_norm": 3.6574079990386963, - "learning_rate": 0.00014611161939615737, - "loss": 1.9346, - "step": 1600 - }, - { - "epoch": 0.10979208124614012, - "eval_accuracy": 0.5647517265252299, - "eval_loss": 1.9564319849014282, - "eval_runtime": 2193.9968, - "eval_samples_per_second": 13.419, - "eval_steps_per_second": 1.678, - "step": 1600 - }, - { - "epoch": 0.1104782817539285, - "grad_norm": 0.9866987466812134, - "learning_rate": 0.00014702653247941448, - "loss": 1.9583, - "step": 1610 - }, - { - "epoch": 0.1104782817539285, - "eval_accuracy": 0.5648301815528358, - "eval_loss": 1.9578943252563477, - "eval_runtime": 2195.1234, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 1610 - }, - { - "epoch": 0.11116448226171688, - "grad_norm": 0.7200923562049866, - "learning_rate": 0.00014794144556267154, - "loss": 1.9934, - "step": 1620 - }, - { - "epoch": 0.11116448226171688, - "eval_accuracy": 0.5669039109515848, - "eval_loss": 1.94541335105896, - "eval_runtime": 2195.0872, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 1620 - }, - { - "epoch": 0.11185068276950524, - "grad_norm": 0.8922938108444214, - "learning_rate": 0.00014885635864592863, - "loss": 1.9672, - "step": 1630 - }, - { - "epoch": 0.11185068276950524, - "eval_accuracy": 0.5670405681219726, - "eval_loss": 1.943771243095398, - "eval_runtime": 2195.4513, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1630 - }, - { - "epoch": 0.11253688327729362, - "grad_norm": 0.8861325979232788, - "learning_rate": 0.00014977127172918572, - "loss": 1.9451, - "step": 1640 - }, - { - "epoch": 0.11253688327729362, - "eval_accuracy": 0.5672741403264618, - "eval_loss": 1.9424519538879395, - "eval_runtime": 2195.3731, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1640 - }, - { - "epoch": 0.113223083785082, - "grad_norm": 0.7567145824432373, - "learning_rate": 0.00015068618481244283, - "loss": 1.9691, - "step": 1650 - }, - { - "epoch": 0.113223083785082, - "eval_accuracy": 0.565372195232763, - "eval_loss": 1.9486464262008667, - "eval_runtime": 2195.4507, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1650 - }, - { - "epoch": 0.11390928429287038, - "grad_norm": 0.7195202708244324, - "learning_rate": 0.00015160109789569992, - "loss": 1.9507, - "step": 1660 - }, - { - "epoch": 0.11390928429287038, - "eval_accuracy": 0.5668069627160327, - "eval_loss": 1.94415283203125, - "eval_runtime": 2195.4023, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1660 - }, - { - "epoch": 0.11459548480065876, - "grad_norm": 0.7501398324966431, - "learning_rate": 0.000152516010978957, - "loss": 1.9337, - "step": 1670 - }, - { - "epoch": 0.11459548480065876, - "eval_accuracy": 0.565814803769799, - "eval_loss": 1.9502369165420532, - "eval_runtime": 2195.5008, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 1670 - }, - { - "epoch": 0.11528168530844712, - "grad_norm": 5.4457292556762695, - "learning_rate": 0.0001534309240622141, - "loss": 1.964, - "step": 1680 - }, - { - "epoch": 0.11528168530844712, - "eval_accuracy": 0.5647974781240622, - "eval_loss": 1.9561764001846313, - "eval_runtime": 2195.9745, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 1680 - }, - { - "epoch": 0.1159678858162355, - "grad_norm": 0.7148681282997131, - "learning_rate": 0.0001543458371454712, - "loss": 1.9242, - "step": 1690 - }, - { - "epoch": 0.1159678858162355, - "eval_accuracy": 0.5624542193498984, - "eval_loss": 1.985127329826355, - "eval_runtime": 2192.8067, - "eval_samples_per_second": 13.427, - "eval_steps_per_second": 1.679, - "step": 1690 - }, - { - "epoch": 0.11665408632402388, - "grad_norm": 0.8697217702865601, - "learning_rate": 0.0001552607502287283, - "loss": 1.9982, - "step": 1700 - }, - { - "epoch": 0.11665408632402388, - "eval_accuracy": 0.5627469897406854, - "eval_loss": 1.975067377090454, - "eval_runtime": 2192.9867, - "eval_samples_per_second": 13.426, - "eval_steps_per_second": 1.679, - "step": 1700 - }, - { - "epoch": 0.11734028683181226, - "grad_norm": 0.8448213338851929, - "learning_rate": 0.00015617566331198538, - "loss": 2.0176, - "step": 1710 - }, - { - "epoch": 0.11734028683181226, - "eval_accuracy": 0.5627390877954589, - "eval_loss": 1.9716612100601196, - "eval_runtime": 2193.5512, - "eval_samples_per_second": 13.422, - "eval_steps_per_second": 1.678, - "step": 1710 - }, - { - "epoch": 0.11802648733960062, - "grad_norm": 3.248958110809326, - "learning_rate": 0.00015709057639524244, - "loss": 1.9987, - "step": 1720 - }, - { - "epoch": 0.11802648733960062, - "eval_accuracy": 0.5634107531397118, - "eval_loss": 1.962895154953003, - "eval_runtime": 2193.484, - "eval_samples_per_second": 13.422, - "eval_steps_per_second": 1.678, - "step": 1720 - }, - { - "epoch": 0.118712687847389, - "grad_norm": 0.709474503993988, - "learning_rate": 0.00015800548947849955, - "loss": 1.9584, - "step": 1730 - }, - { - "epoch": 0.118712687847389, - "eval_accuracy": 0.5648748043023502, - "eval_loss": 1.955528736114502, - "eval_runtime": 2192.7873, - "eval_samples_per_second": 13.427, - "eval_steps_per_second": 1.679, - "step": 1730 - }, - { - "epoch": 0.11939888835517738, - "grad_norm": 0.7776058912277222, - "learning_rate": 0.00015892040256175664, - "loss": 1.9028, - "step": 1740 - }, - { - "epoch": 0.11939888835517738, - "eval_accuracy": 0.5636472470718479, - "eval_loss": 1.9644140005111694, - "eval_runtime": 2193.2656, - "eval_samples_per_second": 13.424, - "eval_steps_per_second": 1.678, - "step": 1740 - }, - { - "epoch": 0.12008508886296576, - "grad_norm": 0.9440386891365051, - "learning_rate": 0.00015983531564501372, - "loss": 1.9697, - "step": 1750 - }, - { - "epoch": 0.12008508886296576, - "eval_accuracy": 0.5645730695199196, - "eval_loss": 1.9598476886749268, - "eval_runtime": 2192.0993, - "eval_samples_per_second": 13.431, - "eval_steps_per_second": 1.679, - "step": 1750 - }, - { - "epoch": 0.12077128937075414, - "grad_norm": 0.7116687297821045, - "learning_rate": 0.0001607502287282708, - "loss": 1.971, - "step": 1760 - }, - { - "epoch": 0.12077128937075414, - "eval_accuracy": 0.5648594652322046, - "eval_loss": 1.957185983657837, - "eval_runtime": 2191.728, - "eval_samples_per_second": 13.433, - "eval_steps_per_second": 1.679, - "step": 1760 - }, - { - "epoch": 0.1214574898785425, - "grad_norm": 1.1787147521972656, - "learning_rate": 0.00016166514181152793, - "loss": 1.9428, - "step": 1770 - }, - { - "epoch": 0.1214574898785425, - "eval_accuracy": 0.5637377542259968, - "eval_loss": 1.9630035161972046, - "eval_runtime": 2191.7408, - "eval_samples_per_second": 13.433, - "eval_steps_per_second": 1.679, - "step": 1770 - }, - { - "epoch": 0.12214369038633088, - "grad_norm": 0.7775124907493591, - "learning_rate": 0.000162580054894785, - "loss": 1.9251, - "step": 1780 - }, - { - "epoch": 0.12214369038633088, - "eval_accuracy": 0.5636699568640114, - "eval_loss": 1.962775468826294, - "eval_runtime": 2191.7029, - "eval_samples_per_second": 13.433, - "eval_steps_per_second": 1.68, - "step": 1780 - }, - { - "epoch": 0.12282989089411926, - "grad_norm": 0.7894787788391113, - "learning_rate": 0.0001634949679780421, - "loss": 1.9413, - "step": 1790 - }, - { - "epoch": 0.12282989089411926, - "eval_accuracy": 0.5643811651358474, - "eval_loss": 1.9617409706115723, - "eval_runtime": 2191.672, - "eval_samples_per_second": 13.434, - "eval_steps_per_second": 1.68, - "step": 1790 - }, - { - "epoch": 0.12351609140190764, - "grad_norm": 0.7652885317802429, - "learning_rate": 0.00016440988106129919, - "loss": 1.9768, - "step": 1800 - }, - { - "epoch": 0.12351609140190764, - "eval_accuracy": 0.5625105622114503, - "eval_loss": 1.9699786901474, - "eval_runtime": 2192.2587, - "eval_samples_per_second": 13.43, - "eval_steps_per_second": 1.679, - "step": 1800 - }, - { - "epoch": 0.12420229190969602, - "grad_norm": 0.7860700488090515, - "learning_rate": 0.00016532479414455627, - "loss": 1.9325, - "step": 1810 - }, - { - "epoch": 0.12420229190969602, - "eval_accuracy": 0.5627916456916503, - "eval_loss": 1.96902596950531, - "eval_runtime": 2194.8971, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 1810 - }, - { - "epoch": 0.12488849241748438, - "grad_norm": 0.6445093154907227, - "learning_rate": 0.00016623970722781336, - "loss": 1.8726, - "step": 1820 - }, - { - "epoch": 0.12488849241748438, - "eval_accuracy": 0.5629212641545254, - "eval_loss": 1.9668747186660767, - "eval_runtime": 2195.3596, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1820 - }, - { - "epoch": 0.12557469292527276, - "grad_norm": 0.7446492314338684, - "learning_rate": 0.00016715462031107045, - "loss": 1.9153, - "step": 1830 - }, - { - "epoch": 0.12557469292527276, - "eval_accuracy": 0.5622602564758932, - "eval_loss": 1.9712408781051636, - "eval_runtime": 2195.4086, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1830 - }, - { - "epoch": 0.12626089343306113, - "grad_norm": 0.6903027296066284, - "learning_rate": 0.00016806953339432753, - "loss": 1.9477, - "step": 1840 - }, - { - "epoch": 0.12626089343306113, - "eval_accuracy": 0.5628142226780117, - "eval_loss": 1.9705018997192383, - "eval_runtime": 2195.3561, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1840 - }, - { - "epoch": 0.12694709394084952, - "grad_norm": 0.6859719753265381, - "learning_rate": 0.00016898444647758465, - "loss": 1.956, - "step": 1850 - }, - { - "epoch": 0.12694709394084952, - "eval_accuracy": 0.5628691378771908, - "eval_loss": 1.9665277004241943, - "eval_runtime": 2196.295, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 1850 - }, - { - "epoch": 0.12763329444863789, - "grad_norm": 0.6945796012878418, - "learning_rate": 0.00016989935956084173, - "loss": 1.9526, - "step": 1860 - }, - { - "epoch": 0.12763329444863789, - "eval_accuracy": 0.5613948274663382, - "eval_loss": 1.9775313138961792, - "eval_runtime": 2195.3271, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 1860 - }, - { - "epoch": 0.12831949495642628, - "grad_norm": 0.692680299282074, - "learning_rate": 0.00017081427264409882, - "loss": 1.9693, - "step": 1870 - }, - { - "epoch": 0.12831949495642628, - "eval_accuracy": 0.5631267811333156, - "eval_loss": 1.9690190553665161, - "eval_runtime": 2196.3618, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 1870 - }, - { - "epoch": 0.12900569546421464, - "grad_norm": 0.9683026671409607, - "learning_rate": 0.0001717291857273559, - "loss": 1.9857, - "step": 1880 - }, - { - "epoch": 0.12900569546421464, - "eval_accuracy": 0.5625857302954538, - "eval_loss": 1.9720834493637085, - "eval_runtime": 2194.9732, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 1880 - }, - { - "epoch": 0.129691895972003, - "grad_norm": 0.9663318991661072, - "learning_rate": 0.00017264409881061302, - "loss": 1.982, - "step": 1890 - }, - { - "epoch": 0.129691895972003, - "eval_accuracy": 0.5605411517702714, - "eval_loss": 1.980215311050415, - "eval_runtime": 2195.0705, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 1890 - }, - { - "epoch": 0.1303780964797914, - "grad_norm": 0.9997875690460205, - "learning_rate": 0.00017355901189387008, - "loss": 1.9939, - "step": 1900 - }, - { - "epoch": 0.1303780964797914, - "eval_accuracy": 0.5622947527829953, - "eval_loss": 1.9729753732681274, - "eval_runtime": 2195.0496, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 1900 - }, - { - "epoch": 0.13106429698757976, - "grad_norm": 1.0369981527328491, - "learning_rate": 0.00017447392497712717, - "loss": 1.9709, - "step": 1910 - }, - { - "epoch": 0.13106429698757976, - "eval_accuracy": 0.5614108305654945, - "eval_loss": 1.9774330854415894, - "eval_runtime": 2195.0243, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 1910 - }, - { - "epoch": 0.13175049749536816, - "grad_norm": 0.7212566137313843, - "learning_rate": 0.00017538883806038425, - "loss": 2.0167, - "step": 1920 - }, - { - "epoch": 0.13175049749536816, - "eval_accuracy": 0.5607967697379137, - "eval_loss": 1.984305739402771, - "eval_runtime": 2194.9072, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 1920 - }, - { - "epoch": 0.13243669800315652, - "grad_norm": 0.709562361240387, - "learning_rate": 0.00017630375114364137, - "loss": 1.9576, - "step": 1930 - }, - { - "epoch": 0.13243669800315652, - "eval_accuracy": 0.5579354355296557, - "eval_loss": 2.008889675140381, - "eval_runtime": 2195.0187, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 1930 - }, - { - "epoch": 0.1331228985109449, - "grad_norm": 5.758970260620117, - "learning_rate": 0.00017721866422689846, - "loss": 2.0477, - "step": 1940 - }, - { - "epoch": 0.1331228985109449, - "eval_accuracy": 0.5608241941360528, - "eval_loss": 1.9867639541625977, - "eval_runtime": 2193.1512, - "eval_samples_per_second": 13.425, - "eval_steps_per_second": 1.678, - "step": 1940 - }, - { - "epoch": 0.13380909901873328, - "grad_norm": 0.6865282654762268, - "learning_rate": 0.00017813357731015554, - "loss": 2.0013, - "step": 1950 - }, - { - "epoch": 0.13380909901873328, - "eval_accuracy": 0.5611121171150623, - "eval_loss": 1.9813686609268188, - "eval_runtime": 2191.7839, - "eval_samples_per_second": 13.433, - "eval_steps_per_second": 1.679, - "step": 1950 - }, - { - "epoch": 0.13449529952652164, - "grad_norm": 1.0006988048553467, - "learning_rate": 0.00017904849039341263, - "loss": 1.9151, - "step": 1960 - }, - { - "epoch": 0.13449529952652164, - "eval_accuracy": 0.5620602177364407, - "eval_loss": 1.9793142080307007, - "eval_runtime": 2191.7062, - "eval_samples_per_second": 13.433, - "eval_steps_per_second": 1.68, - "step": 1960 - }, - { - "epoch": 0.13518150003431004, - "grad_norm": 25.0100040435791, - "learning_rate": 0.00017996340347666974, - "loss": 2.0365, - "step": 1970 - }, - { - "epoch": 0.13518150003431004, - "eval_accuracy": 0.5600259980638242, - "eval_loss": 1.9854369163513184, - "eval_runtime": 2191.9684, - "eval_samples_per_second": 13.432, - "eval_steps_per_second": 1.679, - "step": 1970 - }, - { - "epoch": 0.1358677005420984, - "grad_norm": 0.6420261859893799, - "learning_rate": 0.00018087831655992683, - "loss": 2.0143, - "step": 1980 - }, - { - "epoch": 0.1358677005420984, - "eval_accuracy": 0.5613296530189448, - "eval_loss": 1.9770421981811523, - "eval_runtime": 2191.2212, - "eval_samples_per_second": 13.436, - "eval_steps_per_second": 1.68, - "step": 1980 - }, - { - "epoch": 0.13655390104988677, - "grad_norm": 0.7176883816719055, - "learning_rate": 0.00018179322964318392, - "loss": 1.9782, - "step": 1990 - }, - { - "epoch": 0.13655390104988677, - "eval_accuracy": 0.5617598442134819, - "eval_loss": 1.980700135231018, - "eval_runtime": 2191.7861, - "eval_samples_per_second": 13.433, - "eval_steps_per_second": 1.679, - "step": 1990 - }, - { - "epoch": 0.13724010155767516, - "grad_norm": 0.6866204738616943, - "learning_rate": 0.00018270814272644098, - "loss": 1.9449, - "step": 2000 - }, - { - "epoch": 0.13724010155767516, - "eval_accuracy": 0.5615476869445853, - "eval_loss": 1.9827600717544556, - "eval_runtime": 2193.8402, - "eval_samples_per_second": 13.42, - "eval_steps_per_second": 1.678, - "step": 2000 - }, - { - "epoch": 0.13792630206546352, - "grad_norm": 0.6652859449386597, - "learning_rate": 0.0001836230558096981, - "loss": 1.9812, - "step": 2010 - }, - { - "epoch": 0.13792630206546352, - "eval_accuracy": 0.5610583639666517, - "eval_loss": 1.9824488162994385, - "eval_runtime": 2198.5601, - "eval_samples_per_second": 13.391, - "eval_steps_per_second": 1.674, - "step": 2010 - }, - { - "epoch": 0.13861250257325192, - "grad_norm": 0.712452232837677, - "learning_rate": 0.00018453796889295518, - "loss": 2.0223, - "step": 2020 - }, - { - "epoch": 0.13861250257325192, - "eval_accuracy": 0.5598986372995852, - "eval_loss": 1.9870091676712036, - "eval_runtime": 2199.0783, - "eval_samples_per_second": 13.388, - "eval_steps_per_second": 1.674, - "step": 2020 - }, - { - "epoch": 0.13929870308104028, - "grad_norm": 0.6659247875213623, - "learning_rate": 0.00018545288197621226, - "loss": 2.0113, - "step": 2030 - }, - { - "epoch": 0.13929870308104028, - "eval_accuracy": 0.560590721535915, - "eval_loss": 1.983353614807129, - "eval_runtime": 2195.4289, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 2030 - }, - { - "epoch": 0.13998490358882865, - "grad_norm": 0.7091332674026489, - "learning_rate": 0.00018636779505946935, - "loss": 2.0452, - "step": 2040 - }, - { - "epoch": 0.13998490358882865, - "eval_accuracy": 0.5607067938069733, - "eval_loss": 1.9845926761627197, - "eval_runtime": 2196.3562, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 2040 - }, - { - "epoch": 0.14067110409661704, - "grad_norm": 0.6729557514190674, - "learning_rate": 0.00018728270814272646, - "loss": 1.9922, - "step": 2050 - }, - { - "epoch": 0.14067110409661704, - "eval_accuracy": 0.5608056013237551, - "eval_loss": 1.9840885400772095, - "eval_runtime": 2204.0084, - "eval_samples_per_second": 13.358, - "eval_steps_per_second": 1.67, - "step": 2050 - }, - { - "epoch": 0.1413573046044054, - "grad_norm": 0.6952201128005981, - "learning_rate": 0.00018819762122598355, - "loss": 1.9517, - "step": 2060 - }, - { - "epoch": 0.1413573046044054, - "eval_accuracy": 0.5593980922313718, - "eval_loss": 1.9891562461853027, - "eval_runtime": 2195.6228, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.677, - "step": 2060 - }, - { - "epoch": 0.1420435051121938, - "grad_norm": 0.7625614404678345, - "learning_rate": 0.00018911253430924064, - "loss": 1.9531, - "step": 2070 - }, - { - "epoch": 0.1420435051121938, - "eval_accuracy": 0.5598096242107102, - "eval_loss": 1.9889034032821655, - "eval_runtime": 2200.6199, - "eval_samples_per_second": 13.379, - "eval_steps_per_second": 1.673, - "step": 2070 - }, - { - "epoch": 0.14272970561998216, - "grad_norm": 0.7365880608558655, - "learning_rate": 0.00019002744739249773, - "loss": 1.9819, - "step": 2080 - }, - { - "epoch": 0.14272970561998216, - "eval_accuracy": 0.55980836255559, - "eval_loss": 1.9877856969833374, - "eval_runtime": 2198.0873, - "eval_samples_per_second": 13.394, - "eval_steps_per_second": 1.675, - "step": 2080 - }, - { - "epoch": 0.14341590612777053, - "grad_norm": 0.7005255818367004, - "learning_rate": 0.0001909423604757548, - "loss": 1.9977, - "step": 2090 - }, - { - "epoch": 0.14341590612777053, - "eval_accuracy": 0.5598768903494871, - "eval_loss": 1.9907660484313965, - "eval_runtime": 2194.9399, - "eval_samples_per_second": 13.414, - "eval_steps_per_second": 1.677, - "step": 2090 - }, - { - "epoch": 0.14410210663555892, - "grad_norm": 0.6998113393783569, - "learning_rate": 0.0001918572735590119, - "loss": 1.983, - "step": 2100 - }, - { - "epoch": 0.14410210663555892, - "eval_accuracy": 0.5593210980675892, - "eval_loss": 1.9914487600326538, - "eval_runtime": 2195.2308, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2100 - }, - { - "epoch": 0.14478830714334728, - "grad_norm": 1.0305057764053345, - "learning_rate": 0.00019277218664226899, - "loss": 2.0584, - "step": 2110 - }, - { - "epoch": 0.14478830714334728, - "eval_accuracy": 0.5584136028202109, - "eval_loss": 1.9976584911346436, - "eval_runtime": 2195.1184, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2110 - }, - { - "epoch": 0.14547450765113565, - "grad_norm": 0.6542972922325134, - "learning_rate": 0.00019368709972552607, - "loss": 1.9884, - "step": 2120 - }, - { - "epoch": 0.14547450765113565, - "eval_accuracy": 0.5590180684285879, - "eval_loss": 1.9945608377456665, - "eval_runtime": 2196.4694, - "eval_samples_per_second": 13.404, - "eval_steps_per_second": 1.676, - "step": 2120 - }, - { - "epoch": 0.14616070815892404, - "grad_norm": 0.6551515460014343, - "learning_rate": 0.0001946020128087832, - "loss": 1.9907, - "step": 2130 - }, - { - "epoch": 0.14616070815892404, - "eval_accuracy": 0.5585770535611776, - "eval_loss": 1.996436357498169, - "eval_runtime": 2195.8503, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 2130 - }, - { - "epoch": 0.1468469086667124, - "grad_norm": 0.7233726382255554, - "learning_rate": 0.00019551692589204027, - "loss": 1.8951, - "step": 2140 - }, - { - "epoch": 0.1468469086667124, - "eval_accuracy": 0.5590956602184801, - "eval_loss": 1.994140625, - "eval_runtime": 2198.6021, - "eval_samples_per_second": 13.391, - "eval_steps_per_second": 1.674, - "step": 2140 - }, - { - "epoch": 0.1475331091745008, - "grad_norm": 0.6772713661193848, - "learning_rate": 0.00019643183897529736, - "loss": 1.9488, - "step": 2150 - }, - { - "epoch": 0.1475331091745008, - "eval_accuracy": 0.558802757021891, - "eval_loss": 1.9978493452072144, - "eval_runtime": 2198.8115, - "eval_samples_per_second": 13.39, - "eval_steps_per_second": 1.674, - "step": 2150 - }, - { - "epoch": 0.14821930968228916, - "grad_norm": NaN, - "learning_rate": 0.00019734675205855445, - "loss": 2.0301, - "step": 2160 - }, - { - "epoch": 0.14821930968228916, - "eval_accuracy": 0.5583828582770187, - "eval_loss": 2.0025622844696045, - "eval_runtime": 2195.3189, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 2160 - }, - { - "epoch": 0.14890551019007753, - "grad_norm": 0.7715523838996887, - "learning_rate": 0.00019817017383348584, - "loss": 2.0215, - "step": 2170 - }, - { - "epoch": 0.14890551019007753, - "eval_accuracy": 0.5581144909523723, - "eval_loss": 2.007227659225464, - "eval_runtime": 2201.5716, - "eval_samples_per_second": 13.373, - "eval_steps_per_second": 1.672, - "step": 2170 - }, - { - "epoch": 0.14959171069786592, - "grad_norm": 0.7113980650901794, - "learning_rate": 0.00019908508691674292, - "loss": 1.9645, - "step": 2180 - }, - { - "epoch": 0.14959171069786592, - "eval_accuracy": 0.5589439627910016, - "eval_loss": 1.9947305917739868, - "eval_runtime": 2196.025, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 2180 - }, - { - "epoch": 0.15027791120565429, - "grad_norm": 0.6457582712173462, - "learning_rate": 0.0002, - "loss": 1.9758, - "step": 2190 - }, - { - "epoch": 0.15027791120565429, - "eval_accuracy": 0.5590743780886894, - "eval_loss": 1.9934245347976685, - "eval_runtime": 2196.674, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 2190 - }, - { - "epoch": 0.15096411171344268, - "grad_norm": 0.5914057493209839, - "learning_rate": 0.00019995184552042955, - "loss": 1.9647, - "step": 2200 - }, - { - "epoch": 0.15096411171344268, - "eval_accuracy": 0.5594601789438658, - "eval_loss": 1.9928838014602661, - "eval_runtime": 2200.2943, - "eval_samples_per_second": 13.381, - "eval_steps_per_second": 1.673, - "step": 2200 - }, - { - "epoch": 0.15165031222123104, - "grad_norm": 0.6294174194335938, - "learning_rate": 0.00019990369104085908, - "loss": 1.9752, - "step": 2210 - }, - { - "epoch": 0.15165031222123104, - "eval_accuracy": 0.5588302478229311, - "eval_loss": 1.9968582391738892, - "eval_runtime": 2195.8847, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 2210 - }, - { - "epoch": 0.1523365127290194, - "grad_norm": 0.6448546051979065, - "learning_rate": 0.00019985553656128862, - "loss": 1.9605, - "step": 2220 - }, - { - "epoch": 0.1523365127290194, - "eval_accuracy": 0.5588568089833563, - "eval_loss": 1.9973441362380981, - "eval_runtime": 2200.5474, - "eval_samples_per_second": 13.379, - "eval_steps_per_second": 1.673, - "step": 2220 - }, - { - "epoch": 0.1530227132368078, - "grad_norm": 0.8129657506942749, - "learning_rate": 0.00019980738208171816, - "loss": 2.0269, - "step": 2230 - }, - { - "epoch": 0.1530227132368078, - "eval_accuracy": 0.5584346525398479, - "eval_loss": 1.997729778289795, - "eval_runtime": 2196.0138, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 2230 - }, - { - "epoch": 0.15370891374459617, - "grad_norm": 0.5907541513442993, - "learning_rate": 0.0001997592276021477, - "loss": 1.9763, - "step": 2240 - }, - { - "epoch": 0.15370891374459617, - "eval_accuracy": 0.5583808329885362, - "eval_loss": 2.0000393390655518, - "eval_runtime": 2195.0414, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 2240 - }, - { - "epoch": 0.15439511425238456, - "grad_norm": 0.6172444820404053, - "learning_rate": 0.00019971107312257723, - "loss": 1.961, - "step": 2250 - }, - { - "epoch": 0.15439511425238456, - "eval_accuracy": 0.5588485086207234, - "eval_loss": 1.9951170682907104, - "eval_runtime": 2195.2092, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2250 - }, - { - "epoch": 0.15508131476017292, - "grad_norm": 1.708965539932251, - "learning_rate": 0.00019966291864300677, - "loss": 1.9676, - "step": 2260 - }, - { - "epoch": 0.15508131476017292, - "eval_accuracy": 0.5570037696262905, - "eval_loss": 2.0055410861968994, - "eval_runtime": 2195.1746, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2260 - }, - { - "epoch": 0.1557675152679613, - "grad_norm": 0.7572776079177856, - "learning_rate": 0.0001996147641634363, - "loss": 1.9993, - "step": 2270 - }, - { - "epoch": 0.1557675152679613, - "eval_accuracy": 0.5573890724597089, - "eval_loss": 1.9998122453689575, - "eval_runtime": 2195.7893, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 2270 - }, - { - "epoch": 0.15645371577574968, - "grad_norm": 0.6893654465675354, - "learning_rate": 0.00019956660968386585, - "loss": 1.949, - "step": 2280 - }, - { - "epoch": 0.15645371577574968, - "eval_accuracy": 0.5586115498682799, - "eval_loss": 1.9959582090377808, - "eval_runtime": 2195.3478, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 2280 - }, - { - "epoch": 0.15713991628353804, - "grad_norm": 0.6196193695068359, - "learning_rate": 0.00019951845520429538, - "loss": 2.0126, - "step": 2290 - }, - { - "epoch": 0.15713991628353804, - "eval_accuracy": 0.5588276249083391, - "eval_loss": 1.9977123737335205, - "eval_runtime": 2196.787, - "eval_samples_per_second": 13.402, - "eval_steps_per_second": 1.676, - "step": 2290 - }, - { - "epoch": 0.15782611679132644, - "grad_norm": 5.14415979385376, - "learning_rate": 0.00019947030072472492, - "loss": 2.0028, - "step": 2300 - }, - { - "epoch": 0.15782611679132644, - "eval_accuracy": 0.5582278739059375, - "eval_loss": 2.0048232078552246, - "eval_runtime": 2196.1102, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 2300 - }, - { - "epoch": 0.1585123172991148, - "grad_norm": 0.6127263307571411, - "learning_rate": 0.00019942214624515448, - "loss": 2.0287, - "step": 2310 - }, - { - "epoch": 0.1585123172991148, - "eval_accuracy": 0.5586611860368246, - "eval_loss": 1.995181679725647, - "eval_runtime": 2195.907, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 2310 - }, - { - "epoch": 0.15919851780690317, - "grad_norm": 0.6055991649627686, - "learning_rate": 0.000199373991765584, - "loss": 1.9834, - "step": 2320 - }, - { - "epoch": 0.15919851780690317, - "eval_accuracy": 0.5592904863301992, - "eval_loss": 1.9946677684783936, - "eval_runtime": 2195.4572, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 2320 - }, - { - "epoch": 0.15988471831469156, - "grad_norm": 0.6904438734054565, - "learning_rate": 0.00019932583728601356, - "loss": 1.98, - "step": 2330 - }, - { - "epoch": 0.15988471831469156, - "eval_accuracy": 0.5583102135032557, - "eval_loss": 1.9992380142211914, - "eval_runtime": 2195.7129, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 2330 - }, - { - "epoch": 0.16057091882247992, - "grad_norm": 0.7183199524879456, - "learning_rate": 0.00019927768280644307, - "loss": 2.0404, - "step": 2340 - }, - { - "epoch": 0.16057091882247992, - "eval_accuracy": 0.5585802741018792, - "eval_loss": 1.9996939897537231, - "eval_runtime": 2195.2538, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2340 - }, - { - "epoch": 0.16125711933026832, - "grad_norm": 0.6204356551170349, - "learning_rate": 0.00019922952832687263, - "loss": 1.9546, - "step": 2350 - }, - { - "epoch": 0.16125711933026832, - "eval_accuracy": 0.5581378647735465, - "eval_loss": 1.9987818002700806, - "eval_runtime": 2195.0806, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 2350 - }, - { - "epoch": 0.16194331983805668, - "grad_norm": 0.6199485063552856, - "learning_rate": 0.00019918137384730214, - "loss": 1.9747, - "step": 2360 - }, - { - "epoch": 0.16194331983805668, - "eval_accuracy": 0.558920688574179, - "eval_loss": 1.9964611530303955, - "eval_runtime": 2195.364, - "eval_samples_per_second": 13.411, - "eval_steps_per_second": 1.677, - "step": 2360 - }, - { - "epoch": 0.16262952034584505, - "grad_norm": 0.6137880086898804, - "learning_rate": 0.00019913321936773168, - "loss": 1.9203, - "step": 2370 - }, - { - "epoch": 0.16262952034584505, - "eval_accuracy": 0.5578602342442018, - "eval_loss": 1.9984673261642456, - "eval_runtime": 2195.4886, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 2370 - }, - { - "epoch": 0.16331572085363344, - "grad_norm": 0.6386215090751648, - "learning_rate": 0.00019908506488816125, - "loss": 2.0383, - "step": 2380 - }, - { - "epoch": 0.16331572085363344, - "eval_accuracy": 0.559068036611638, - "eval_loss": 1.9952830076217651, - "eval_runtime": 2195.2671, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2380 - }, - { - "epoch": 0.1640019213614218, - "grad_norm": 0.6018114686012268, - "learning_rate": 0.00019903691040859076, - "loss": 1.9676, - "step": 2390 - }, - { - "epoch": 0.1640019213614218, - "eval_accuracy": 0.5586635433398123, - "eval_loss": 1.9960980415344238, - "eval_runtime": 2197.0618, - "eval_samples_per_second": 13.401, - "eval_steps_per_second": 1.675, - "step": 2390 - }, - { - "epoch": 0.1646881218692102, - "grad_norm": 0.6331018805503845, - "learning_rate": 0.00019898875592902032, - "loss": 1.9598, - "step": 2400 - }, - { - "epoch": 0.1646881218692102, - "eval_accuracy": 0.5588065419872515, - "eval_loss": 1.9976948499679565, - "eval_runtime": 2195.2367, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2400 - }, - { - "epoch": 0.16537432237699856, - "grad_norm": 0.6709062457084656, - "learning_rate": 0.00019894060144944983, - "loss": 1.9985, - "step": 2410 - }, - { - "epoch": 0.16537432237699856, - "eval_accuracy": 0.5586920633858189, - "eval_loss": 1.9963314533233643, - "eval_runtime": 2195.5736, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 2410 - }, - { - "epoch": 0.16606052288478693, - "grad_norm": 0.6212249398231506, - "learning_rate": 0.0001988924469698794, - "loss": 1.9174, - "step": 2420 - }, - { - "epoch": 0.16606052288478693, - "eval_accuracy": 0.5585765223379692, - "eval_loss": 1.9955825805664062, - "eval_runtime": 2195.9048, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 2420 - }, - { - "epoch": 0.16674672339257532, - "grad_norm": 0.5908493399620056, - "learning_rate": 0.0001988442924903089, - "loss": 1.9768, - "step": 2430 - }, - { - "epoch": 0.16674672339257532, - "eval_accuracy": 0.5599598607743654, - "eval_loss": 1.996363878250122, - "eval_runtime": 2196.1829, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 2430 - }, - { - "epoch": 0.16743292390036368, - "grad_norm": 0.6591713428497314, - "learning_rate": 0.00019879613801073847, - "loss": 1.9607, - "step": 2440 - }, - { - "epoch": 0.16743292390036368, - "eval_accuracy": 0.5579004412007955, - "eval_loss": 1.9992504119873047, - "eval_runtime": 2195.5161, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 2440 - }, - { - "epoch": 0.16811912440815205, - "grad_norm": 0.6437056660652161, - "learning_rate": 0.00019874798353116798, - "loss": 2.0276, - "step": 2450 - }, - { - "epoch": 0.16811912440815205, - "eval_accuracy": 0.5586110186450713, - "eval_loss": 1.9952548742294312, - "eval_runtime": 2195.2496, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2450 - }, - { - "epoch": 0.16880532491594044, - "grad_norm": 0.564586877822876, - "learning_rate": 0.00019869982905159754, - "loss": 1.9414, - "step": 2460 - }, - { - "epoch": 0.16880532491594044, - "eval_accuracy": 0.5588329039389736, - "eval_loss": 1.9931731224060059, - "eval_runtime": 2195.1623, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2460 - }, - { - "epoch": 0.1694915254237288, - "grad_norm": 0.6013181805610657, - "learning_rate": 0.00019865167457202708, - "loss": 1.9564, - "step": 2470 - }, - { - "epoch": 0.1694915254237288, - "eval_accuracy": 0.5598632777547692, - "eval_loss": 1.9917162656784058, - "eval_runtime": 2196.6864, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 2470 - }, - { - "epoch": 0.1701777259315172, - "grad_norm": 0.6139594912528992, - "learning_rate": 0.00019860352009245662, - "loss": 1.9678, - "step": 2480 - }, - { - "epoch": 0.1701777259315172, - "eval_accuracy": 0.5594104431709697, - "eval_loss": 1.9919596910476685, - "eval_runtime": 2195.2781, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2480 - }, - { - "epoch": 0.17086392643930556, - "grad_norm": 0.6118595600128174, - "learning_rate": 0.00019855536561288616, - "loss": 1.9903, - "step": 2490 - }, - { - "epoch": 0.17086392643930556, - "eval_accuracy": 0.5595184142880981, - "eval_loss": 1.9930351972579956, - "eval_runtime": 2195.0637, - "eval_samples_per_second": 13.413, - "eval_steps_per_second": 1.677, - "step": 2490 - }, - { - "epoch": 0.17155012694709393, - "grad_norm": 0.6277731657028198, - "learning_rate": 0.0001985072111333157, - "loss": 1.9994, - "step": 2500 - }, - { - "epoch": 0.17155012694709393, - "eval_accuracy": 0.5604368328127014, - "eval_loss": 1.9892007112503052, - "eval_runtime": 2196.3916, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 2500 - }, - { - "epoch": 0.17223632745488232, - "grad_norm": 0.6022769808769226, - "learning_rate": 0.00019845905665374523, - "loss": 1.955, - "step": 2510 - }, - { - "epoch": 0.17223632745488232, - "eval_accuracy": 0.5598575671052778, - "eval_loss": 1.9900598526000977, - "eval_runtime": 2195.1324, - "eval_samples_per_second": 13.412, - "eval_steps_per_second": 1.677, - "step": 2510 - }, - { - "epoch": 0.17292252796267069, - "grad_norm": 0.7676475644111633, - "learning_rate": 0.00019841090217417477, - "loss": 1.9778, - "step": 2520 - }, - { - "epoch": 0.17292252796267069, - "eval_accuracy": 0.5599910369364145, - "eval_loss": 1.9912259578704834, - "eval_runtime": 2195.5014, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 2520 - }, - { - "epoch": 0.17360872847045908, - "grad_norm": 0.6049858331680298, - "learning_rate": 0.0001983627476946043, - "loss": 2.0018, - "step": 2530 - }, - { - "epoch": 0.17360872847045908, - "eval_accuracy": 0.5597790788762212, - "eval_loss": 1.9898487329483032, - "eval_runtime": 2196.3189, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 2530 - }, - { - "epoch": 0.17429492897824744, - "grad_norm": 0.597239077091217, - "learning_rate": 0.00019831459321503382, - "loss": 1.9675, - "step": 2540 - }, - { - "epoch": 0.17429492897824744, - "eval_accuracy": 0.5608272154680511, - "eval_loss": 1.987794041633606, - "eval_runtime": 2196.2383, - "eval_samples_per_second": 13.406, - "eval_steps_per_second": 1.676, - "step": 2540 - }, - { - "epoch": 0.1749811294860358, - "grad_norm": 0.6299741864204407, - "learning_rate": 0.00019826643873546338, - "loss": 2.0204, - "step": 2550 - }, - { - "epoch": 0.1749811294860358, - "eval_accuracy": 0.5596887709307754, - "eval_loss": 1.9905835390090942, - "eval_runtime": 2196.3707, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 2550 - }, - { - "epoch": 0.1756673299938242, - "grad_norm": 0.6488902568817139, - "learning_rate": 0.00019821828425589292, - "loss": 1.9863, - "step": 2560 - }, - { - "epoch": 0.1756673299938242, - "eval_accuracy": 0.5600538872822707, - "eval_loss": 1.9906458854675293, - "eval_runtime": 2195.6961, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 2560 - }, - { - "epoch": 0.17635353050161257, - "grad_norm": 0.6427273154258728, - "learning_rate": 0.00019817012977632246, - "loss": 1.9403, - "step": 2570 - }, - { - "epoch": 0.17635353050161257, - "eval_accuracy": 0.5603546260211854, - "eval_loss": 1.989209532737732, - "eval_runtime": 2196.6688, - "eval_samples_per_second": 13.403, - "eval_steps_per_second": 1.676, - "step": 2570 - }, - { - "epoch": 0.17703973100940096, - "grad_norm": 0.5979894399642944, - "learning_rate": 0.000198121975296752, - "loss": 1.9437, - "step": 2580 - }, - { - "epoch": 0.17703973100940096, - "eval_accuracy": 0.5593561587993505, - "eval_loss": 1.9925955533981323, - "eval_runtime": 2195.5236, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 2580 - }, - { - "epoch": 0.17772593151718932, - "grad_norm": 0.6633705496788025, - "learning_rate": 0.00019807382081718153, - "loss": 1.9804, - "step": 2590 - }, - { - "epoch": 0.17772593151718932, - "eval_accuracy": 0.5602276636743527, - "eval_loss": 1.988720178604126, - "eval_runtime": 2195.5843, - "eval_samples_per_second": 13.41, - "eval_steps_per_second": 1.677, - "step": 2590 - }, - { - "epoch": 0.1784121320249777, - "grad_norm": 0.5960366129875183, - "learning_rate": 0.00019802566633761107, - "loss": 1.9351, - "step": 2600 - }, - { - "epoch": 0.1784121320249777, - "eval_accuracy": 0.559032876275525, - "eval_loss": 1.9919941425323486, - "eval_runtime": 2195.8734, - "eval_samples_per_second": 13.408, - "eval_steps_per_second": 1.676, - "step": 2600 - }, - { - "epoch": 0.17909833253276608, - "grad_norm": 0.6105952858924866, - "learning_rate": 0.0001979775118580406, - "loss": 1.9903, - "step": 2610 - }, - { - "epoch": 0.17909833253276608, - "eval_accuracy": 0.5600523268140957, - "eval_loss": 1.9881607294082642, - "eval_runtime": 2196.3376, - "eval_samples_per_second": 13.405, - "eval_steps_per_second": 1.676, - "step": 2610 - }, - { - "epoch": 0.17978453304055445, - "grad_norm": 0.6086561679840088, - "learning_rate": 0.00019792935737847014, - "loss": 1.9367, - "step": 2620 - }, - { - "epoch": 0.17978453304055445, - "eval_accuracy": 0.5591538955627124, - "eval_loss": 1.991269588470459, - "eval_runtime": 2196.0471, - "eval_samples_per_second": 13.407, - "eval_steps_per_second": 1.676, - "step": 2620 - }, - { - "epoch": 0.18047073354834284, - "grad_norm": 2.8778371810913086, - "learning_rate": 0.00019788120289889968, - "loss": 1.9686, - "step": 2630 - }, - { - "epoch": 0.18047073354834284, - "eval_accuracy": 0.5572145988371657, - "eval_loss": 2.003204107284546, - "eval_runtime": 2195.7568, - "eval_samples_per_second": 13.409, - "eval_steps_per_second": 1.676, - "step": 2630 - } - ], - "logging_steps": 10, - "max_steps": 43719, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 10, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.8121688667193344e+17, - "train_batch_size": 8, - "trial_name": null, - "trial_params": null -}