| { | |
| "best_global_step": 875, | |
| "best_metric": 0.41610679030418396, | |
| "best_model_checkpoint": "/home/ricoiban/GEMMA/mnlp_chatsplaining/results_model/try_ft/checkpoint-875", | |
| "epoch": 0.5460448642266824, | |
| "eval_steps": 25, | |
| "global_step": 925, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0029515938606847697, | |
| "grad_norm": 274.0, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 10.7873, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0059031877213695395, | |
| "grad_norm": 160.0, | |
| "learning_rate": 1.08e-05, | |
| "loss": 8.2818, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.00885478158205431, | |
| "grad_norm": 96.0, | |
| "learning_rate": 1.6800000000000002e-05, | |
| "loss": 5.4171, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.011806375442739079, | |
| "grad_norm": 141.0, | |
| "learning_rate": 2.2800000000000002e-05, | |
| "loss": 4.8734, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01475796930342385, | |
| "grad_norm": 111.0, | |
| "learning_rate": 2.88e-05, | |
| "loss": 4.5143, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01475796930342385, | |
| "eval_loss": 4.148748874664307, | |
| "eval_runtime": 22.5903, | |
| "eval_samples_per_second": 88.534, | |
| "eval_steps_per_second": 88.534, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01770956316410862, | |
| "grad_norm": 234.0, | |
| "learning_rate": 3.48e-05, | |
| "loss": 4.0827, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02066115702479339, | |
| "grad_norm": 173.0, | |
| "learning_rate": 4.08e-05, | |
| "loss": 3.624, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.023612750885478158, | |
| "grad_norm": 91.0, | |
| "learning_rate": 4.6800000000000006e-05, | |
| "loss": 2.6036, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.026564344746162927, | |
| "grad_norm": 163.0, | |
| "learning_rate": 5.28e-05, | |
| "loss": 1.9856, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.0295159386068477, | |
| "grad_norm": 65.0, | |
| "learning_rate": 5.88e-05, | |
| "loss": 1.1888, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0295159386068477, | |
| "eval_loss": 0.8685765862464905, | |
| "eval_runtime": 22.0353, | |
| "eval_samples_per_second": 90.764, | |
| "eval_steps_per_second": 90.764, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.032467532467532464, | |
| "grad_norm": 31.125, | |
| "learning_rate": 5.9999123594193744e-05, | |
| "loss": 0.7088, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.03541912632821724, | |
| "grad_norm": 25.625, | |
| "learning_rate": 5.9995563283365586e-05, | |
| "loss": 0.4428, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03837072018890201, | |
| "grad_norm": 38.25, | |
| "learning_rate": 5.998926461693058e-05, | |
| "loss": 0.7334, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.04132231404958678, | |
| "grad_norm": 13.375, | |
| "learning_rate": 5.9980228169906714e-05, | |
| "loss": 0.6483, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.04427390791027155, | |
| "grad_norm": 41.5, | |
| "learning_rate": 5.9968454767249506e-05, | |
| "loss": 0.5933, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.04427390791027155, | |
| "eval_loss": 0.6194470524787903, | |
| "eval_runtime": 22.0585, | |
| "eval_samples_per_second": 90.668, | |
| "eval_steps_per_second": 90.668, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.047225501770956316, | |
| "grad_norm": 26.0, | |
| "learning_rate": 5.995394548377669e-05, | |
| "loss": 0.7882, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.050177095631641085, | |
| "grad_norm": 17.875, | |
| "learning_rate": 5.993670164407008e-05, | |
| "loss": 0.6423, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.053128689492325853, | |
| "grad_norm": 19.75, | |
| "learning_rate": 5.991672482235466e-05, | |
| "loss": 0.6952, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.05608028335301062, | |
| "grad_norm": 19.875, | |
| "learning_rate": 5.9894016842354855e-05, | |
| "loss": 0.6287, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.0590318772136954, | |
| "grad_norm": 19.75, | |
| "learning_rate": 5.986857977712809e-05, | |
| "loss": 0.6606, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0590318772136954, | |
| "eval_loss": 0.5243311524391174, | |
| "eval_runtime": 22.0497, | |
| "eval_samples_per_second": 90.704, | |
| "eval_steps_per_second": 90.704, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06198347107438017, | |
| "grad_norm": 26.25, | |
| "learning_rate": 5.9840415948875444e-05, | |
| "loss": 0.6193, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.06493506493506493, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 5.980952792872975e-05, | |
| "loss": 0.5167, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0678866587957497, | |
| "grad_norm": 35.0, | |
| "learning_rate": 5.9775918536520786e-05, | |
| "loss": 0.5922, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.07083825265643448, | |
| "grad_norm": 35.25, | |
| "learning_rate": 5.973959084051791e-05, | |
| "loss": 0.7114, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.07378984651711924, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 5.970054815714995e-05, | |
| "loss": 0.6309, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.07378984651711924, | |
| "eval_loss": 0.644968569278717, | |
| "eval_runtime": 22.0752, | |
| "eval_samples_per_second": 90.6, | |
| "eval_steps_per_second": 90.6, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.07674144037780402, | |
| "grad_norm": 19.375, | |
| "learning_rate": 5.965879405070235e-05, | |
| "loss": 0.622, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07969303423848878, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 5.961433233299193e-05, | |
| "loss": 0.5902, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.08264462809917356, | |
| "grad_norm": 19.375, | |
| "learning_rate": 5.956716706301877e-05, | |
| "loss": 0.6647, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08559622195985832, | |
| "grad_norm": 11.0, | |
| "learning_rate": 5.951730254659569e-05, | |
| "loss": 0.7817, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.0885478158205431, | |
| "grad_norm": 16.875, | |
| "learning_rate": 5.946474333595521e-05, | |
| "loss": 0.6154, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0885478158205431, | |
| "eval_loss": 0.5631881356239319, | |
| "eval_runtime": 22.058, | |
| "eval_samples_per_second": 90.67, | |
| "eval_steps_per_second": 90.67, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09149940968122787, | |
| "grad_norm": 17.625, | |
| "learning_rate": 5.9409494229333904e-05, | |
| "loss": 0.6532, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.09445100354191263, | |
| "grad_norm": 22.25, | |
| "learning_rate": 5.935156027053442e-05, | |
| "loss": 0.6099, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.09740259740259741, | |
| "grad_norm": 25.5, | |
| "learning_rate": 5.929094674846495e-05, | |
| "loss": 0.7848, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.10035419126328217, | |
| "grad_norm": 22.0, | |
| "learning_rate": 5.922765919665644e-05, | |
| "loss": 0.5597, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.10330578512396695, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 5.916170339275745e-05, | |
| "loss": 0.6166, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.10330578512396695, | |
| "eval_loss": 0.5547081828117371, | |
| "eval_runtime": 22.0339, | |
| "eval_samples_per_second": 90.769, | |
| "eval_steps_per_second": 90.769, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.10625737898465171, | |
| "grad_norm": 12.625, | |
| "learning_rate": 5.909308535800664e-05, | |
| "loss": 0.5548, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.10920897284533648, | |
| "grad_norm": 32.5, | |
| "learning_rate": 5.90218113566831e-05, | |
| "loss": 0.5793, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.11216056670602124, | |
| "grad_norm": 19.0, | |
| "learning_rate": 5.8947887895534504e-05, | |
| "loss": 0.8419, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.11511216056670602, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 5.8871321723183046e-05, | |
| "loss": 0.6571, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.1180637544273908, | |
| "grad_norm": 18.375, | |
| "learning_rate": 5.879211982950937e-05, | |
| "loss": 0.6579, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1180637544273908, | |
| "eval_loss": 0.5850950479507446, | |
| "eval_runtime": 22.079, | |
| "eval_samples_per_second": 90.584, | |
| "eval_steps_per_second": 90.584, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.12101534828807556, | |
| "grad_norm": 20.0, | |
| "learning_rate": 5.871028944501446e-05, | |
| "loss": 0.5835, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.12396694214876033, | |
| "grad_norm": 20.25, | |
| "learning_rate": 5.862583804015953e-05, | |
| "loss": 0.5418, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1269185360094451, | |
| "grad_norm": 27.375, | |
| "learning_rate": 5.853877332468404e-05, | |
| "loss": 0.6755, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.12987012987012986, | |
| "grad_norm": 29.5, | |
| "learning_rate": 5.844910324690187e-05, | |
| "loss": 0.6172, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.13282172373081463, | |
| "grad_norm": 25.375, | |
| "learning_rate": 5.835683599297568e-05, | |
| "loss": 0.5844, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.13282172373081463, | |
| "eval_loss": 0.5579066276550293, | |
| "eval_runtime": 22.0218, | |
| "eval_samples_per_second": 90.819, | |
| "eval_steps_per_second": 90.819, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.1357733175914994, | |
| "grad_norm": 16.5, | |
| "learning_rate": 5.8261979986169596e-05, | |
| "loss": 0.6147, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.13872491145218419, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 5.816454388608023e-05, | |
| "loss": 0.6352, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.14167650531286896, | |
| "grad_norm": 16.125, | |
| "learning_rate": 5.8064536587846115e-05, | |
| "loss": 0.7107, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1446280991735537, | |
| "grad_norm": 23.5, | |
| "learning_rate": 5.7961967221335674e-05, | |
| "loss": 0.6752, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.14757969303423848, | |
| "grad_norm": 27.625, | |
| "learning_rate": 5.7856845150313716e-05, | |
| "loss": 0.6039, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.14757969303423848, | |
| "eval_loss": 0.5708025693893433, | |
| "eval_runtime": 22.0736, | |
| "eval_samples_per_second": 90.606, | |
| "eval_steps_per_second": 90.606, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.15053128689492326, | |
| "grad_norm": 19.375, | |
| "learning_rate": 5.7749179971586596e-05, | |
| "loss": 0.6029, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.15348288075560804, | |
| "grad_norm": 24.75, | |
| "learning_rate": 5.763898151412613e-05, | |
| "loss": 0.4986, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.15643447461629278, | |
| "grad_norm": 36.0, | |
| "learning_rate": 5.752625983817225e-05, | |
| "loss": 0.6049, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.15938606847697756, | |
| "grad_norm": 22.25, | |
| "learning_rate": 5.7411025234314634e-05, | |
| "loss": 0.5895, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.16233766233766234, | |
| "grad_norm": 13.0, | |
| "learning_rate": 5.729328822255319e-05, | |
| "loss": 0.6957, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.16233766233766234, | |
| "eval_loss": 0.5568270087242126, | |
| "eval_runtime": 22.052, | |
| "eval_samples_per_second": 90.695, | |
| "eval_steps_per_second": 90.695, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.1652892561983471, | |
| "grad_norm": 27.75, | |
| "learning_rate": 5.717305955133773e-05, | |
| "loss": 0.7125, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1682408500590319, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 5.7050350196586686e-05, | |
| "loss": 0.5977, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.17119244391971664, | |
| "grad_norm": 18.75, | |
| "learning_rate": 5.692517136068511e-05, | |
| "loss": 0.5908, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.1741440377804014, | |
| "grad_norm": 24.25, | |
| "learning_rate": 5.679753447146195e-05, | |
| "loss": 0.5334, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.1770956316410862, | |
| "grad_norm": 11.625, | |
| "learning_rate": 5.666745118114688e-05, | |
| "loss": 0.4347, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1770956316410862, | |
| "eval_loss": 0.5026609301567078, | |
| "eval_runtime": 22.0534, | |
| "eval_samples_per_second": 90.689, | |
| "eval_steps_per_second": 90.689, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.18004722550177096, | |
| "grad_norm": 24.75, | |
| "learning_rate": 5.6534933365306394e-05, | |
| "loss": 0.5473, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.18299881936245574, | |
| "grad_norm": 27.75, | |
| "learning_rate": 5.6399993121759797e-05, | |
| "loss": 0.5315, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1859504132231405, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 5.626264276947469e-05, | |
| "loss": 0.4026, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.18890200708382526, | |
| "grad_norm": 27.125, | |
| "learning_rate": 5.612289484744238e-05, | |
| "loss": 0.55, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.19185360094451004, | |
| "grad_norm": 29.375, | |
| "learning_rate": 5.5980762113533166e-05, | |
| "loss": 0.5988, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.19185360094451004, | |
| "eval_loss": 0.5229803919792175, | |
| "eval_runtime": 22.0483, | |
| "eval_samples_per_second": 90.71, | |
| "eval_steps_per_second": 90.71, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.19480519480519481, | |
| "grad_norm": 18.0, | |
| "learning_rate": 5.5836257543331644e-05, | |
| "loss": 0.5174, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.19775678866587956, | |
| "grad_norm": 17.75, | |
| "learning_rate": 5.568939432895213e-05, | |
| "loss": 0.6662, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.20070838252656434, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 5.554018587783435e-05, | |
| "loss": 0.6594, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.20365997638724911, | |
| "grad_norm": 20.125, | |
| "learning_rate": 5.538864581151943e-05, | |
| "loss": 0.5776, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.2066115702479339, | |
| "grad_norm": 29.0, | |
| "learning_rate": 5.523478796440633e-05, | |
| "loss": 0.6647, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.2066115702479339, | |
| "eval_loss": 0.5277854204177856, | |
| "eval_runtime": 22.0408, | |
| "eval_samples_per_second": 90.741, | |
| "eval_steps_per_second": 90.741, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.20956316410861867, | |
| "grad_norm": 17.625, | |
| "learning_rate": 5.507862638248896e-05, | |
| "loss": 0.5446, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.21251475796930341, | |
| "grad_norm": 15.9375, | |
| "learning_rate": 5.49201753220738e-05, | |
| "loss": 0.5346, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.2154663518299882, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 5.475944924847845e-05, | |
| "loss": 0.4782, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.21841794569067297, | |
| "grad_norm": 46.0, | |
| "learning_rate": 5.459646283471106e-05, | |
| "loss": 0.6363, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.22136953955135774, | |
| "grad_norm": 12.625, | |
| "learning_rate": 5.443123096013083e-05, | |
| "loss": 0.5603, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.22136953955135774, | |
| "eval_loss": 0.49818602204322815, | |
| "eval_runtime": 22.0724, | |
| "eval_samples_per_second": 90.611, | |
| "eval_steps_per_second": 90.611, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.2243211334120425, | |
| "grad_norm": 25.125, | |
| "learning_rate": 5.426376870908959e-05, | |
| "loss": 0.6536, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.22727272727272727, | |
| "grad_norm": 17.25, | |
| "learning_rate": 5.409409136955476e-05, | |
| "loss": 0.5464, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.23022432113341204, | |
| "grad_norm": 23.375, | |
| "learning_rate": 5.3922214431713654e-05, | |
| "loss": 0.5587, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.23317591499409682, | |
| "grad_norm": 10.75, | |
| "learning_rate": 5.3748153586559385e-05, | |
| "loss": 0.5231, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.2361275088547816, | |
| "grad_norm": 19.875, | |
| "learning_rate": 5.357192472445835e-05, | |
| "loss": 0.503, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2361275088547816, | |
| "eval_loss": 0.4984550178050995, | |
| "eval_runtime": 22.074, | |
| "eval_samples_per_second": 90.604, | |
| "eval_steps_per_second": 90.604, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.23907910271546634, | |
| "grad_norm": 20.125, | |
| "learning_rate": 5.339354393369962e-05, | |
| "loss": 0.4524, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.24203069657615112, | |
| "grad_norm": 11.375, | |
| "learning_rate": 5.321302749902615e-05, | |
| "loss": 0.492, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2449822904368359, | |
| "grad_norm": 20.75, | |
| "learning_rate": 5.303039190014818e-05, | |
| "loss": 0.4989, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.24793388429752067, | |
| "grad_norm": 14.0, | |
| "learning_rate": 5.284565381023873e-05, | |
| "loss": 0.6195, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.25088547815820544, | |
| "grad_norm": 25.125, | |
| "learning_rate": 5.265883009441147e-05, | |
| "loss": 0.5687, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.25088547815820544, | |
| "eval_loss": 0.4847257137298584, | |
| "eval_runtime": 22.029, | |
| "eval_samples_per_second": 90.789, | |
| "eval_steps_per_second": 90.789, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2538370720188902, | |
| "grad_norm": 16.75, | |
| "learning_rate": 5.2469937808181055e-05, | |
| "loss": 0.4048, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.256788665879575, | |
| "grad_norm": 39.75, | |
| "learning_rate": 5.227899419590614e-05, | |
| "loss": 0.5483, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.2597402597402597, | |
| "grad_norm": 15.125, | |
| "learning_rate": 5.208601668921508e-05, | |
| "loss": 0.4843, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2626918536009445, | |
| "grad_norm": 15.5, | |
| "learning_rate": 5.1891022905414546e-05, | |
| "loss": 0.6146, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.26564344746162927, | |
| "grad_norm": 22.875, | |
| "learning_rate": 5.169403064588125e-05, | |
| "loss": 0.4279, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.26564344746162927, | |
| "eval_loss": 0.48101305961608887, | |
| "eval_runtime": 22.0566, | |
| "eval_samples_per_second": 90.676, | |
| "eval_steps_per_second": 90.676, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.26859504132231404, | |
| "grad_norm": 28.625, | |
| "learning_rate": 5.1495057894436757e-05, | |
| "loss": 0.5749, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.2715466351829988, | |
| "grad_norm": 20.5, | |
| "learning_rate": 5.1294122815705773e-05, | |
| "loss": 0.4963, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2744982290436836, | |
| "grad_norm": 22.25, | |
| "learning_rate": 5.109124375345781e-05, | |
| "loss": 0.4213, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.27744982290436837, | |
| "grad_norm": 18.125, | |
| "learning_rate": 5.0886439228932576e-05, | |
| "loss": 0.5002, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.28040141676505315, | |
| "grad_norm": 34.0, | |
| "learning_rate": 5.067972793914911e-05, | |
| "loss": 0.5136, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.28040141676505315, | |
| "eval_loss": 0.5137303471565247, | |
| "eval_runtime": 22.0698, | |
| "eval_samples_per_second": 90.621, | |
| "eval_steps_per_second": 90.621, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.2833530106257379, | |
| "grad_norm": 15.0, | |
| "learning_rate": 5.047112875519892e-05, | |
| "loss": 0.5162, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.28630460448642264, | |
| "grad_norm": 38.0, | |
| "learning_rate": 5.02606607205232e-05, | |
| "loss": 0.4831, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.2892561983471074, | |
| "grad_norm": 26.5, | |
| "learning_rate": 5.004834304917425e-05, | |
| "loss": 0.5147, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2922077922077922, | |
| "grad_norm": 37.0, | |
| "learning_rate": 4.983419512406151e-05, | |
| "loss": 0.553, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.29515938606847697, | |
| "grad_norm": 23.625, | |
| "learning_rate": 4.9618236495181936e-05, | |
| "loss": 0.3999, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.29515938606847697, | |
| "eval_loss": 0.5061969757080078, | |
| "eval_runtime": 22.0382, | |
| "eval_samples_per_second": 90.752, | |
| "eval_steps_per_second": 90.752, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.29811097992916175, | |
| "grad_norm": 13.75, | |
| "learning_rate": 4.9400486877835325e-05, | |
| "loss": 0.4205, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.3010625737898465, | |
| "grad_norm": 22.5, | |
| "learning_rate": 4.91809661508244e-05, | |
| "loss": 0.5904, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.3040141676505313, | |
| "grad_norm": 16.75, | |
| "learning_rate": 4.895969435464009e-05, | |
| "loss": 0.5749, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.3069657615112161, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 4.873669168963196e-05, | |
| "loss": 0.5841, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.30991735537190085, | |
| "grad_norm": 14.25, | |
| "learning_rate": 4.851197851416409e-05, | |
| "loss": 0.5454, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.30991735537190085, | |
| "eval_loss": 0.6045746207237244, | |
| "eval_runtime": 22.0653, | |
| "eval_samples_per_second": 90.64, | |
| "eval_steps_per_second": 90.64, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.31286894923258557, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 4.828557534275651e-05, | |
| "loss": 0.4654, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.31582054309327035, | |
| "grad_norm": 28.125, | |
| "learning_rate": 4.8057502844212406e-05, | |
| "loss": 0.5669, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.3187721369539551, | |
| "grad_norm": 17.625, | |
| "learning_rate": 4.78277818397312e-05, | |
| "loss": 0.3907, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.3217237308146399, | |
| "grad_norm": 12.625, | |
| "learning_rate": 4.7596433301007775e-05, | |
| "loss": 0.508, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.3246753246753247, | |
| "grad_norm": 26.75, | |
| "learning_rate": 4.736347834831789e-05, | |
| "loss": 0.408, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3246753246753247, | |
| "eval_loss": 0.48477041721343994, | |
| "eval_runtime": 22.0413, | |
| "eval_samples_per_second": 90.739, | |
| "eval_steps_per_second": 90.739, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.32762691853600945, | |
| "grad_norm": 32.25, | |
| "learning_rate": 4.712893824859008e-05, | |
| "loss": 0.3498, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.3305785123966942, | |
| "grad_norm": 21.25, | |
| "learning_rate": 4.6892834413464163e-05, | |
| "loss": 0.6738, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.333530106257379, | |
| "grad_norm": 23.0, | |
| "learning_rate": 4.6655188397336515e-05, | |
| "loss": 0.4393, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.3364817001180638, | |
| "grad_norm": 13.625, | |
| "learning_rate": 4.641602189539235e-05, | |
| "loss": 0.5021, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.33943329397874855, | |
| "grad_norm": 16.5, | |
| "learning_rate": 4.617535674162509e-05, | |
| "loss": 0.5409, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.33943329397874855, | |
| "eval_loss": 0.4802681505680084, | |
| "eval_runtime": 22.041, | |
| "eval_samples_per_second": 90.74, | |
| "eval_steps_per_second": 90.74, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.34238488783943327, | |
| "grad_norm": 18.0, | |
| "learning_rate": 4.59332149068431e-05, | |
| "loss": 0.4619, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.34533648170011805, | |
| "grad_norm": 19.25, | |
| "learning_rate": 4.5689618496664e-05, | |
| "loss": 0.5019, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.3482880755608028, | |
| "grad_norm": 17.25, | |
| "learning_rate": 4.544458974949646e-05, | |
| "loss": 0.3654, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3512396694214876, | |
| "grad_norm": 25.625, | |
| "learning_rate": 4.519815103451012e-05, | |
| "loss": 0.6236, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.3541912632821724, | |
| "grad_norm": 30.125, | |
| "learning_rate": 4.4950324849593455e-05, | |
| "loss": 0.4846, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3541912632821724, | |
| "eval_loss": 0.47454240918159485, | |
| "eval_runtime": 22.0763, | |
| "eval_samples_per_second": 90.595, | |
| "eval_steps_per_second": 90.595, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 17.75, | |
| "learning_rate": 4.470113381929984e-05, | |
| "loss": 0.5738, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.3600944510035419, | |
| "grad_norm": 16.0, | |
| "learning_rate": 4.445060069278218e-05, | |
| "loss": 0.3647, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3630460448642267, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 4.419874834171601e-05, | |
| "loss": 0.4772, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.3659976387249115, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 4.3945599758211594e-05, | |
| "loss": 0.4309, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3689492325855962, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 4.369117805271482e-05, | |
| "loss": 0.3699, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.3689492325855962, | |
| "eval_loss": 0.5046902298927307, | |
| "eval_runtime": 22.0518, | |
| "eval_samples_per_second": 90.696, | |
| "eval_steps_per_second": 90.696, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.371900826446281, | |
| "grad_norm": 26.5, | |
| "learning_rate": 4.343550645189751e-05, | |
| "loss": 0.4449, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.37485242030696575, | |
| "grad_norm": 21.125, | |
| "learning_rate": 4.317860829653692e-05, | |
| "loss": 0.2902, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.3778040141676505, | |
| "grad_norm": 28.875, | |
| "learning_rate": 4.292050703938496e-05, | |
| "loss": 0.5582, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3807556080283353, | |
| "grad_norm": 13.75, | |
| "learning_rate": 4.266122624302714e-05, | |
| "loss": 0.5335, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.3837072018890201, | |
| "grad_norm": 13.5, | |
| "learning_rate": 4.2400789577731485e-05, | |
| "loss": 0.3741, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3837072018890201, | |
| "eval_loss": 0.4519728720188141, | |
| "eval_runtime": 22.0605, | |
| "eval_samples_per_second": 90.66, | |
| "eval_steps_per_second": 90.66, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.38665879574970485, | |
| "grad_norm": 19.5, | |
| "learning_rate": 4.213922081928763e-05, | |
| "loss": 0.5017, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.38961038961038963, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 4.187654384683628e-05, | |
| "loss": 0.359, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3925619834710744, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 4.161278264068925e-05, | |
| "loss": 0.2609, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.3955135773317591, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 4.134796128014022e-05, | |
| "loss": 0.4038, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3984651711924439, | |
| "grad_norm": 17.625, | |
| "learning_rate": 4.108210394126652e-05, | |
| "loss": 0.3166, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.3984651711924439, | |
| "eval_loss": 0.44906917214393616, | |
| "eval_runtime": 22.0309, | |
| "eval_samples_per_second": 90.782, | |
| "eval_steps_per_second": 90.782, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.4014167650531287, | |
| "grad_norm": 32.25, | |
| "learning_rate": 4.0815234894722035e-05, | |
| "loss": 0.3372, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.40436835891381345, | |
| "grad_norm": 27.25, | |
| "learning_rate": 4.05473785035215e-05, | |
| "loss": 0.3346, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.40731995277449823, | |
| "grad_norm": 24.75, | |
| "learning_rate": 4.0278559220816304e-05, | |
| "loss": 0.3993, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.410271546635183, | |
| "grad_norm": 20.0, | |
| "learning_rate": 4.0008801587662194e-05, | |
| "loss": 0.34, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.4132231404958678, | |
| "grad_norm": 27.25, | |
| "learning_rate": 3.9738130230778796e-05, | |
| "loss": 0.4442, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4132231404958678, | |
| "eval_loss": 0.4622591435909271, | |
| "eval_runtime": 22.0728, | |
| "eval_samples_per_second": 90.609, | |
| "eval_steps_per_second": 90.609, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.41617473435655256, | |
| "grad_norm": 22.625, | |
| "learning_rate": 3.946656986030142e-05, | |
| "loss": 0.3391, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.41912632821723733, | |
| "grad_norm": 26.125, | |
| "learning_rate": 3.919414526752524e-05, | |
| "loss": 0.333, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.42207792207792205, | |
| "grad_norm": 7.5, | |
| "learning_rate": 3.8920881322642036e-05, | |
| "loss": 0.3791, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.42502951593860683, | |
| "grad_norm": 32.0, | |
| "learning_rate": 3.864680297246972e-05, | |
| "loss": 0.5102, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4279811097992916, | |
| "grad_norm": 15.5, | |
| "learning_rate": 3.8371935238174924e-05, | |
| "loss": 0.4723, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.4279811097992916, | |
| "eval_loss": 0.44943633675575256, | |
| "eval_runtime": 22.0672, | |
| "eval_samples_per_second": 90.632, | |
| "eval_steps_per_second": 90.632, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.4309327036599764, | |
| "grad_norm": 26.875, | |
| "learning_rate": 3.809630321298872e-05, | |
| "loss": 0.4499, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.43388429752066116, | |
| "grad_norm": 14.3125, | |
| "learning_rate": 3.78199320599159e-05, | |
| "loss": 0.2574, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.43683589138134593, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.754284700943767e-05, | |
| "loss": 0.3981, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4397874852420307, | |
| "grad_norm": 20.5, | |
| "learning_rate": 3.726507335720842e-05, | |
| "loss": 0.3897, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.4427390791027155, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 3.6986636461746365e-05, | |
| "loss": 0.3282, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4427390791027155, | |
| "eval_loss": 0.46209147572517395, | |
| "eval_runtime": 22.0979, | |
| "eval_samples_per_second": 90.506, | |
| "eval_steps_per_second": 90.506, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.44569067296340026, | |
| "grad_norm": 17.0, | |
| "learning_rate": 3.6707561742118546e-05, | |
| "loss": 0.3905, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.448642266824085, | |
| "grad_norm": 16.875, | |
| "learning_rate": 3.642787467562024e-05, | |
| "loss": 0.2688, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.45159386068476975, | |
| "grad_norm": 17.625, | |
| "learning_rate": 3.614760079544913e-05, | |
| "loss": 0.369, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.45454545454545453, | |
| "grad_norm": 24.0, | |
| "learning_rate": 3.5866765688374296e-05, | |
| "loss": 0.4732, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.4574970484061393, | |
| "grad_norm": 16.375, | |
| "learning_rate": 3.558539499240037e-05, | |
| "loss": 0.3928, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.4574970484061393, | |
| "eval_loss": 0.4500948488712311, | |
| "eval_runtime": 22.071, | |
| "eval_samples_per_second": 90.617, | |
| "eval_steps_per_second": 90.617, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.4604486422668241, | |
| "grad_norm": 11.375, | |
| "learning_rate": 3.530351439442696e-05, | |
| "loss": 0.498, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.46340023612750886, | |
| "grad_norm": 9.125, | |
| "learning_rate": 3.502114962790366e-05, | |
| "loss": 0.3545, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.46635182998819363, | |
| "grad_norm": 12.375, | |
| "learning_rate": 3.473832647048079e-05, | |
| "loss": 0.5654, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.4693034238488784, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 3.445507074165612e-05, | |
| "loss": 0.3995, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.4722550177095632, | |
| "grad_norm": 8.375, | |
| "learning_rate": 3.41714083004177e-05, | |
| "loss": 0.4174, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4722550177095632, | |
| "eval_loss": 0.4428013265132904, | |
| "eval_runtime": 22.0412, | |
| "eval_samples_per_second": 90.739, | |
| "eval_steps_per_second": 90.739, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.47520661157024796, | |
| "grad_norm": 14.3125, | |
| "learning_rate": 3.3887365042883226e-05, | |
| "loss": 0.487, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.4781582054309327, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 3.360296689993586e-05, | |
| "loss": 0.367, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.48110979929161746, | |
| "grad_norm": 10.375, | |
| "learning_rate": 3.331823983485695e-05, | |
| "loss": 0.2972, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.48406139315230223, | |
| "grad_norm": 14.0, | |
| "learning_rate": 3.303320984095584e-05, | |
| "loss": 0.417, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.487012987012987, | |
| "grad_norm": 25.0, | |
| "learning_rate": 3.274790293919685e-05, | |
| "loss": 0.3814, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.487012987012987, | |
| "eval_loss": 0.45124053955078125, | |
| "eval_runtime": 22.0625, | |
| "eval_samples_per_second": 90.652, | |
| "eval_steps_per_second": 90.652, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.4899645808736718, | |
| "grad_norm": 19.125, | |
| "learning_rate": 3.246234517582378e-05, | |
| "loss": 0.4698, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.49291617473435656, | |
| "grad_norm": 19.625, | |
| "learning_rate": 3.217656261998208e-05, | |
| "loss": 0.4826, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.49586776859504134, | |
| "grad_norm": 9.25, | |
| "learning_rate": 3.189058136133898e-05, | |
| "loss": 0.4592, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4988193624557261, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 3.1604427507701675e-05, | |
| "loss": 0.551, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.5017709563164109, | |
| "grad_norm": 22.25, | |
| "learning_rate": 3.131812718263392e-05, | |
| "loss": 0.3845, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5017709563164109, | |
| "eval_loss": 0.42552411556243896, | |
| "eval_runtime": 22.0632, | |
| "eval_samples_per_second": 90.649, | |
| "eval_steps_per_second": 90.649, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5047225501770957, | |
| "grad_norm": 11.875, | |
| "learning_rate": 3.1031706523071115e-05, | |
| "loss": 0.5727, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.5076741440377804, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 3.0745191676934285e-05, | |
| "loss": 0.4759, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.5106257378984652, | |
| "grad_norm": 20.75, | |
| "learning_rate": 3.0458608800742883e-05, | |
| "loss": 0.4727, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.51357733175915, | |
| "grad_norm": 19.125, | |
| "learning_rate": 3.0171984057227008e-05, | |
| "loss": 0.4733, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5165289256198347, | |
| "grad_norm": 21.125, | |
| "learning_rate": 2.988534361293888e-05, | |
| "loss": 0.3789, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.5165289256198347, | |
| "eval_loss": 0.41610679030418396, | |
| "eval_runtime": 22.0756, | |
| "eval_samples_per_second": 90.598, | |
| "eval_steps_per_second": 90.598, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.5194805194805194, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 2.959871363586411e-05, | |
| "loss": 0.4258, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.5224321133412042, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 2.9312120293032703e-05, | |
| "loss": 0.408, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.525383707201889, | |
| "grad_norm": 29.375, | |
| "learning_rate": 2.902558974813026e-05, | |
| "loss": 0.5145, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.5283353010625738, | |
| "grad_norm": 14.9375, | |
| "learning_rate": 2.873914815910944e-05, | |
| "loss": 0.4281, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.5312868949232585, | |
| "grad_norm": 19.125, | |
| "learning_rate": 2.8452821675801944e-05, | |
| "loss": 0.3189, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5312868949232585, | |
| "eval_loss": 0.41652774810791016, | |
| "eval_runtime": 22.0235, | |
| "eval_samples_per_second": 90.812, | |
| "eval_steps_per_second": 90.812, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5342384887839433, | |
| "grad_norm": 19.375, | |
| "learning_rate": 2.81666364375312e-05, | |
| "loss": 0.4598, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.5371900826446281, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 2.7880618570726142e-05, | |
| "loss": 0.4811, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5401416765053129, | |
| "grad_norm": 23.0, | |
| "learning_rate": 2.7594794186535993e-05, | |
| "loss": 0.4931, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.5430932703659976, | |
| "grad_norm": 24.125, | |
| "learning_rate": 2.7309189378446578e-05, | |
| "loss": 0.5955, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5460448642266824, | |
| "grad_norm": 19.875, | |
| "learning_rate": 2.702383021989817e-05, | |
| "loss": 0.3743, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.5460448642266824, | |
| "eval_loss": 0.41767558455467224, | |
| "eval_runtime": 22.0719, | |
| "eval_samples_per_second": 90.613, | |
| "eval_steps_per_second": 90.613, | |
| "step": 925 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1694, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3649613399392256e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |