| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9976947902259106, | |
| "eval_steps": 500, | |
| "global_step": 6500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0030736130321192563, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.9972341733251385e-05, | |
| "loss": 1.2571, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0061472260642385125, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.994161032575292e-05, | |
| "loss": 0.6897, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.009220839096357769, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.9910878918254458e-05, | |
| "loss": 0.7244, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.012294452128477025, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.9880147510755993e-05, | |
| "loss": 0.662, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.015368065160596281, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.984941610325753e-05, | |
| "loss": 0.6253, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.018441678192715538, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.9818684695759067e-05, | |
| "loss": 0.6154, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.021515291224834792, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.9787953288260605e-05, | |
| "loss": 0.5772, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02458890425695405, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.975722188076214e-05, | |
| "loss": 0.6075, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.027662517289073305, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.972649047326368e-05, | |
| "loss": 0.5716, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.030736130321192563, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.9695759065765213e-05, | |
| "loss": 0.6216, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03380974335331182, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.9665027658266752e-05, | |
| "loss": 0.5748, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.036883356385431075, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.9634296250768287e-05, | |
| "loss": 0.6002, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03995696941755033, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.9603564843269825e-05, | |
| "loss": 0.603, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.043030582449669584, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.957283343577136e-05, | |
| "loss": 0.5885, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.046104195481788846, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.9542102028272895e-05, | |
| "loss": 0.5882, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0491778085139081, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.9511370620774434e-05, | |
| "loss": 0.5882, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.052251421546027355, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.948063921327597e-05, | |
| "loss": 0.617, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.05532503457814661, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.9449907805777507e-05, | |
| "loss": 0.5883, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.05839864761026587, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.9419176398279042e-05, | |
| "loss": 0.5665, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.061472260642385125, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.938844499078058e-05, | |
| "loss": 0.5579, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06454587367450439, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.9357713583282115e-05, | |
| "loss": 0.5114, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.06761948670662364, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.9326982175783654e-05, | |
| "loss": 0.5922, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0706930997387429, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.929625076828519e-05, | |
| "loss": 0.5241, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.07376671277086215, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.9265519360786727e-05, | |
| "loss": 0.5573, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0768403258029814, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.9234787953288262e-05, | |
| "loss": 0.5565, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07991393883510066, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.92040565457898e-05, | |
| "loss": 0.5994, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.08298755186721991, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.9173325138291336e-05, | |
| "loss": 0.5496, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.08606116489933917, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 1.914259373079287e-05, | |
| "loss": 0.4966, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.08913477793145842, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.911186232329441e-05, | |
| "loss": 0.5337, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.09220839096357769, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.9081130915795944e-05, | |
| "loss": 0.5477, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.09528200399569695, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.9050399508297482e-05, | |
| "loss": 0.5003, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.0983556170278162, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.9019668100799017e-05, | |
| "loss": 0.5316, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.10142923005993545, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.8988936693300556e-05, | |
| "loss": 0.5299, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.10450284309205471, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.895820528580209e-05, | |
| "loss": 0.5126, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.10757645612417396, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.892747387830363e-05, | |
| "loss": 0.5374, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11065006915629322, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.8896742470805164e-05, | |
| "loss": 0.5385, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.11372368218841247, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.8866011063306702e-05, | |
| "loss": 0.5072, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.11679729522053174, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 1.8835279655808237e-05, | |
| "loss": 0.5379, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.119870908252651, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.8804548248309776e-05, | |
| "loss": 0.5164, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.12294452128477025, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.877381684081131e-05, | |
| "loss": 0.5303, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1260181343168895, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.8743085433312846e-05, | |
| "loss": 0.5223, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.12909174734900877, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8712354025814384e-05, | |
| "loss": 0.5457, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.13216536038112803, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.868162261831592e-05, | |
| "loss": 0.5, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.13523897341324728, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.8650891210817458e-05, | |
| "loss": 0.525, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.13831258644536654, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.8620159803318993e-05, | |
| "loss": 0.4843, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1413861994774858, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.858942839582053e-05, | |
| "loss": 0.5126, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.14445981250960505, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.8558696988322066e-05, | |
| "loss": 0.5159, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1475334255417243, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.8527965580823604e-05, | |
| "loss": 0.5117, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.15060703857384355, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.849723417332514e-05, | |
| "loss": 0.4941, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.1536806516059628, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.8466502765826678e-05, | |
| "loss": 0.5206, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.15675426463808206, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.8435771358328213e-05, | |
| "loss": 0.5227, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.15982787767020132, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.840503995082975e-05, | |
| "loss": 0.4969, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.16290149070232057, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.8374308543331286e-05, | |
| "loss": 0.5237, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.16597510373443983, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.834357713583282e-05, | |
| "loss": 0.5529, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.16904871676655908, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.831284572833436e-05, | |
| "loss": 0.535, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17212232979867834, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.8282114320835895e-05, | |
| "loss": 0.495, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1751959428307976, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.8251382913337433e-05, | |
| "loss": 0.4784, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.17826955586291685, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.8220651505838968e-05, | |
| "loss": 0.5158, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.18134316889503613, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.8189920098340506e-05, | |
| "loss": 0.5033, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.18441678192715538, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.815918869084204e-05, | |
| "loss": 0.5106, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.18749039495927464, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.812845728334358e-05, | |
| "loss": 0.5103, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1905640079913939, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.8097725875845115e-05, | |
| "loss": 0.5091, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.19363762102351315, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.8066994468346653e-05, | |
| "loss": 0.485, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.1967112340556324, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.8036263060848188e-05, | |
| "loss": 0.4814, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.19978484708775165, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.8005531653349727e-05, | |
| "loss": 0.4873, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2028584601198709, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.797480024585126e-05, | |
| "loss": 0.4744, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.20593207315199016, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.7944068838352797e-05, | |
| "loss": 0.5105, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.20900568618410942, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.7913337430854335e-05, | |
| "loss": 0.496, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.21207929921622867, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.788260602335587e-05, | |
| "loss": 0.4867, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.21515291224834793, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.785187461585741e-05, | |
| "loss": 0.4406, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.21822652528046718, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.7821143208358943e-05, | |
| "loss": 0.4459, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.22130013831258644, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.7790411800860482e-05, | |
| "loss": 0.4765, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.2243737513447057, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.7759680393362017e-05, | |
| "loss": 0.4879, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.22744736437682495, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.7728948985863555e-05, | |
| "loss": 0.4889, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.23052097740894423, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.769821757836509e-05, | |
| "loss": 0.4491, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.23359459044106348, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.766748617086663e-05, | |
| "loss": 0.4794, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.23666820347318274, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.7636754763368163e-05, | |
| "loss": 0.4935, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.239741816505302, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.7606023355869702e-05, | |
| "loss": 0.4491, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.24281542953742125, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.7575291948371237e-05, | |
| "loss": 0.4424, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.2458890425695405, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.7544560540872772e-05, | |
| "loss": 0.4756, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24896265560165975, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.751382913337431e-05, | |
| "loss": 0.4743, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.252036268633779, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.7483097725875845e-05, | |
| "loss": 0.4598, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.25510988166589826, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.7452366318377384e-05, | |
| "loss": 0.4849, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.25818349469801755, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.742163491087892e-05, | |
| "loss": 0.4555, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.2612571077301368, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.7390903503380457e-05, | |
| "loss": 0.4487, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.26433072076225606, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.7360172095881992e-05, | |
| "loss": 0.4816, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2674043337943753, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.732944068838353e-05, | |
| "loss": 0.4694, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.27047794682649456, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.7298709280885065e-05, | |
| "loss": 0.4826, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2735515598586138, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.7267977873386604e-05, | |
| "loss": 0.4397, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.2766251728907331, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.723724646588814e-05, | |
| "loss": 0.4297, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2796987859228523, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.7206515058389677e-05, | |
| "loss": 0.4964, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.2827723989549716, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 1.7175783650891212e-05, | |
| "loss": 0.4641, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2858460119870908, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 1.7145052243392747e-05, | |
| "loss": 0.4507, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.2889196250192101, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.7114320835894286e-05, | |
| "loss": 0.4675, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.2919932380513293, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.708358942839582e-05, | |
| "loss": 0.4329, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2950668510834486, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.705285802089736e-05, | |
| "loss": 0.4666, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.2981404641155678, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.7022126613398894e-05, | |
| "loss": 0.445, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.3012140771476871, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.6991395205900432e-05, | |
| "loss": 0.4755, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.30428769017980634, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.6960663798401967e-05, | |
| "loss": 0.4107, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.3073613032119256, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.6929932390903506e-05, | |
| "loss": 0.4595, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3104349162440449, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.689920098340504e-05, | |
| "loss": 0.4523, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3135085292761641, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.686846957590658e-05, | |
| "loss": 0.4645, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3165821423082834, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.6837738168408114e-05, | |
| "loss": 0.4895, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.31965575534040264, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.6807006760909653e-05, | |
| "loss": 0.4519, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.3227293683725219, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.6776275353411188e-05, | |
| "loss": 0.4727, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.32580298140464115, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.6745543945912723e-05, | |
| "loss": 0.442, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3288765944367604, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.671481253841426e-05, | |
| "loss": 0.4645, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.33195020746887965, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.6684081130915796e-05, | |
| "loss": 0.4463, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.33502382050099894, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.6653349723417334e-05, | |
| "loss": 0.4446, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.33809743353311816, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.662261831591887e-05, | |
| "loss": 0.4594, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.34117104656523745, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.6591886908420408e-05, | |
| "loss": 0.4874, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3442446595973567, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.6561155500921943e-05, | |
| "loss": 0.4757, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.34731827262947595, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.653042409342348e-05, | |
| "loss": 0.4756, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3503918856615952, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.6499692685925016e-05, | |
| "loss": 0.4271, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.35346549869371446, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.6468961278426554e-05, | |
| "loss": 0.4882, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3565391117258337, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.643822987092809e-05, | |
| "loss": 0.4381, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.359612724757953, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.6407498463429628e-05, | |
| "loss": 0.4276, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.36268633779007226, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.6376767055931163e-05, | |
| "loss": 0.4088, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.3657599508221915, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.6346035648432698e-05, | |
| "loss": 0.4368, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.36883356385431076, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.6315304240934236e-05, | |
| "loss": 0.4488, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.37190717688643, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.628457283343577e-05, | |
| "loss": 0.4213, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3749807899185493, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.625384142593731e-05, | |
| "loss": 0.4275, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3780544029506685, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.6223110018438845e-05, | |
| "loss": 0.4721, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3811280159827878, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.6192378610940383e-05, | |
| "loss": 0.3997, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.384201629014907, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.6161647203441918e-05, | |
| "loss": 0.433, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3872752420470263, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.6130915795943456e-05, | |
| "loss": 0.4386, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.3903488550791455, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.610018438844499e-05, | |
| "loss": 0.4366, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.3934224681112648, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.606945298094653e-05, | |
| "loss": 0.41, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.396496081143384, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.6038721573448065e-05, | |
| "loss": 0.4296, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.3995696941755033, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.6007990165949603e-05, | |
| "loss": 0.4292, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.40264330720762254, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.5977258758451138e-05, | |
| "loss": 0.4056, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4057169202397418, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.5946527350952673e-05, | |
| "loss": 0.4191, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.40879053327186105, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.591579594345421e-05, | |
| "loss": 0.4153, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.4118641463039803, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.5885064535955747e-05, | |
| "loss": 0.4415, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.4149377593360996, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.5854333128457285e-05, | |
| "loss": 0.4004, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.41801137236821884, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.582360172095882e-05, | |
| "loss": 0.4418, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4210849854003381, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.579287031346036e-05, | |
| "loss": 0.4311, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.42415859843245735, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.5762138905961893e-05, | |
| "loss": 0.4112, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.42723221146457663, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.5731407498463432e-05, | |
| "loss": 0.4352, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.43030582449669585, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.5700676090964967e-05, | |
| "loss": 0.4617, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.43337943752881514, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.5669944683466505e-05, | |
| "loss": 0.3976, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.43645305056093436, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.563921327596804e-05, | |
| "loss": 0.432, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.43952666359305365, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.560848186846958e-05, | |
| "loss": 0.3597, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.4426002766251729, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.5577750460971114e-05, | |
| "loss": 0.4091, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.44567388965729215, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.554701905347265e-05, | |
| "loss": 0.4275, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4487475026894114, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.5516287645974187e-05, | |
| "loss": 0.4125, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.45182111572153066, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.5485556238475722e-05, | |
| "loss": 0.3917, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4548947287536499, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.545482483097726e-05, | |
| "loss": 0.364, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.4579683417857692, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.5424093423478795e-05, | |
| "loss": 0.4297, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.46104195481788846, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.5393362015980334e-05, | |
| "loss": 0.4274, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4641155678500077, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.536263060848187e-05, | |
| "loss": 0.3786, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.46718918088212696, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.5331899200983407e-05, | |
| "loss": 0.4062, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4702627939142462, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.5301167793484942e-05, | |
| "loss": 0.3928, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4733364069463655, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.527043638598648e-05, | |
| "loss": 0.4124, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.4764100199784847, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.5239704978488017e-05, | |
| "loss": 0.4111, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.479483633010604, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.5208973570989554e-05, | |
| "loss": 0.385, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.4825572460427232, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.5178242163491089e-05, | |
| "loss": 0.4165, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.4856308590748425, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.5147510755992626e-05, | |
| "loss": 0.3929, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.4887044721069617, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.5116779348494162e-05, | |
| "loss": 0.4247, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.491778085139081, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.5086047940995699e-05, | |
| "loss": 0.4078, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4948516981712002, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 1.5055316533497236e-05, | |
| "loss": 0.3808, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.4979253112033195, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.5024585125998772e-05, | |
| "loss": 0.4193, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5009989242354388, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.4993853718500309e-05, | |
| "loss": 0.3915, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.504072537267558, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.4963122311001846e-05, | |
| "loss": 0.4306, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.5071461502996772, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.4932390903503382e-05, | |
| "loss": 0.4278, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5102197633317965, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.490165949600492e-05, | |
| "loss": 0.4045, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5132933763639158, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.4870928088506456e-05, | |
| "loss": 0.3882, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5163669893960351, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.4840196681007993e-05, | |
| "loss": 0.4204, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5194406024281543, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.480946527350953e-05, | |
| "loss": 0.362, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.5225142154602735, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.4778733866011064e-05, | |
| "loss": 0.3536, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5255878284923928, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.4748002458512601e-05, | |
| "loss": 0.3586, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5286614415245121, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.4717271051014138e-05, | |
| "loss": 0.3937, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5317350545566313, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.4686539643515674e-05, | |
| "loss": 0.3939, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5348086675887506, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.4655808236017211e-05, | |
| "loss": 0.3851, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.5378822806208698, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.4625076828518748e-05, | |
| "loss": 0.429, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5409558936529891, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.4594345421020284e-05, | |
| "loss": 0.3902, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5440295066851083, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.4563614013521821e-05, | |
| "loss": 0.3783, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5471031197172276, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.4532882606023358e-05, | |
| "loss": 0.4017, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5501767327493469, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.4502151198524894e-05, | |
| "loss": 0.3858, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.5532503457814661, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.4471419791026431e-05, | |
| "loss": 0.3847, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5563239588135853, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.4440688383527968e-05, | |
| "loss": 0.4029, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5593975718457046, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.4409956976029505e-05, | |
| "loss": 0.3941, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5624711848778239, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.437922556853104e-05, | |
| "loss": 0.3994, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5655447979099432, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.4348494161032576e-05, | |
| "loss": 0.3863, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.5686184109420624, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.4317762753534113e-05, | |
| "loss": 0.3733, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5716920239741816, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.428703134603565e-05, | |
| "loss": 0.367, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5747656370063009, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.4256299938537186e-05, | |
| "loss": 0.3405, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.5778392500384202, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.4225568531038723e-05, | |
| "loss": 0.39, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.5809128630705395, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.419483712354026e-05, | |
| "loss": 0.3968, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.5839864761026586, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.4164105716041796e-05, | |
| "loss": 0.3926, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5870600891347779, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.4133374308543333e-05, | |
| "loss": 0.3772, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.5901337021668972, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.410264290104487e-05, | |
| "loss": 0.4271, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.5932073151990165, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.4071911493546407e-05, | |
| "loss": 0.3683, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.5962809282311357, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.4041180086047943e-05, | |
| "loss": 0.3764, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.5993545412632549, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.401044867854948e-05, | |
| "loss": 0.4047, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6024281542953742, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.3979717271051015e-05, | |
| "loss": 0.3684, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6055017673274935, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.3948985863552552e-05, | |
| "loss": 0.4148, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6085753803596127, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.3918254456054088e-05, | |
| "loss": 0.3858, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.611648993391732, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.3887523048555625e-05, | |
| "loss": 0.3473, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.6147226064238512, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.3856791641057162e-05, | |
| "loss": 0.3727, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6177962194559705, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.3826060233558698e-05, | |
| "loss": 0.3615, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6208698324880898, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.3795328826060235e-05, | |
| "loss": 0.39, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.623943445520209, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.3764597418561772e-05, | |
| "loss": 0.3736, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6270170585523283, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.3733866011063308e-05, | |
| "loss": 0.3375, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.6300906715844475, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.3703134603564845e-05, | |
| "loss": 0.3694, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6331642846165668, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.3672403196066382e-05, | |
| "loss": 0.3941, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.636237897648686, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.3641671788567919e-05, | |
| "loss": 0.333, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.6393115106808053, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.3610940381069455e-05, | |
| "loss": 0.3883, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6423851237129246, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.358020897357099e-05, | |
| "loss": 0.3817, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.6454587367450438, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.3549477566072527e-05, | |
| "loss": 0.37, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.648532349777163, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.3518746158574064e-05, | |
| "loss": 0.372, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6516059628092823, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.34880147510756e-05, | |
| "loss": 0.388, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6546795758414016, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.3457283343577137e-05, | |
| "loss": 0.3619, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6577531888735209, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.3426551936078674e-05, | |
| "loss": 0.408, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.66082680190564, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.339582052858021e-05, | |
| "loss": 0.3902, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6639004149377593, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.3365089121081747e-05, | |
| "loss": 0.3731, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6669740279698786, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.3334357713583284e-05, | |
| "loss": 0.3645, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6700476410019979, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.330362630608482e-05, | |
| "loss": 0.3926, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.6731212540341172, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.3272894898586357e-05, | |
| "loss": 0.3805, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.6761948670662363, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.3242163491087894e-05, | |
| "loss": 0.3481, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6792684800983556, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.321143208358943e-05, | |
| "loss": 0.3531, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.6823420931304749, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.3180700676090966e-05, | |
| "loss": 0.3419, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.6854157061625942, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.3149969268592502e-05, | |
| "loss": 0.3551, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.6884893191947133, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.3119237861094039e-05, | |
| "loss": 0.3668, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.6915629322268326, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.3088506453595576e-05, | |
| "loss": 0.3915, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6946365452589519, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.3057775046097112e-05, | |
| "loss": 0.3845, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.6977101582910712, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.3027043638598649e-05, | |
| "loss": 0.3735, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7007837713231904, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.2996312231100186e-05, | |
| "loss": 0.355, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7038573843553096, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.2965580823601722e-05, | |
| "loss": 0.3618, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.7069309973874289, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.293484941610326e-05, | |
| "loss": 0.3698, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7100046104195482, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.2904118008604796e-05, | |
| "loss": 0.359, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7130782234516674, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.2873386601106333e-05, | |
| "loss": 0.3443, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7161518364837867, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.284265519360787e-05, | |
| "loss": 0.3849, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.719225449515906, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1.2811923786109406e-05, | |
| "loss": 0.4124, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.7222990625480252, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.2781192378610941e-05, | |
| "loss": 0.3298, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7253726755801445, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.2750460971112478e-05, | |
| "loss": 0.37, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7284462886122637, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.2719729563614014e-05, | |
| "loss": 0.388, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.731519901644383, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.2688998156115551e-05, | |
| "loss": 0.3375, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7345935146765022, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.2658266748617088e-05, | |
| "loss": 0.3783, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.7376671277086215, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.2627535341118624e-05, | |
| "loss": 0.3779, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7407407407407407, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.2596803933620161e-05, | |
| "loss": 0.3391, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.74381435377286, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.2566072526121698e-05, | |
| "loss": 0.3817, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.7468879668049793, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.2535341118623235e-05, | |
| "loss": 0.3502, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.7499615798370985, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.2504609711124771e-05, | |
| "loss": 0.3552, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.7530351928692177, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.2473878303626308e-05, | |
| "loss": 0.3718, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.756108805901337, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.2443146896127845e-05, | |
| "loss": 0.3423, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7591824189334563, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.2412415488629381e-05, | |
| "loss": 0.3806, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7622560319655756, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.2381684081130916e-05, | |
| "loss": 0.353, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7653296449976947, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.2350952673632453e-05, | |
| "loss": 0.3523, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.768403258029814, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.232022126613399e-05, | |
| "loss": 0.3781, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7714768710619333, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.2289489858635526e-05, | |
| "loss": 0.3533, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.7745504840940526, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.2258758451137063e-05, | |
| "loss": 0.346, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.7776240971261719, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.22280270436386e-05, | |
| "loss": 0.3564, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.780697710158291, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.2197295636140136e-05, | |
| "loss": 0.3397, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.7837713231904103, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.2166564228641673e-05, | |
| "loss": 0.3456, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7868449362225296, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.213583282114321e-05, | |
| "loss": 0.3214, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.7899185492546489, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.2105101413644747e-05, | |
| "loss": 0.3448, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.792992162286768, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.2074370006146283e-05, | |
| "loss": 0.3729, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.7960657753188873, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.204363859864782e-05, | |
| "loss": 0.3235, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.7991393883510066, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.2012907191149357e-05, | |
| "loss": 0.3423, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8022130013831259, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.1982175783650892e-05, | |
| "loss": 0.3604, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8052866144152451, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.1951444376152428e-05, | |
| "loss": 0.3702, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8083602274473644, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.1920712968653965e-05, | |
| "loss": 0.3613, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8114338404794836, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.1889981561155502e-05, | |
| "loss": 0.3198, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.8145074535116029, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.1859250153657038e-05, | |
| "loss": 0.3552, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8175810665437221, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.1828518746158575e-05, | |
| "loss": 0.3481, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8206546795758414, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.1797787338660112e-05, | |
| "loss": 0.3437, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.8237282926079607, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.1767055931161648e-05, | |
| "loss": 0.3552, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8268019056400799, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.1736324523663185e-05, | |
| "loss": 0.3293, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.8298755186721992, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.1705593116164722e-05, | |
| "loss": 0.329, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8329491317043184, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.1674861708666259e-05, | |
| "loss": 0.338, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8360227447364377, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.1644130301167795e-05, | |
| "loss": 0.3558, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.839096357768557, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1.1613398893669332e-05, | |
| "loss": 0.3521, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8421699708006762, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1582667486170867e-05, | |
| "loss": 0.381, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.8452435838327954, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.1551936078672404e-05, | |
| "loss": 0.351, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8483171968649147, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.152120467117394e-05, | |
| "loss": 0.3478, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.851390809897034, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.1490473263675477e-05, | |
| "loss": 0.3434, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.8544644229291533, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.1459741856177014e-05, | |
| "loss": 0.3457, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.8575380359612724, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.142901044867855e-05, | |
| "loss": 0.3267, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.8606116489933917, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.1398279041180087e-05, | |
| "loss": 0.3545, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.863685262025511, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.1367547633681624e-05, | |
| "loss": 0.3394, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.8667588750576303, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.133681622618316e-05, | |
| "loss": 0.3388, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.8698324880897496, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.1306084818684697e-05, | |
| "loss": 0.3326, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.8729061011218687, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.1275353411186234e-05, | |
| "loss": 0.2943, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.875979714153988, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 1.124462200368777e-05, | |
| "loss": 0.3485, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.8790533271861073, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.1213890596189307e-05, | |
| "loss": 0.3435, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.8821269402182266, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.1183159188690842e-05, | |
| "loss": 0.336, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.8852005532503457, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.1152427781192379e-05, | |
| "loss": 0.3299, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.888274166282465, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1121696373693916e-05, | |
| "loss": 0.3236, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.8913477793145843, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.1090964966195452e-05, | |
| "loss": 0.3486, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8944213923467036, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.1060233558696989e-05, | |
| "loss": 0.361, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.8974950053788228, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.1029502151198526e-05, | |
| "loss": 0.3271, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.900568618410942, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.0998770743700062e-05, | |
| "loss": 0.3285, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.9036422314430613, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.09680393362016e-05, | |
| "loss": 0.3021, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.9067158444751806, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.0937307928703136e-05, | |
| "loss": 0.3023, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9097894575072998, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.0906576521204673e-05, | |
| "loss": 0.351, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9128630705394191, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.087584511370621e-05, | |
| "loss": 0.3478, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.9159366835715383, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.0845113706207746e-05, | |
| "loss": 0.3674, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.9190102966036576, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.0814382298709283e-05, | |
| "loss": 0.3331, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.9220839096357769, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.0783650891210818e-05, | |
| "loss": 0.3487, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9251575226678961, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.0752919483712354e-05, | |
| "loss": 0.3305, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.9282311357000154, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.0722188076213891e-05, | |
| "loss": 0.3359, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.9313047487321346, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.0691456668715428e-05, | |
| "loss": 0.3223, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.9343783617642539, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.0660725261216964e-05, | |
| "loss": 0.3388, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.9374519747963731, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.0629993853718501e-05, | |
| "loss": 0.3115, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9405255878284924, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.0599262446220038e-05, | |
| "loss": 0.3434, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.9435992008606117, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.0568531038721575e-05, | |
| "loss": 0.333, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.946672813892731, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.0537799631223111e-05, | |
| "loss": 0.3097, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.9497464269248501, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.0507068223724648e-05, | |
| "loss": 0.3393, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.9528200399569694, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.0476336816226185e-05, | |
| "loss": 0.32, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9558936529890887, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.0445605408727721e-05, | |
| "loss": 0.3458, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.958967266021208, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.0414874001229258e-05, | |
| "loss": 0.3408, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.9620408790533271, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.0384142593730793e-05, | |
| "loss": 0.3059, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.9651144920854464, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.035341118623233e-05, | |
| "loss": 0.3436, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.9681881051175657, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.0322679778733866e-05, | |
| "loss": 0.3455, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.971261718149685, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.0291948371235403e-05, | |
| "loss": 0.3236, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.9743353311818043, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.026121696373694e-05, | |
| "loss": 0.3414, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.9774089442139234, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.0230485556238476e-05, | |
| "loss": 0.3181, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.9804825572460427, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.0199754148740013e-05, | |
| "loss": 0.3312, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.983556170278162, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.016902274124155e-05, | |
| "loss": 0.3483, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9866297833102813, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.0138291333743087e-05, | |
| "loss": 0.3294, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.9897033963424005, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.0107559926244623e-05, | |
| "loss": 0.3232, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.9927770093745197, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.007682851874616e-05, | |
| "loss": 0.2939, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.995850622406639, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.0046097111247697e-05, | |
| "loss": 0.3432, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.9989242354387583, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.0015365703749233e-05, | |
| "loss": 0.3194, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.0018441678192715, | |
| "grad_norm": 3.0, | |
| "learning_rate": 9.98463429625077e-06, | |
| "loss": 0.2676, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.0049177808513907, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 9.953902888752307e-06, | |
| "loss": 0.2694, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.0079913938835101, | |
| "grad_norm": 1.875, | |
| "learning_rate": 9.923171481253843e-06, | |
| "loss": 0.2422, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.0110650069156293, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 9.892440073755378e-06, | |
| "loss": 0.2634, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.0141386199477487, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.861708666256915e-06, | |
| "loss": 0.252, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.0172122329798678, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 9.830977258758452e-06, | |
| "loss": 0.2702, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.020285846011987, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 9.800245851259988e-06, | |
| "loss": 0.2959, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.0233594590441064, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 9.769514443761525e-06, | |
| "loss": 0.2707, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.0264330720762256, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.738783036263062e-06, | |
| "loss": 0.2567, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.0295066851083448, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 9.708051628764599e-06, | |
| "loss": 0.2623, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.0325802981404641, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 9.677320221266134e-06, | |
| "loss": 0.2846, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.0356539111725833, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 9.64658881376767e-06, | |
| "loss": 0.2979, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.0387275242047027, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 9.615857406269209e-06, | |
| "loss": 0.2692, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.0418011372368219, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 9.585125998770745e-06, | |
| "loss": 0.2733, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.044874750268941, | |
| "grad_norm": 2.375, | |
| "learning_rate": 9.554394591272282e-06, | |
| "loss": 0.2625, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0479483633010604, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.523663183773819e-06, | |
| "loss": 0.283, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.0510219763331796, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 9.492931776275354e-06, | |
| "loss": 0.2823, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.054095589365299, | |
| "grad_norm": 2.25, | |
| "learning_rate": 9.46220036877689e-06, | |
| "loss": 0.2728, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.0571692023974182, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 9.431468961278427e-06, | |
| "loss": 0.2437, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.0602428154295374, | |
| "grad_norm": 2.375, | |
| "learning_rate": 9.400737553779964e-06, | |
| "loss": 0.2473, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0633164284616567, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 9.3700061462815e-06, | |
| "loss": 0.27, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.066390041493776, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 9.339274738783037e-06, | |
| "loss": 0.293, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.069463654525895, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 9.308543331284574e-06, | |
| "loss": 0.2806, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.0725372675580145, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 9.277811923786109e-06, | |
| "loss": 0.2824, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.0756108805901337, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 9.247080516287647e-06, | |
| "loss": 0.2685, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.078684493622253, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 9.216349108789184e-06, | |
| "loss": 0.2478, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.0817581066543722, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 9.18561770129072e-06, | |
| "loss": 0.2639, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.0848317196864914, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 9.154886293792257e-06, | |
| "loss": 0.2966, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.0879053327186108, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 9.124154886293794e-06, | |
| "loss": 0.2538, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.09097894575073, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.093423478795329e-06, | |
| "loss": 0.283, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0940525587828493, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 9.062692071296866e-06, | |
| "loss": 0.295, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.0971261718149685, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 9.031960663798402e-06, | |
| "loss": 0.2694, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.1001997848470877, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 9.00122925629994e-06, | |
| "loss": 0.275, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.103273397879207, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 8.970497848801476e-06, | |
| "loss": 0.3061, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.1063470109113263, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 8.939766441303013e-06, | |
| "loss": 0.2823, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.1094206239434454, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.90903503380455e-06, | |
| "loss": 0.2522, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.1124942369755648, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 8.878303626306086e-06, | |
| "loss": 0.2925, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.115567850007684, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 8.847572218807623e-06, | |
| "loss": 0.2762, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.1186414630398034, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 8.81684081130916e-06, | |
| "loss": 0.2761, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.1217150760719226, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.786109403810696e-06, | |
| "loss": 0.273, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.1247886891040417, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 8.755377996312233e-06, | |
| "loss": 0.2805, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.1278623021361611, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 8.72464658881377e-06, | |
| "loss": 0.2725, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.1309359151682803, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 8.693915181315304e-06, | |
| "loss": 0.2676, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.1340095282003997, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 8.663183773816841e-06, | |
| "loss": 0.2852, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.1370831412325189, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 8.632452366318378e-06, | |
| "loss": 0.2739, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.140156754264638, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 8.601720958819915e-06, | |
| "loss": 0.2742, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.1432303672967574, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 8.570989551321451e-06, | |
| "loss": 0.2766, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.1463039803288766, | |
| "grad_norm": 2.375, | |
| "learning_rate": 8.540258143822988e-06, | |
| "loss": 0.2897, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.1493775933609958, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 8.509526736324525e-06, | |
| "loss": 0.2664, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.1524512063931152, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 8.478795328826061e-06, | |
| "loss": 0.2723, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1555248194252343, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 8.448063921327598e-06, | |
| "loss": 0.2647, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.1585984324573535, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 8.417332513829135e-06, | |
| "loss": 0.3002, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.161672045489473, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 8.386601106330671e-06, | |
| "loss": 0.2566, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.164745658521592, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 8.355869698832208e-06, | |
| "loss": 0.2624, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.1678192715537115, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.325138291333745e-06, | |
| "loss": 0.2831, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.1708928845858306, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.29440688383528e-06, | |
| "loss": 0.2856, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.17396649761795, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 8.263675476336816e-06, | |
| "loss": 0.2612, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.1770401106500692, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.232944068838353e-06, | |
| "loss": 0.2484, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.1801137236821884, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 8.20221266133989e-06, | |
| "loss": 0.2772, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.1831873367143078, | |
| "grad_norm": 2.375, | |
| "learning_rate": 8.171481253841427e-06, | |
| "loss": 0.2308, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.186260949746427, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.140749846342963e-06, | |
| "loss": 0.2431, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.189334562778546, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 8.1100184388445e-06, | |
| "loss": 0.262, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.1924081758106655, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 8.079287031346037e-06, | |
| "loss": 0.2678, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.1954817888427847, | |
| "grad_norm": 2.75, | |
| "learning_rate": 8.048555623847573e-06, | |
| "loss": 0.2382, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.1985554018749038, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 8.01782421634911e-06, | |
| "loss": 0.2762, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.2016290149070232, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.987092808850647e-06, | |
| "loss": 0.2571, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.2047026279391424, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 7.956361401352183e-06, | |
| "loss": 0.314, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.2077762409712618, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 7.92562999385372e-06, | |
| "loss": 0.2412, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.210849854003381, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 7.894898586355255e-06, | |
| "loss": 0.2633, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.2139234670355001, | |
| "grad_norm": 2.75, | |
| "learning_rate": 7.864167178856792e-06, | |
| "loss": 0.2563, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.2169970800676195, | |
| "grad_norm": 2.125, | |
| "learning_rate": 7.833435771358328e-06, | |
| "loss": 0.2917, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.2200706930997387, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 7.802704363859865e-06, | |
| "loss": 0.2385, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.223144306131858, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 7.771972956361402e-06, | |
| "loss": 0.265, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.2262179191639773, | |
| "grad_norm": 2.75, | |
| "learning_rate": 7.741241548862939e-06, | |
| "loss": 0.2519, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.2292915321960964, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 7.710510141364475e-06, | |
| "loss": 0.2523, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.2323651452282158, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 7.679778733866012e-06, | |
| "loss": 0.2485, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.235438758260335, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 7.649047326367549e-06, | |
| "loss": 0.2591, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.2385123712924542, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 7.6183159188690845e-06, | |
| "loss": 0.2765, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.2415859843245736, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 7.587584511370621e-06, | |
| "loss": 0.2541, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.2446595973566927, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 7.556853103872158e-06, | |
| "loss": 0.2941, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.2477332103888121, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 7.526121696373695e-06, | |
| "loss": 0.2644, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.2508068234209313, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 7.4953902888752304e-06, | |
| "loss": 0.2743, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.2538804364530507, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 7.464658881376767e-06, | |
| "loss": 0.2657, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.2569540494851699, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 7.433927473878304e-06, | |
| "loss": 0.2711, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.260027662517289, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 7.4031960663798405e-06, | |
| "loss": 0.2613, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2631012755494084, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 7.372464658881377e-06, | |
| "loss": 0.2861, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.2661748885815276, | |
| "grad_norm": 2.125, | |
| "learning_rate": 7.341733251382914e-06, | |
| "loss": 0.2608, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.2692485016136468, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 7.311001843884451e-06, | |
| "loss": 0.268, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.2723221146457662, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 7.2802704363859865e-06, | |
| "loss": 0.2485, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.2753957276778853, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 7.249539028887523e-06, | |
| "loss": 0.2528, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2784693407100045, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.21880762138906e-06, | |
| "loss": 0.2659, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.281542953742124, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 7.1880762138905965e-06, | |
| "loss": 0.2727, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.284616566774243, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 7.157344806392133e-06, | |
| "loss": 0.277, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.2876901798063622, | |
| "grad_norm": 3.0, | |
| "learning_rate": 7.12661339889367e-06, | |
| "loss": 0.2812, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.2907637928384816, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 7.095881991395206e-06, | |
| "loss": 0.2952, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2938374058706008, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 7.0651505838967425e-06, | |
| "loss": 0.2447, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.2969110189027202, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 7.034419176398279e-06, | |
| "loss": 0.2678, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.2999846319348394, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 7.003687768899816e-06, | |
| "loss": 0.2617, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.3030582449669588, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 6.9729563614013526e-06, | |
| "loss": 0.2657, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.306131857999078, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 6.942224953902889e-06, | |
| "loss": 0.28, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.309205471031197, | |
| "grad_norm": 2.5, | |
| "learning_rate": 6.911493546404427e-06, | |
| "loss": 0.2695, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.3122790840633165, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 6.880762138905962e-06, | |
| "loss": 0.2686, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.3153526970954357, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 6.8500307314074985e-06, | |
| "loss": 0.2472, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.3184263101275548, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 6.819299323909035e-06, | |
| "loss": 0.2498, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.3214999231596742, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 6.788567916410572e-06, | |
| "loss": 0.2758, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.3245735361917934, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 6.7578365089121086e-06, | |
| "loss": 0.2772, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.3276471492239126, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 6.727105101413646e-06, | |
| "loss": 0.2693, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.330720762256032, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 6.696373693915181e-06, | |
| "loss": 0.275, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.3337943752881511, | |
| "grad_norm": 2.625, | |
| "learning_rate": 6.665642286416718e-06, | |
| "loss": 0.2852, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.3368679883202705, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 6.6349108789182545e-06, | |
| "loss": 0.2865, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.3399416013523897, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 6.604179471419791e-06, | |
| "loss": 0.2714, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.343015214384509, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 6.573448063921328e-06, | |
| "loss": 0.2551, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.3460888274166283, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 6.5427166564228654e-06, | |
| "loss": 0.255, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.3491624404487474, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 6.511985248924402e-06, | |
| "loss": 0.236, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.3522360534808668, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 6.481253841425937e-06, | |
| "loss": 0.2599, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.355309666512986, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 6.450522433927474e-06, | |
| "loss": 0.2586, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.3583832795451052, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 6.4197910264290105e-06, | |
| "loss": 0.2531, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.3614568925772246, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 6.389059618930547e-06, | |
| "loss": 0.2543, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.3645305056093437, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 6.358328211432085e-06, | |
| "loss": 0.2641, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.367604118641463, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 6.3275968039336215e-06, | |
| "loss": 0.2631, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3706777316735823, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 6.2968653964351565e-06, | |
| "loss": 0.2782, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.3737513447057015, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 6.266133988936693e-06, | |
| "loss": 0.2614, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.3768249577378209, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 6.23540258143823e-06, | |
| "loss": 0.2633, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.37989857076994, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 6.2046711739397665e-06, | |
| "loss": 0.2587, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.3829721838020594, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.173939766441303e-06, | |
| "loss": 0.2503, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3860457968341786, | |
| "grad_norm": 2.0, | |
| "learning_rate": 6.143208358942841e-06, | |
| "loss": 0.2426, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.3891194098662978, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 6.1124769514443775e-06, | |
| "loss": 0.2734, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.3921930228984172, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 6.0817455439459125e-06, | |
| "loss": 0.2667, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.3952666359305363, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 6.051014136447449e-06, | |
| "loss": 0.2595, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.3983402489626555, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.020282728948986e-06, | |
| "loss": 0.2494, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.401413861994775, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.9895513214505226e-06, | |
| "loss": 0.207, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.404487475026894, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.95881991395206e-06, | |
| "loss": 0.2547, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.4075610880590133, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.928088506453597e-06, | |
| "loss": 0.2823, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.4106347010911326, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.897357098955132e-06, | |
| "loss": 0.2654, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.4137083141232518, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 5.8666256914566685e-06, | |
| "loss": 0.2591, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.4167819271553712, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 5.835894283958205e-06, | |
| "loss": 0.2856, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.4198555401874904, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.805162876459742e-06, | |
| "loss": 0.2865, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.4229291532196098, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.7744314689612794e-06, | |
| "loss": 0.2241, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.426002766251729, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.743700061462816e-06, | |
| "loss": 0.2939, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.4290763792838481, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.712968653964353e-06, | |
| "loss": 0.2837, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.4321499923159675, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 5.682237246465888e-06, | |
| "loss": 0.2509, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.4352236053480867, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.6515058389674245e-06, | |
| "loss": 0.2812, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.4382972183802059, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.620774431468961e-06, | |
| "loss": 0.2667, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.4413708314123252, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 5.590043023970499e-06, | |
| "loss": 0.2411, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 5.5593116164720354e-06, | |
| "loss": 0.2485, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.4475180574765636, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.528580208973572e-06, | |
| "loss": 0.2606, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.450591670508683, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 5.497848801475107e-06, | |
| "loss": 0.2536, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.4536652835408022, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.467117393976644e-06, | |
| "loss": 0.2716, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.4567388965729215, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 5.4363859864781805e-06, | |
| "loss": 0.2297, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.4598125096050407, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.405654578979718e-06, | |
| "loss": 0.2937, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.4628861226371601, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.374923171481255e-06, | |
| "loss": 0.2927, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.4659597356692793, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.3441917639827915e-06, | |
| "loss": 0.2757, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.4690333487013985, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 5.313460356484328e-06, | |
| "loss": 0.2751, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.4721069617335178, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.282728948985863e-06, | |
| "loss": 0.2831, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.475180574765637, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.2519975414874e-06, | |
| "loss": 0.2273, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4782541877977562, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.221266133988937e-06, | |
| "loss": 0.265, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.4813278008298756, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 5.190534726490474e-06, | |
| "loss": 0.2605, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.4844014138619948, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.159803318992011e-06, | |
| "loss": 0.2595, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.487475026894114, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.1290719114935475e-06, | |
| "loss": 0.2491, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.4905486399262333, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 5.0983405039950825e-06, | |
| "loss": 0.2739, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4936222529583525, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 5.067609096496619e-06, | |
| "loss": 0.2642, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.4966958659904717, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.036877688998157e-06, | |
| "loss": 0.2876, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.499769479022591, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.006146281499693e-06, | |
| "loss": 0.2167, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.5028430920547104, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 4.97541487400123e-06, | |
| "loss": 0.2668, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.5059167050868296, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 4.944683466502766e-06, | |
| "loss": 0.274, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.5089903181189488, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.913952059004303e-06, | |
| "loss": 0.2559, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.5120639311510682, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.883220651505839e-06, | |
| "loss": 0.253, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.5151375441831874, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.852489244007376e-06, | |
| "loss": 0.2538, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.5182111572153065, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.821757836508913e-06, | |
| "loss": 0.2763, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.521284770247426, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 4.7910264290104494e-06, | |
| "loss": 0.2621, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.524358383279545, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 4.760295021511986e-06, | |
| "loss": 0.269, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.5274319963116643, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.729563614013522e-06, | |
| "loss": 0.2617, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.5305056093437837, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.698832206515059e-06, | |
| "loss": 0.2691, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.5335792223759028, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 4.668100799016595e-06, | |
| "loss": 0.2592, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.536652835408022, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 4.637369391518132e-06, | |
| "loss": 0.2441, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.5397264484401414, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.606637984019669e-06, | |
| "loss": 0.2212, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.5428000614722608, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.5759065765212054e-06, | |
| "loss": 0.2572, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.5458736745043797, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 4.545175169022741e-06, | |
| "loss": 0.2554, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.5489472875364991, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.514443761524278e-06, | |
| "loss": 0.2633, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.5520209005686185, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 4.483712354025815e-06, | |
| "loss": 0.2433, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.5550945136007377, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 4.452980946527351e-06, | |
| "loss": 0.284, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.5581681266328569, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 4.422249539028888e-06, | |
| "loss": 0.2655, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.5612417396649763, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.391518131530425e-06, | |
| "loss": 0.2448, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.5643153526970954, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.3607867240319615e-06, | |
| "loss": 0.2536, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.5673889657292146, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 4.330055316533497e-06, | |
| "loss": 0.2619, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.570462578761334, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.299323909035034e-06, | |
| "loss": 0.2762, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.5735361917934532, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 4.268592501536571e-06, | |
| "loss": 0.2555, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.5766098048255723, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.237861094038107e-06, | |
| "loss": 0.2457, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.5796834178576917, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 4.207129686539644e-06, | |
| "loss": 0.27, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.5827570308898111, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.176398279041181e-06, | |
| "loss": 0.2525, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.58583064392193, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 4.145666871542717e-06, | |
| "loss": 0.2788, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.5889042569540495, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 4.114935464044253e-06, | |
| "loss": 0.2732, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.5919778699861689, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.08420405654579e-06, | |
| "loss": 0.2415, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.595051483018288, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 4.053472649047327e-06, | |
| "loss": 0.2461, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.5981250960504072, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 4.022741241548863e-06, | |
| "loss": 0.2554, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.6011987090825266, | |
| "grad_norm": 2.5, | |
| "learning_rate": 3.9920098340504e-06, | |
| "loss": 0.2824, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.6042723221146458, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.961278426551937e-06, | |
| "loss": 0.2289, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.607345935146765, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 3.930547019053473e-06, | |
| "loss": 0.2515, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.6104195481788843, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 3.899815611555009e-06, | |
| "loss": 0.2636, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.6134931612110035, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.869084204056546e-06, | |
| "loss": 0.2656, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.6165667742431227, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 3.838352796558083e-06, | |
| "loss": 0.2758, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.619640387275242, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.807621389059619e-06, | |
| "loss": 0.2656, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.6227140003073615, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 3.776889981561156e-06, | |
| "loss": 0.2808, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.6257876133394804, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.746158574062692e-06, | |
| "loss": 0.2827, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.6288612263715998, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.7154271665642287e-06, | |
| "loss": 0.2532, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.6319348394037192, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 3.6846957590657658e-06, | |
| "loss": 0.2599, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.6350084524358384, | |
| "grad_norm": 2.625, | |
| "learning_rate": 3.6539643515673025e-06, | |
| "loss": 0.2759, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.6380820654679575, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 3.6232329440688383e-06, | |
| "loss": 0.262, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.641155678500077, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 3.5925015365703754e-06, | |
| "loss": 0.2804, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.644229291532196, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.561770129071912e-06, | |
| "loss": 0.2631, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.6473029045643153, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 3.531038721573448e-06, | |
| "loss": 0.2603, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.6503765175964347, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 3.500307314074985e-06, | |
| "loss": 0.234, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.6534501306285538, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 3.469575906576522e-06, | |
| "loss": 0.2589, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.656523743660673, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 3.4388444990780576e-06, | |
| "loss": 0.2643, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.6595973566927924, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 3.4081130915795948e-06, | |
| "loss": 0.252, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.6626709697249118, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 3.3773816840811315e-06, | |
| "loss": 0.2721, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.6657445827570307, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 3.3466502765826673e-06, | |
| "loss": 0.2749, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.6688181957891501, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 3.3159188690842044e-06, | |
| "loss": 0.2675, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.6718918088212695, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 3.285187461585741e-06, | |
| "loss": 0.2348, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.6749654218533887, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.254456054087278e-06, | |
| "loss": 0.2552, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6780390348855079, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.223724646588814e-06, | |
| "loss": 0.306, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.6811126479176273, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.1929932390903508e-06, | |
| "loss": 0.2652, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.6841862609497464, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.1622618315918875e-06, | |
| "loss": 0.2572, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.6872598739818656, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.1315304240934238e-06, | |
| "loss": 0.2807, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.690333487013985, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 3.1007990165949604e-06, | |
| "loss": 0.2612, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.6934071000461042, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.070067609096497e-06, | |
| "loss": 0.2599, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.6964807130782233, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.0393362015980334e-06, | |
| "loss": 0.2675, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.6995543261103427, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 3.00860479409957e-06, | |
| "loss": 0.2352, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.702627939142462, | |
| "grad_norm": 1.875, | |
| "learning_rate": 2.977873386601107e-06, | |
| "loss": 0.262, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.705701552174581, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 2.947141979102643e-06, | |
| "loss": 0.2622, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.7087751652067005, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 2.9164105716041798e-06, | |
| "loss": 0.2577, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.7118487782388199, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.8856791641057165e-06, | |
| "loss": 0.2688, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.714922391270939, | |
| "grad_norm": 3.0, | |
| "learning_rate": 2.854947756607253e-06, | |
| "loss": 0.2794, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.7179960043030582, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 2.8242163491087894e-06, | |
| "loss": 0.2451, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.7210696173351776, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.793484941610326e-06, | |
| "loss": 0.2478, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.7241432303672968, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.762753534111863e-06, | |
| "loss": 0.2533, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.727216843399416, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.732022126613399e-06, | |
| "loss": 0.2803, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.7302904564315353, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.7012907191149358e-06, | |
| "loss": 0.236, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.7333640694636545, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 2.6705593116164725e-06, | |
| "loss": 0.2735, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.7364376824957737, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 2.6398279041180088e-06, | |
| "loss": 0.2486, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.739511295527893, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.6090964966195454e-06, | |
| "loss": 0.2673, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.7425849085600122, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 2.578365089121082e-06, | |
| "loss": 0.2445, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.7456585215921314, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 2.5476336816226184e-06, | |
| "loss": 0.2702, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.7487321346242508, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.516902274124155e-06, | |
| "loss": 0.2502, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.7518057476563702, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.486170866625692e-06, | |
| "loss": 0.2615, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.7548793606884892, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 2.455439459127228e-06, | |
| "loss": 0.2696, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.7579529737206085, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.4247080516287648e-06, | |
| "loss": 0.2489, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.761026586752728, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 2.3939766441303015e-06, | |
| "loss": 0.2448, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.7641001997848471, | |
| "grad_norm": 2.875, | |
| "learning_rate": 2.3632452366318377e-06, | |
| "loss": 0.2512, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.7671738128169663, | |
| "grad_norm": 2.125, | |
| "learning_rate": 2.3325138291333744e-06, | |
| "loss": 0.2481, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.7702474258490857, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 2.301782421634911e-06, | |
| "loss": 0.2504, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.7733210388812048, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.2710510141364474e-06, | |
| "loss": 0.2581, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.776394651913324, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.240319606637984e-06, | |
| "loss": 0.2627, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.7794682649454434, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.2095881991395208e-06, | |
| "loss": 0.25, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.7825418779775626, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 2.1788567916410575e-06, | |
| "loss": 0.2664, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.7856154910096818, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 2.1481253841425938e-06, | |
| "loss": 0.2627, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.7886891040418011, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.1173939766441304e-06, | |
| "loss": 0.2092, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.7917627170739205, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 2.086662569145667e-06, | |
| "loss": 0.2603, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.7948363301060395, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 2.0559311616472034e-06, | |
| "loss": 0.2244, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.7979099431381589, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.02519975414874e-06, | |
| "loss": 0.2542, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.8009835561702783, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.994468346650277e-06, | |
| "loss": 0.2478, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.8040571692023974, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.963736939151813e-06, | |
| "loss": 0.2253, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.8071307822345166, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.9330055316533498e-06, | |
| "loss": 0.2727, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.810204395266636, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.9022741241548865e-06, | |
| "loss": 0.273, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.8132780082987552, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.871542716656423e-06, | |
| "loss": 0.2747, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.8163516213308744, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.8408113091579596e-06, | |
| "loss": 0.2624, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.8194252343629937, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.8100799016594961e-06, | |
| "loss": 0.2392, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.822498847395113, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.7793484941610328e-06, | |
| "loss": 0.2444, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.825572460427232, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.7486170866625693e-06, | |
| "loss": 0.2719, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.8286460734593515, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1.7178856791641058e-06, | |
| "loss": 0.2604, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.8317196864914709, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.6871542716656425e-06, | |
| "loss": 0.2628, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.8347932995235898, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.656422864167179e-06, | |
| "loss": 0.2842, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.8378669125557092, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 1.6256914566687157e-06, | |
| "loss": 0.2726, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.8409405255878286, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.5949600491702521e-06, | |
| "loss": 0.2798, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.8440141386199478, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.5642286416717886e-06, | |
| "loss": 0.2865, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.847087751652067, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.5334972341733253e-06, | |
| "loss": 0.2594, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.8501613646841863, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.5027658266748618e-06, | |
| "loss": 0.2585, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.8532349777163055, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.4720344191763983e-06, | |
| "loss": 0.2758, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.8563085907484247, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.441303011677935e-06, | |
| "loss": 0.2722, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.859382203780544, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.4105716041794715e-06, | |
| "loss": 0.2812, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.8624558168126633, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.3798401966810082e-06, | |
| "loss": 0.2263, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.8655294298447824, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.3491087891825446e-06, | |
| "loss": 0.2759, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.8686030428769018, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.3183773816840811e-06, | |
| "loss": 0.2584, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.8716766559090212, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.2876459741856178e-06, | |
| "loss": 0.2728, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.8747502689411402, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.2569145666871543e-06, | |
| "loss": 0.2466, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.8778238819732596, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.226183159188691e-06, | |
| "loss": 0.2428, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.880897495005379, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1.1954517516902275e-06, | |
| "loss": 0.2383, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.8839711080374981, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.1647203441917642e-06, | |
| "loss": 0.2902, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.8870447210696173, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.1339889366933007e-06, | |
| "loss": 0.2328, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.8901183341017367, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.1032575291948371e-06, | |
| "loss": 0.2504, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.8931919471338559, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.0725261216963738e-06, | |
| "loss": 0.2605, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.896265560165975, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.0417947141979103e-06, | |
| "loss": 0.2615, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.8993391731980944, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1.0110633066994468e-06, | |
| "loss": 0.2613, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.9024127862302136, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 9.803318992009835e-07, | |
| "loss": 0.2761, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.9054863992623328, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 9.496004917025201e-07, | |
| "loss": 0.2589, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.9085600122944522, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 9.188690842040567e-07, | |
| "loss": 0.2762, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.9116336253265716, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 8.881376767055933e-07, | |
| "loss": 0.2326, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.9147072383586905, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 8.574062692071297e-07, | |
| "loss": 0.2404, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.91778085139081, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 8.266748617086663e-07, | |
| "loss": 0.2708, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.9208544644229293, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 7.959434542102029e-07, | |
| "loss": 0.2478, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.9239280774550485, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 7.652120467117395e-07, | |
| "loss": 0.2615, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.9270016904871676, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 7.34480639213276e-07, | |
| "loss": 0.2484, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 1.930075303519287, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 7.037492317148126e-07, | |
| "loss": 0.2866, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.9331489165514062, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 6.730178242163492e-07, | |
| "loss": 0.2632, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 1.9362225295835254, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 6.422864167178858e-07, | |
| "loss": 0.2576, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.9392961426156448, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.115550092194224e-07, | |
| "loss": 0.2428, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 1.942369755647764, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.808236017209588e-07, | |
| "loss": 0.2435, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.945443368679883, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.500921942224954e-07, | |
| "loss": 0.2537, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 1.9485169817120025, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.19360786724032e-07, | |
| "loss": 0.2701, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.9515905947441217, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 4.886293792255686e-07, | |
| "loss": 0.1977, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.9546642077762408, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 4.5789797172710514e-07, | |
| "loss": 0.2635, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.9577378208083602, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.271665642286417e-07, | |
| "loss": 0.2385, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 1.9608114338404796, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.964351567301783e-07, | |
| "loss": 0.2657, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.9638850468725988, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.657037492317148e-07, | |
| "loss": 0.2301, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 1.966958659904718, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.3497234173325144e-07, | |
| "loss": 0.2379, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.9700322729368374, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.04240934234788e-07, | |
| "loss": 0.2598, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 1.9731058859689565, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 2.7350952673632457e-07, | |
| "loss": 0.253, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.9761794990010757, | |
| "grad_norm": 2.375, | |
| "learning_rate": 2.427781192378611e-07, | |
| "loss": 0.2614, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 1.979253112033195, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 2.120467117393977e-07, | |
| "loss": 0.2474, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.9823267250653143, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.8131530424093426e-07, | |
| "loss": 0.2825, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.9854003380974334, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.5058389674247082e-07, | |
| "loss": 0.2705, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.9884739511295528, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.1985248924400738e-07, | |
| "loss": 0.2716, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.991547564161672, | |
| "grad_norm": 2.25, | |
| "learning_rate": 8.912108174554396e-08, | |
| "loss": 0.2911, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.9946211771937912, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.838967424708052e-08, | |
| "loss": 0.2304, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.9976947902259106, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.7658266748617086e-08, | |
| "loss": 0.2298, | |
| "step": 6500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6508, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.59613886778096e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |