| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.15362377537410882, | |
| "eval_steps": 501, | |
| "global_step": 5500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 2.793159552256524e-05, | |
| "grad_norm": 37.485023498535156, | |
| "learning_rate": 2e-06, | |
| "loss": 2.1306, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00027931595522565235, | |
| "grad_norm": 27.45977210998535, | |
| "learning_rate": 1.9994972347913524e-06, | |
| "loss": 2.0875, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0005586319104513047, | |
| "grad_norm": 18.15777015686035, | |
| "learning_rate": 1.998938606781744e-06, | |
| "loss": 1.9196, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0008379478656769571, | |
| "grad_norm": 20.21579360961914, | |
| "learning_rate": 1.9983799787721355e-06, | |
| "loss": 1.8183, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0011172638209026094, | |
| "grad_norm": 14.037498474121094, | |
| "learning_rate": 1.9978213507625273e-06, | |
| "loss": 1.7441, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.001396579776128262, | |
| "grad_norm": 13.131041526794434, | |
| "learning_rate": 1.9972627227529187e-06, | |
| "loss": 1.6705, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0016758957313539142, | |
| "grad_norm": 14.493760108947754, | |
| "learning_rate": 1.99670409474331e-06, | |
| "loss": 1.7261, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0019552116865795667, | |
| "grad_norm": 15.630404472351074, | |
| "learning_rate": 1.996145466733702e-06, | |
| "loss": 1.7375, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.002234527641805219, | |
| "grad_norm": 12.822752952575684, | |
| "learning_rate": 1.9955868387240936e-06, | |
| "loss": 1.6683, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0025138435970308713, | |
| "grad_norm": 12.364542961120605, | |
| "learning_rate": 1.995028210714485e-06, | |
| "loss": 1.5954, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.002793159552256524, | |
| "grad_norm": 14.236380577087402, | |
| "learning_rate": 1.994469582704877e-06, | |
| "loss": 1.6029, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0030724755074821764, | |
| "grad_norm": 13.032842636108398, | |
| "learning_rate": 1.9939109546952686e-06, | |
| "loss": 1.6026, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0033517914627078284, | |
| "grad_norm": 14.683618545532227, | |
| "learning_rate": 1.99335232668566e-06, | |
| "loss": 1.6354, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.003631107417933481, | |
| "grad_norm": 12.228134155273438, | |
| "learning_rate": 1.9927936986760517e-06, | |
| "loss": 1.5176, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0039104233731591335, | |
| "grad_norm": 14.089844703674316, | |
| "learning_rate": 1.992235070666443e-06, | |
| "loss": 1.5644, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.004189739328384786, | |
| "grad_norm": 13.192647933959961, | |
| "learning_rate": 1.9916764426568345e-06, | |
| "loss": 1.6016, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.004469055283610438, | |
| "grad_norm": 13.630595207214355, | |
| "learning_rate": 1.9911178146472263e-06, | |
| "loss": 1.5195, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.00474837123883609, | |
| "grad_norm": 12.795489311218262, | |
| "learning_rate": 1.990559186637618e-06, | |
| "loss": 1.4888, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.005027687194061743, | |
| "grad_norm": 14.711332321166992, | |
| "learning_rate": 1.9900005586280094e-06, | |
| "loss": 1.4658, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.005307003149287395, | |
| "grad_norm": 14.496808052062988, | |
| "learning_rate": 1.989441930618401e-06, | |
| "loss": 1.5231, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.005586319104513048, | |
| "grad_norm": 11.26836109161377, | |
| "learning_rate": 1.988883302608793e-06, | |
| "loss": 1.3865, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0058656350597387, | |
| "grad_norm": 12.824557304382324, | |
| "learning_rate": 1.9883246745991844e-06, | |
| "loss": 1.4743, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.006144951014964353, | |
| "grad_norm": 13.50056266784668, | |
| "learning_rate": 1.9877660465895757e-06, | |
| "loss": 1.3856, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.006424266970190004, | |
| "grad_norm": 12.35004997253418, | |
| "learning_rate": 1.9872074185799675e-06, | |
| "loss": 1.4165, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.006703582925415657, | |
| "grad_norm": 11.969117164611816, | |
| "learning_rate": 1.986648790570359e-06, | |
| "loss": 1.3719, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.006982898880641309, | |
| "grad_norm": 12.795903205871582, | |
| "learning_rate": 1.9860901625607507e-06, | |
| "loss": 1.4586, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.007262214835866962, | |
| "grad_norm": 14.326574325561523, | |
| "learning_rate": 1.9855315345511424e-06, | |
| "loss": 1.4185, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0075415307910926144, | |
| "grad_norm": 14.065360069274902, | |
| "learning_rate": 1.984972906541534e-06, | |
| "loss": 1.3441, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.007820846746318267, | |
| "grad_norm": 13.229423522949219, | |
| "learning_rate": 1.9844142785319256e-06, | |
| "loss": 1.4299, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.00810016270154392, | |
| "grad_norm": 10.81311321258545, | |
| "learning_rate": 1.9838556505223174e-06, | |
| "loss": 1.4352, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.008379478656769572, | |
| "grad_norm": 11.971837043762207, | |
| "learning_rate": 1.9832970225127088e-06, | |
| "loss": 1.3721, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.008658794611995225, | |
| "grad_norm": 10.858203887939453, | |
| "learning_rate": 1.9827383945031e-06, | |
| "loss": 1.3593, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.008938110567220875, | |
| "grad_norm": 13.713777542114258, | |
| "learning_rate": 1.982179766493492e-06, | |
| "loss": 1.3732, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.009217426522446528, | |
| "grad_norm": 10.59002685546875, | |
| "learning_rate": 1.9816211384838833e-06, | |
| "loss": 1.3841, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.00949674247767218, | |
| "grad_norm": 12.030527114868164, | |
| "learning_rate": 1.981062510474275e-06, | |
| "loss": 1.3584, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.009776058432897833, | |
| "grad_norm": 13.677680969238281, | |
| "learning_rate": 1.980503882464667e-06, | |
| "loss": 1.3468, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.010055374388123485, | |
| "grad_norm": 11.954497337341309, | |
| "learning_rate": 1.9799452544550582e-06, | |
| "loss": 1.3558, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.010334690343349138, | |
| "grad_norm": 11.743765830993652, | |
| "learning_rate": 1.97938662644545e-06, | |
| "loss": 1.33, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.01061400629857479, | |
| "grad_norm": 14.092965126037598, | |
| "learning_rate": 1.978827998435842e-06, | |
| "loss": 1.3247, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.010893322253800443, | |
| "grad_norm": 12.351668357849121, | |
| "learning_rate": 1.978269370426233e-06, | |
| "loss": 1.359, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.011172638209026095, | |
| "grad_norm": 12.778825759887695, | |
| "learning_rate": 1.9777107424166245e-06, | |
| "loss": 1.3317, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.011451954164251748, | |
| "grad_norm": 13.161787986755371, | |
| "learning_rate": 1.9771521144070163e-06, | |
| "loss": 1.3726, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0117312701194774, | |
| "grad_norm": 12.683723449707031, | |
| "learning_rate": 1.9765934863974077e-06, | |
| "loss": 1.2869, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.012010586074703053, | |
| "grad_norm": 11.430862426757812, | |
| "learning_rate": 1.9760348583877995e-06, | |
| "loss": 1.3741, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.012289902029928705, | |
| "grad_norm": 12.193629264831543, | |
| "learning_rate": 1.9754762303781913e-06, | |
| "loss": 1.3247, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.012569217985154356, | |
| "grad_norm": 12.044336318969727, | |
| "learning_rate": 1.9749176023685826e-06, | |
| "loss": 1.3258, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.012848533940380009, | |
| "grad_norm": 13.162397384643555, | |
| "learning_rate": 1.9743589743589744e-06, | |
| "loss": 1.3035, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.013127849895605661, | |
| "grad_norm": 13.332141876220703, | |
| "learning_rate": 1.9738003463493658e-06, | |
| "loss": 1.2725, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.013407165850831314, | |
| "grad_norm": 11.433170318603516, | |
| "learning_rate": 1.9732417183397576e-06, | |
| "loss": 1.293, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.013686481806056966, | |
| "grad_norm": 11.537554740905762, | |
| "learning_rate": 1.972683090330149e-06, | |
| "loss": 1.3865, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.013965797761282619, | |
| "grad_norm": 13.405438423156738, | |
| "learning_rate": 1.9721244623205407e-06, | |
| "loss": 1.3675, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.013993729356805184, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.3878909349441528, | |
| "eval_runtime": 33.2902, | |
| "eval_samples_per_second": 15.019, | |
| "eval_steps_per_second": 1.892, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.014245113716508271, | |
| "grad_norm": 13.519207000732422, | |
| "learning_rate": 1.971565834310932e-06, | |
| "loss": 1.2924, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.014524429671733924, | |
| "grad_norm": 12.743926048278809, | |
| "learning_rate": 1.971007206301324e-06, | |
| "loss": 1.3396, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.014803745626959576, | |
| "grad_norm": 11.494955062866211, | |
| "learning_rate": 1.9704485782917157e-06, | |
| "loss": 1.3783, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.015083061582185229, | |
| "grad_norm": 13.423910140991211, | |
| "learning_rate": 1.969889950282107e-06, | |
| "loss": 1.3687, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.015362377537410881, | |
| "grad_norm": 12.793667793273926, | |
| "learning_rate": 1.969331322272499e-06, | |
| "loss": 1.3364, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.015641693492636534, | |
| "grad_norm": 12.060896873474121, | |
| "learning_rate": 1.96877269426289e-06, | |
| "loss": 1.3495, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.015921009447862185, | |
| "grad_norm": 10.879355430603027, | |
| "learning_rate": 1.968214066253282e-06, | |
| "loss": 1.2966, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.01620032540308784, | |
| "grad_norm": 11.109333038330078, | |
| "learning_rate": 1.9676554382436733e-06, | |
| "loss": 1.3587, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.01647964135831349, | |
| "grad_norm": 14.08962345123291, | |
| "learning_rate": 1.967096810234065e-06, | |
| "loss": 1.3881, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.016758957313539144, | |
| "grad_norm": 13.27667236328125, | |
| "learning_rate": 1.9665381822244565e-06, | |
| "loss": 1.3097, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.017038273268764795, | |
| "grad_norm": 11.379706382751465, | |
| "learning_rate": 1.9659795542148483e-06, | |
| "loss": 1.305, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.01731758922399045, | |
| "grad_norm": 12.25299072265625, | |
| "learning_rate": 1.96542092620524e-06, | |
| "loss": 1.3136, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.0175969051792161, | |
| "grad_norm": 11.619131088256836, | |
| "learning_rate": 1.9648622981956314e-06, | |
| "loss": 1.3265, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.01787622113444175, | |
| "grad_norm": 13.470244407653809, | |
| "learning_rate": 1.9643036701860228e-06, | |
| "loss": 1.3938, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.018155537089667405, | |
| "grad_norm": 12.438233375549316, | |
| "learning_rate": 1.9637450421764146e-06, | |
| "loss": 1.3579, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.018434853044893056, | |
| "grad_norm": 11.806841850280762, | |
| "learning_rate": 1.9631864141668064e-06, | |
| "loss": 1.3165, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.01871416900011871, | |
| "grad_norm": 10.943819999694824, | |
| "learning_rate": 1.9626277861571977e-06, | |
| "loss": 1.3435, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.01899348495534436, | |
| "grad_norm": 11.002156257629395, | |
| "learning_rate": 1.9620691581475895e-06, | |
| "loss": 1.3137, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.019272800910570015, | |
| "grad_norm": 11.192991256713867, | |
| "learning_rate": 1.961510530137981e-06, | |
| "loss": 1.341, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.019552116865795666, | |
| "grad_norm": 11.328652381896973, | |
| "learning_rate": 1.9609519021283727e-06, | |
| "loss": 1.3744, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.01983143282102132, | |
| "grad_norm": 11.382583618164062, | |
| "learning_rate": 1.9603932741187645e-06, | |
| "loss": 1.2749, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.02011074877624697, | |
| "grad_norm": 9.765230178833008, | |
| "learning_rate": 1.959834646109156e-06, | |
| "loss": 1.3148, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.020390064731472625, | |
| "grad_norm": 10.793863296508789, | |
| "learning_rate": 1.959276018099547e-06, | |
| "loss": 1.2991, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.020669380686698276, | |
| "grad_norm": 12.697861671447754, | |
| "learning_rate": 1.958717390089939e-06, | |
| "loss": 1.3673, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.02094869664192393, | |
| "grad_norm": 11.78731632232666, | |
| "learning_rate": 1.9581587620803308e-06, | |
| "loss": 1.36, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.02122801259714958, | |
| "grad_norm": 11.723365783691406, | |
| "learning_rate": 1.957600134070722e-06, | |
| "loss": 1.3558, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.02150732855237523, | |
| "grad_norm": 11.155319213867188, | |
| "learning_rate": 1.957041506061114e-06, | |
| "loss": 1.3266, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.021786644507600886, | |
| "grad_norm": 11.003241539001465, | |
| "learning_rate": 1.9564828780515053e-06, | |
| "loss": 1.3161, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.022065960462826537, | |
| "grad_norm": 11.691163063049316, | |
| "learning_rate": 1.955924250041897e-06, | |
| "loss": 1.3782, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.02234527641805219, | |
| "grad_norm": 13.002456665039062, | |
| "learning_rate": 1.955365622032289e-06, | |
| "loss": 1.3738, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.02262459237327784, | |
| "grad_norm": 10.829326629638672, | |
| "learning_rate": 1.9548069940226802e-06, | |
| "loss": 1.3089, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.022903908328503496, | |
| "grad_norm": 11.259895324707031, | |
| "learning_rate": 1.9542483660130716e-06, | |
| "loss": 1.3002, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.023183224283729147, | |
| "grad_norm": 12.811477661132812, | |
| "learning_rate": 1.9536897380034634e-06, | |
| "loss": 1.3126, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.0234625402389548, | |
| "grad_norm": 11.347965240478516, | |
| "learning_rate": 1.953131109993855e-06, | |
| "loss": 1.3364, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.02374185619418045, | |
| "grad_norm": 12.316996574401855, | |
| "learning_rate": 1.9525724819842465e-06, | |
| "loss": 1.3208, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.024021172149406106, | |
| "grad_norm": 11.446920394897461, | |
| "learning_rate": 1.9520138539746383e-06, | |
| "loss": 1.3292, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.024300488104631757, | |
| "grad_norm": 11.28432559967041, | |
| "learning_rate": 1.9514552259650297e-06, | |
| "loss": 1.3331, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.02457980405985741, | |
| "grad_norm": 11.215639114379883, | |
| "learning_rate": 1.9508965979554215e-06, | |
| "loss": 1.3026, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.02485912001508306, | |
| "grad_norm": 11.234190940856934, | |
| "learning_rate": 1.950337969945813e-06, | |
| "loss": 1.2926, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.025138435970308712, | |
| "grad_norm": 11.294180870056152, | |
| "learning_rate": 1.9497793419362046e-06, | |
| "loss": 1.3644, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.025417751925534367, | |
| "grad_norm": 11.346322059631348, | |
| "learning_rate": 1.949220713926596e-06, | |
| "loss": 1.3124, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.025697067880760018, | |
| "grad_norm": 11.497020721435547, | |
| "learning_rate": 1.9486620859169878e-06, | |
| "loss": 1.2695, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.025976383835985672, | |
| "grad_norm": 10.896917343139648, | |
| "learning_rate": 1.9481034579073796e-06, | |
| "loss": 1.3141, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.026255699791211323, | |
| "grad_norm": 10.956721305847168, | |
| "learning_rate": 1.947544829897771e-06, | |
| "loss": 1.36, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.026535015746436977, | |
| "grad_norm": 11.796623229980469, | |
| "learning_rate": 1.9469862018881627e-06, | |
| "loss": 1.3586, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.026814331701662628, | |
| "grad_norm": 11.082508087158203, | |
| "learning_rate": 1.946427573878554e-06, | |
| "loss": 1.3514, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.027093647656888282, | |
| "grad_norm": 11.789264678955078, | |
| "learning_rate": 1.945868945868946e-06, | |
| "loss": 1.328, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.027372963612113933, | |
| "grad_norm": 11.333861351013184, | |
| "learning_rate": 1.9453103178593372e-06, | |
| "loss": 1.2765, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.027652279567339587, | |
| "grad_norm": 12.05320930480957, | |
| "learning_rate": 1.944751689849729e-06, | |
| "loss": 1.3679, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.027931595522565238, | |
| "grad_norm": 12.946321487426758, | |
| "learning_rate": 1.9441930618401204e-06, | |
| "loss": 1.3105, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02798745871361037, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.364721655845642, | |
| "eval_runtime": 34.1546, | |
| "eval_samples_per_second": 14.639, | |
| "eval_steps_per_second": 1.845, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 0.02821091147779089, | |
| "grad_norm": 12.720813751220703, | |
| "learning_rate": 1.943634433830512e-06, | |
| "loss": 1.2763, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.028490227433016543, | |
| "grad_norm": 10.137106895446777, | |
| "learning_rate": 1.943075805820904e-06, | |
| "loss": 1.3177, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.028769543388242193, | |
| "grad_norm": 11.257421493530273, | |
| "learning_rate": 1.9425171778112953e-06, | |
| "loss": 1.3078, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.029048859343467848, | |
| "grad_norm": 11.93409538269043, | |
| "learning_rate": 1.941958549801687e-06, | |
| "loss": 1.3251, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.0293281752986935, | |
| "grad_norm": 12.464277267456055, | |
| "learning_rate": 1.9413999217920785e-06, | |
| "loss": 1.3199, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.029607491253919153, | |
| "grad_norm": 12.42292308807373, | |
| "learning_rate": 1.9408412937824703e-06, | |
| "loss": 1.2815, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.029886807209144804, | |
| "grad_norm": 11.653295516967773, | |
| "learning_rate": 1.9402826657728616e-06, | |
| "loss": 1.2948, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.030166123164370458, | |
| "grad_norm": 12.255006790161133, | |
| "learning_rate": 1.9397240377632534e-06, | |
| "loss": 1.3263, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.03044543911959611, | |
| "grad_norm": 10.424007415771484, | |
| "learning_rate": 1.939165409753645e-06, | |
| "loss": 1.2892, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.030724755074821763, | |
| "grad_norm": 10.664515495300293, | |
| "learning_rate": 1.9386067817440366e-06, | |
| "loss": 1.3407, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.031004071030047414, | |
| "grad_norm": 12.733943939208984, | |
| "learning_rate": 1.9380481537344284e-06, | |
| "loss": 1.348, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.03128338698527307, | |
| "grad_norm": 10.41376781463623, | |
| "learning_rate": 1.9374895257248197e-06, | |
| "loss": 1.3827, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.03156270294049872, | |
| "grad_norm": 13.944782257080078, | |
| "learning_rate": 1.9369308977152115e-06, | |
| "loss": 1.3218, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.03184201889572437, | |
| "grad_norm": 12.373078346252441, | |
| "learning_rate": 1.936372269705603e-06, | |
| "loss": 1.2725, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.03212133485095002, | |
| "grad_norm": 11.583971977233887, | |
| "learning_rate": 1.9358136416959947e-06, | |
| "loss": 1.2983, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.03240065080617568, | |
| "grad_norm": 12.660507202148438, | |
| "learning_rate": 1.935255013686386e-06, | |
| "loss": 1.278, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.03267996676140133, | |
| "grad_norm": 10.222640991210938, | |
| "learning_rate": 1.934696385676778e-06, | |
| "loss": 1.2866, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.03295928271662698, | |
| "grad_norm": 12.668971061706543, | |
| "learning_rate": 1.934137757667169e-06, | |
| "loss": 1.3605, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.03323859867185263, | |
| "grad_norm": 10.59626579284668, | |
| "learning_rate": 1.933579129657561e-06, | |
| "loss": 1.321, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.03351791462707829, | |
| "grad_norm": 11.953704833984375, | |
| "learning_rate": 1.9330205016479528e-06, | |
| "loss": 1.2852, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.03379723058230394, | |
| "grad_norm": 11.2271146774292, | |
| "learning_rate": 1.932461873638344e-06, | |
| "loss": 1.3196, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.03407654653752959, | |
| "grad_norm": 10.453490257263184, | |
| "learning_rate": 1.9319032456287355e-06, | |
| "loss": 1.397, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.03435586249275524, | |
| "grad_norm": 13.665384292602539, | |
| "learning_rate": 1.9313446176191273e-06, | |
| "loss": 1.3058, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.0346351784479809, | |
| "grad_norm": 10.085427284240723, | |
| "learning_rate": 1.930785989609519e-06, | |
| "loss": 1.3081, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.03491449440320655, | |
| "grad_norm": 12.101105690002441, | |
| "learning_rate": 1.9302273615999105e-06, | |
| "loss": 1.345, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.0351938103584322, | |
| "grad_norm": 10.636537551879883, | |
| "learning_rate": 1.9296687335903022e-06, | |
| "loss": 1.3293, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.03547312631365785, | |
| "grad_norm": 12.76969051361084, | |
| "learning_rate": 1.9291101055806936e-06, | |
| "loss": 1.3403, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.0357524422688835, | |
| "grad_norm": 11.625609397888184, | |
| "learning_rate": 1.9285514775710854e-06, | |
| "loss": 1.2967, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.03603175822410916, | |
| "grad_norm": 12.158754348754883, | |
| "learning_rate": 1.927992849561477e-06, | |
| "loss": 1.3727, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.03631107417933481, | |
| "grad_norm": 13.211498260498047, | |
| "learning_rate": 1.9274342215518685e-06, | |
| "loss": 1.2925, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.03659039013456046, | |
| "grad_norm": 16.932209014892578, | |
| "learning_rate": 1.92687559354226e-06, | |
| "loss": 1.3434, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.03686970608978611, | |
| "grad_norm": 10.869868278503418, | |
| "learning_rate": 1.9263169655326517e-06, | |
| "loss": 1.3001, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.03714902204501177, | |
| "grad_norm": 11.199213027954102, | |
| "learning_rate": 1.9257583375230435e-06, | |
| "loss": 1.3927, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.03742833800023742, | |
| "grad_norm": 11.47125244140625, | |
| "learning_rate": 1.925199709513435e-06, | |
| "loss": 1.3426, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.03770765395546307, | |
| "grad_norm": 12.344675064086914, | |
| "learning_rate": 1.9246410815038266e-06, | |
| "loss": 1.3525, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.03798696991068872, | |
| "grad_norm": 12.831677436828613, | |
| "learning_rate": 1.924082453494218e-06, | |
| "loss": 1.329, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.03826628586591438, | |
| "grad_norm": 11.5836763381958, | |
| "learning_rate": 1.92352382548461e-06, | |
| "loss": 1.3188, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.03854560182114003, | |
| "grad_norm": 10.466170310974121, | |
| "learning_rate": 1.9229651974750016e-06, | |
| "loss": 1.3177, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.03882491777636568, | |
| "grad_norm": 12.394039154052734, | |
| "learning_rate": 1.922406569465393e-06, | |
| "loss": 1.3367, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.03910423373159133, | |
| "grad_norm": 10.985048294067383, | |
| "learning_rate": 1.9218479414557843e-06, | |
| "loss": 1.2887, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.03938354968681698, | |
| "grad_norm": 12.47451400756836, | |
| "learning_rate": 1.921289313446176e-06, | |
| "loss": 1.2986, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.03966286564204264, | |
| "grad_norm": 10.245006561279297, | |
| "learning_rate": 1.920730685436568e-06, | |
| "loss": 1.3413, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.03994218159726829, | |
| "grad_norm": 11.382227897644043, | |
| "learning_rate": 1.9201720574269593e-06, | |
| "loss": 1.3652, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.04022149755249394, | |
| "grad_norm": 13.765195846557617, | |
| "learning_rate": 1.919613429417351e-06, | |
| "loss": 1.3105, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.04050081350771959, | |
| "grad_norm": 10.82947063446045, | |
| "learning_rate": 1.9190548014077424e-06, | |
| "loss": 1.3128, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.04078012946294525, | |
| "grad_norm": 9.862834930419922, | |
| "learning_rate": 1.918496173398134e-06, | |
| "loss": 1.3368, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.0410594454181709, | |
| "grad_norm": 9.987138748168945, | |
| "learning_rate": 1.9179375453885256e-06, | |
| "loss": 1.3172, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.04133876137339655, | |
| "grad_norm": 10.993836402893066, | |
| "learning_rate": 1.9173789173789174e-06, | |
| "loss": 1.2961, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.0416180773286222, | |
| "grad_norm": 10.989373207092285, | |
| "learning_rate": 1.9168202893693087e-06, | |
| "loss": 1.2854, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.04189739328384786, | |
| "grad_norm": 14.129310607910156, | |
| "learning_rate": 1.9162616613597005e-06, | |
| "loss": 1.3157, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.04198118807041555, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.3546726703643799, | |
| "eval_runtime": 34.0455, | |
| "eval_samples_per_second": 14.686, | |
| "eval_steps_per_second": 1.85, | |
| "step": 1503 | |
| }, | |
| { | |
| "epoch": 0.04217670923907351, | |
| "grad_norm": 10.534819602966309, | |
| "learning_rate": 1.9157030333500923e-06, | |
| "loss": 1.3115, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.04245602519429916, | |
| "grad_norm": 10.998124122619629, | |
| "learning_rate": 1.9151444053404837e-06, | |
| "loss": 1.2958, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.04273534114952481, | |
| "grad_norm": 10.543405532836914, | |
| "learning_rate": 1.9145857773308754e-06, | |
| "loss": 1.2976, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.04301465710475046, | |
| "grad_norm": 11.423952102661133, | |
| "learning_rate": 1.914027149321267e-06, | |
| "loss": 1.2922, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.04329397305997612, | |
| "grad_norm": 10.33931827545166, | |
| "learning_rate": 1.9134685213116586e-06, | |
| "loss": 1.3221, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.04357328901520177, | |
| "grad_norm": 10.731399536132812, | |
| "learning_rate": 1.91290989330205e-06, | |
| "loss": 1.2949, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.04385260497042742, | |
| "grad_norm": 10.743152618408203, | |
| "learning_rate": 1.9123512652924418e-06, | |
| "loss": 1.275, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.04413192092565307, | |
| "grad_norm": 10.677448272705078, | |
| "learning_rate": 1.911792637282833e-06, | |
| "loss": 1.2822, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.04441123688087873, | |
| "grad_norm": 10.933751106262207, | |
| "learning_rate": 1.911234009273225e-06, | |
| "loss": 1.2784, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.04469055283610438, | |
| "grad_norm": 10.95008659362793, | |
| "learning_rate": 1.9106753812636167e-06, | |
| "loss": 1.337, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.04496986879133003, | |
| "grad_norm": 11.022769927978516, | |
| "learning_rate": 1.910116753254008e-06, | |
| "loss": 1.3194, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.04524918474655568, | |
| "grad_norm": 12.916274070739746, | |
| "learning_rate": 1.9095581252444e-06, | |
| "loss": 1.3023, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.04552850070178134, | |
| "grad_norm": 12.046470642089844, | |
| "learning_rate": 1.9089994972347912e-06, | |
| "loss": 1.2803, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.04580781665700699, | |
| "grad_norm": 10.913056373596191, | |
| "learning_rate": 1.9084408692251826e-06, | |
| "loss": 1.3405, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.04608713261223264, | |
| "grad_norm": 11.769244194030762, | |
| "learning_rate": 1.9078822412155744e-06, | |
| "loss": 1.2995, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.04636644856745829, | |
| "grad_norm": 11.765388488769531, | |
| "learning_rate": 1.907323613205966e-06, | |
| "loss": 1.3457, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.046645764522683944, | |
| "grad_norm": 11.881918907165527, | |
| "learning_rate": 1.9067649851963577e-06, | |
| "loss": 1.3367, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.0469250804779096, | |
| "grad_norm": 10.628633499145508, | |
| "learning_rate": 1.9062063571867493e-06, | |
| "loss": 1.3091, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.04720439643313525, | |
| "grad_norm": 11.146201133728027, | |
| "learning_rate": 1.9056477291771409e-06, | |
| "loss": 1.3041, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.0474837123883609, | |
| "grad_norm": 10.595499992370605, | |
| "learning_rate": 1.9050891011675325e-06, | |
| "loss": 1.3185, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.047763028343586554, | |
| "grad_norm": 12.041298866271973, | |
| "learning_rate": 1.904530473157924e-06, | |
| "loss": 1.3244, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.04804234429881221, | |
| "grad_norm": 11.456694602966309, | |
| "learning_rate": 1.9039718451483156e-06, | |
| "loss": 1.2795, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.04832166025403786, | |
| "grad_norm": 10.448249816894531, | |
| "learning_rate": 1.9034132171387072e-06, | |
| "loss": 1.2914, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.04860097620926351, | |
| "grad_norm": 11.16418170928955, | |
| "learning_rate": 1.9028545891290988e-06, | |
| "loss": 1.3405, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.048880292164489164, | |
| "grad_norm": 11.179234504699707, | |
| "learning_rate": 1.9022959611194903e-06, | |
| "loss": 1.3432, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.04915960811971482, | |
| "grad_norm": 10.457565307617188, | |
| "learning_rate": 1.9017373331098821e-06, | |
| "loss": 1.3358, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.04943892407494047, | |
| "grad_norm": 11.272239685058594, | |
| "learning_rate": 1.9011787051002737e-06, | |
| "loss": 1.2664, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.04971824003016612, | |
| "grad_norm": 11.015891075134277, | |
| "learning_rate": 1.9006200770906653e-06, | |
| "loss": 1.2642, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.049997555985391774, | |
| "grad_norm": 10.243793487548828, | |
| "learning_rate": 1.9000614490810569e-06, | |
| "loss": 1.2335, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.050276871940617425, | |
| "grad_norm": 11.970431327819824, | |
| "learning_rate": 1.8995028210714484e-06, | |
| "loss": 1.2568, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.05055618789584308, | |
| "grad_norm": 9.61301040649414, | |
| "learning_rate": 1.89894419306184e-06, | |
| "loss": 1.2969, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.050835503851068733, | |
| "grad_norm": 10.591397285461426, | |
| "learning_rate": 1.8983855650522316e-06, | |
| "loss": 1.3004, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.051114819806294384, | |
| "grad_norm": 15.13564682006836, | |
| "learning_rate": 1.8978269370426232e-06, | |
| "loss": 1.2676, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.051394135761520035, | |
| "grad_norm": 10.456026077270508, | |
| "learning_rate": 1.8972683090330148e-06, | |
| "loss": 1.254, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.05167345171674569, | |
| "grad_norm": 11.265973091125488, | |
| "learning_rate": 1.8967096810234065e-06, | |
| "loss": 1.2651, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.051952767671971344, | |
| "grad_norm": 10.13062858581543, | |
| "learning_rate": 1.8961510530137981e-06, | |
| "loss": 1.3549, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.052232083627196994, | |
| "grad_norm": 10.586962699890137, | |
| "learning_rate": 1.8955924250041897e-06, | |
| "loss": 1.326, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.052511399582422645, | |
| "grad_norm": 11.121024131774902, | |
| "learning_rate": 1.8950337969945813e-06, | |
| "loss": 1.2838, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.052790715537648296, | |
| "grad_norm": 10.71886920928955, | |
| "learning_rate": 1.8944751689849726e-06, | |
| "loss": 1.2793, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.053070031492873954, | |
| "grad_norm": 10.959943771362305, | |
| "learning_rate": 1.8939165409753644e-06, | |
| "loss": 1.2731, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.053349347448099604, | |
| "grad_norm": 11.72314453125, | |
| "learning_rate": 1.893357912965756e-06, | |
| "loss": 1.3049, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.053628663403325255, | |
| "grad_norm": 11.75049114227295, | |
| "learning_rate": 1.8927992849561476e-06, | |
| "loss": 1.273, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.053907979358550906, | |
| "grad_norm": 11.237908363342285, | |
| "learning_rate": 1.8922406569465392e-06, | |
| "loss": 1.3405, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.054187295313776564, | |
| "grad_norm": 13.297497749328613, | |
| "learning_rate": 1.891682028936931e-06, | |
| "loss": 1.3276, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.054466611269002214, | |
| "grad_norm": 12.209798812866211, | |
| "learning_rate": 1.8911234009273225e-06, | |
| "loss": 1.3376, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.054745927224227865, | |
| "grad_norm": 13.262669563293457, | |
| "learning_rate": 1.890564772917714e-06, | |
| "loss": 1.2976, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.055025243179453516, | |
| "grad_norm": 10.766546249389648, | |
| "learning_rate": 1.8900061449081055e-06, | |
| "loss": 1.3522, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.055304559134679174, | |
| "grad_norm": 10.29268741607666, | |
| "learning_rate": 1.889447516898497e-06, | |
| "loss": 1.28, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.055583875089904825, | |
| "grad_norm": 11.653640747070312, | |
| "learning_rate": 1.8888888888888888e-06, | |
| "loss": 1.2993, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.055863191045130475, | |
| "grad_norm": 10.069348335266113, | |
| "learning_rate": 1.8883302608792804e-06, | |
| "loss": 1.27, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.05597491742722074, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.3485850095748901, | |
| "eval_runtime": 34.0417, | |
| "eval_samples_per_second": 14.688, | |
| "eval_steps_per_second": 1.851, | |
| "step": 2004 | |
| }, | |
| { | |
| "epoch": 0.056142507000356126, | |
| "grad_norm": 10.894604682922363, | |
| "learning_rate": 1.887771632869672e-06, | |
| "loss": 1.3105, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.05642182295558178, | |
| "grad_norm": 11.579715728759766, | |
| "learning_rate": 1.8872130048600636e-06, | |
| "loss": 1.2776, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.056701138910807435, | |
| "grad_norm": 10.074790000915527, | |
| "learning_rate": 1.8866543768504553e-06, | |
| "loss": 1.3366, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.056980454866033085, | |
| "grad_norm": 11.219857215881348, | |
| "learning_rate": 1.886095748840847e-06, | |
| "loss": 1.2873, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.057259770821258736, | |
| "grad_norm": 10.627588272094727, | |
| "learning_rate": 1.8855371208312385e-06, | |
| "loss": 1.3311, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.05753908677648439, | |
| "grad_norm": 10.92846393585205, | |
| "learning_rate": 1.8849784928216299e-06, | |
| "loss": 1.3101, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.057818402731710045, | |
| "grad_norm": 11.262550354003906, | |
| "learning_rate": 1.8844198648120214e-06, | |
| "loss": 1.3465, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.058097718686935695, | |
| "grad_norm": 13.099771499633789, | |
| "learning_rate": 1.8838612368024132e-06, | |
| "loss": 1.3157, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.058377034642161346, | |
| "grad_norm": 9.9907865524292, | |
| "learning_rate": 1.8833026087928048e-06, | |
| "loss": 1.298, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.058656350597387, | |
| "grad_norm": 10.225235939025879, | |
| "learning_rate": 1.8827439807831964e-06, | |
| "loss": 1.2737, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.058935666552612655, | |
| "grad_norm": 14.671952247619629, | |
| "learning_rate": 1.882185352773588e-06, | |
| "loss": 1.2994, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.059214982507838305, | |
| "grad_norm": 10.452831268310547, | |
| "learning_rate": 1.8816267247639797e-06, | |
| "loss": 1.3168, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.059494298463063956, | |
| "grad_norm": 11.753946304321289, | |
| "learning_rate": 1.8810680967543713e-06, | |
| "loss": 1.3209, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.05977361441828961, | |
| "grad_norm": 11.631643295288086, | |
| "learning_rate": 1.8805094687447627e-06, | |
| "loss": 1.3339, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.06005293037351526, | |
| "grad_norm": 11.326909065246582, | |
| "learning_rate": 1.8799508407351543e-06, | |
| "loss": 1.3191, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.060332246328740916, | |
| "grad_norm": 11.047061920166016, | |
| "learning_rate": 1.8793922127255458e-06, | |
| "loss": 1.346, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.060611562283966566, | |
| "grad_norm": 11.53350830078125, | |
| "learning_rate": 1.8788335847159376e-06, | |
| "loss": 1.3125, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.06089087823919222, | |
| "grad_norm": 11.501274108886719, | |
| "learning_rate": 1.8782749567063292e-06, | |
| "loss": 1.3432, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.06117019419441787, | |
| "grad_norm": 11.525626182556152, | |
| "learning_rate": 1.8777163286967208e-06, | |
| "loss": 1.362, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.061449510149643526, | |
| "grad_norm": 13.74886703491211, | |
| "learning_rate": 1.8771577006871124e-06, | |
| "loss": 1.3157, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.061728826104869176, | |
| "grad_norm": 12.192688941955566, | |
| "learning_rate": 1.8765990726775042e-06, | |
| "loss": 1.287, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.06200814206009483, | |
| "grad_norm": 10.64345645904541, | |
| "learning_rate": 1.8760404446678955e-06, | |
| "loss": 1.2499, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.06228745801532048, | |
| "grad_norm": 11.966428756713867, | |
| "learning_rate": 1.875481816658287e-06, | |
| "loss": 1.2789, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.06256677397054614, | |
| "grad_norm": 11.889241218566895, | |
| "learning_rate": 1.8749231886486787e-06, | |
| "loss": 1.2621, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.06284608992577179, | |
| "grad_norm": 13.372054100036621, | |
| "learning_rate": 1.8743645606390702e-06, | |
| "loss": 1.3493, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.06312540588099744, | |
| "grad_norm": 10.879005432128906, | |
| "learning_rate": 1.873805932629462e-06, | |
| "loss": 1.3077, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.06340472183622309, | |
| "grad_norm": 11.956343650817871, | |
| "learning_rate": 1.8732473046198536e-06, | |
| "loss": 1.3108, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.06368403779144874, | |
| "grad_norm": 11.269684791564941, | |
| "learning_rate": 1.8726886766102452e-06, | |
| "loss": 1.2956, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.06396335374667439, | |
| "grad_norm": 13.093775749206543, | |
| "learning_rate": 1.8721300486006368e-06, | |
| "loss": 1.2553, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.06424266970190004, | |
| "grad_norm": 9.943842887878418, | |
| "learning_rate": 1.8715714205910286e-06, | |
| "loss": 1.2936, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.0645219856571257, | |
| "grad_norm": 10.660123825073242, | |
| "learning_rate": 1.87101279258142e-06, | |
| "loss": 1.3319, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.06480130161235136, | |
| "grad_norm": 11.023526191711426, | |
| "learning_rate": 1.8704541645718115e-06, | |
| "loss": 1.3441, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.065080617567577, | |
| "grad_norm": 11.04121208190918, | |
| "learning_rate": 1.869895536562203e-06, | |
| "loss": 1.3157, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.06535993352280266, | |
| "grad_norm": 10.915820121765137, | |
| "learning_rate": 1.8693369085525946e-06, | |
| "loss": 1.2891, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.06563924947802831, | |
| "grad_norm": 11.1669282913208, | |
| "learning_rate": 1.8687782805429864e-06, | |
| "loss": 1.3301, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.06591856543325396, | |
| "grad_norm": 13.473467826843262, | |
| "learning_rate": 1.868219652533378e-06, | |
| "loss": 1.3412, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.06619788138847961, | |
| "grad_norm": 9.66751480102539, | |
| "learning_rate": 1.8676610245237696e-06, | |
| "loss": 1.4053, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.06647719734370526, | |
| "grad_norm": 10.621736526489258, | |
| "learning_rate": 1.8671023965141612e-06, | |
| "loss": 1.301, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.06675651329893093, | |
| "grad_norm": 12.115357398986816, | |
| "learning_rate": 1.8665437685045527e-06, | |
| "loss": 1.3193, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.06703582925415658, | |
| "grad_norm": 10.837126731872559, | |
| "learning_rate": 1.8659851404949443e-06, | |
| "loss": 1.2737, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.06731514520938223, | |
| "grad_norm": 11.175081253051758, | |
| "learning_rate": 1.865426512485336e-06, | |
| "loss": 1.3254, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.06759446116460788, | |
| "grad_norm": 11.028107643127441, | |
| "learning_rate": 1.8648678844757275e-06, | |
| "loss": 1.3037, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.06787377711983353, | |
| "grad_norm": 11.444878578186035, | |
| "learning_rate": 1.864309256466119e-06, | |
| "loss": 1.3237, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.06815309307505918, | |
| "grad_norm": 10.279289245605469, | |
| "learning_rate": 1.8637506284565108e-06, | |
| "loss": 1.2836, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.06843240903028483, | |
| "grad_norm": 10.37401008605957, | |
| "learning_rate": 1.8631920004469024e-06, | |
| "loss": 1.3329, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.06871172498551048, | |
| "grad_norm": 9.833236694335938, | |
| "learning_rate": 1.862633372437294e-06, | |
| "loss": 1.3125, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.06899104094073613, | |
| "grad_norm": 11.059619903564453, | |
| "learning_rate": 1.8620747444276854e-06, | |
| "loss": 1.2746, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.0692703568959618, | |
| "grad_norm": 10.897518157958984, | |
| "learning_rate": 1.8615161164180771e-06, | |
| "loss": 1.2552, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.06954967285118745, | |
| "grad_norm": 12.665666580200195, | |
| "learning_rate": 1.8609574884084687e-06, | |
| "loss": 1.3093, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.0698289888064131, | |
| "grad_norm": 10.878984451293945, | |
| "learning_rate": 1.8603988603988603e-06, | |
| "loss": 1.291, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.06996864678402592, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.3446284532546997, | |
| "eval_runtime": 33.87, | |
| "eval_samples_per_second": 14.762, | |
| "eval_steps_per_second": 1.86, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.07010830476163875, | |
| "grad_norm": 11.848414421081543, | |
| "learning_rate": 1.8598402323892519e-06, | |
| "loss": 1.3266, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.0703876207168644, | |
| "grad_norm": 11.258633613586426, | |
| "learning_rate": 1.8592816043796435e-06, | |
| "loss": 1.2747, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.07066693667209005, | |
| "grad_norm": 12.249394416809082, | |
| "learning_rate": 1.8587229763700352e-06, | |
| "loss": 1.2717, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.0709462526273157, | |
| "grad_norm": 11.384076118469238, | |
| "learning_rate": 1.8581643483604268e-06, | |
| "loss": 1.3339, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.07122556858254135, | |
| "grad_norm": 11.27473258972168, | |
| "learning_rate": 1.8576057203508182e-06, | |
| "loss": 1.2737, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.071504884537767, | |
| "grad_norm": 11.083890914916992, | |
| "learning_rate": 1.8570470923412098e-06, | |
| "loss": 1.3492, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.07178420049299267, | |
| "grad_norm": 12.925027847290039, | |
| "learning_rate": 1.8564884643316015e-06, | |
| "loss": 1.3546, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.07206351644821832, | |
| "grad_norm": 11.500834465026855, | |
| "learning_rate": 1.8559298363219931e-06, | |
| "loss": 1.2662, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.07234283240344397, | |
| "grad_norm": 10.518533706665039, | |
| "learning_rate": 1.8553712083123847e-06, | |
| "loss": 1.2815, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.07262214835866962, | |
| "grad_norm": 12.124496459960938, | |
| "learning_rate": 1.8548125803027763e-06, | |
| "loss": 1.312, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.07290146431389527, | |
| "grad_norm": 10.693092346191406, | |
| "learning_rate": 1.8542539522931679e-06, | |
| "loss": 1.3071, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.07318078026912092, | |
| "grad_norm": 9.837552070617676, | |
| "learning_rate": 1.8536953242835596e-06, | |
| "loss": 1.2985, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.07346009622434657, | |
| "grad_norm": 11.058207511901855, | |
| "learning_rate": 1.8531366962739512e-06, | |
| "loss": 1.3406, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.07373941217957222, | |
| "grad_norm": 10.664831161499023, | |
| "learning_rate": 1.8525780682643426e-06, | |
| "loss": 1.3086, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.07401872813479787, | |
| "grad_norm": 11.020722389221191, | |
| "learning_rate": 1.8520194402547342e-06, | |
| "loss": 1.2951, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.07429804409002354, | |
| "grad_norm": 11.75809383392334, | |
| "learning_rate": 1.851460812245126e-06, | |
| "loss": 1.2933, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.07457736004524919, | |
| "grad_norm": 11.260404586791992, | |
| "learning_rate": 1.8509021842355175e-06, | |
| "loss": 1.3669, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.07485667600047484, | |
| "grad_norm": 11.38213062286377, | |
| "learning_rate": 1.8503435562259091e-06, | |
| "loss": 1.3048, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.07513599195570049, | |
| "grad_norm": 10.554960250854492, | |
| "learning_rate": 1.8497849282163007e-06, | |
| "loss": 1.3218, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.07541530791092614, | |
| "grad_norm": 13.747076034545898, | |
| "learning_rate": 1.8492263002066923e-06, | |
| "loss": 1.3201, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.07569462386615179, | |
| "grad_norm": 10.723194122314453, | |
| "learning_rate": 1.848667672197084e-06, | |
| "loss": 1.254, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.07597393982137744, | |
| "grad_norm": 11.047980308532715, | |
| "learning_rate": 1.8481090441874754e-06, | |
| "loss": 1.3657, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.0762532557766031, | |
| "grad_norm": 10.199549674987793, | |
| "learning_rate": 1.847550416177867e-06, | |
| "loss": 1.3775, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.07653257173182876, | |
| "grad_norm": 9.60568904876709, | |
| "learning_rate": 1.8469917881682586e-06, | |
| "loss": 1.3047, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.07681188768705441, | |
| "grad_norm": 10.989706993103027, | |
| "learning_rate": 1.8464331601586501e-06, | |
| "loss": 1.3228, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.07709120364228006, | |
| "grad_norm": 12.18575668334961, | |
| "learning_rate": 1.845874532149042e-06, | |
| "loss": 1.3358, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.07737051959750571, | |
| "grad_norm": 11.24397087097168, | |
| "learning_rate": 1.8453159041394335e-06, | |
| "loss": 1.3065, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.07764983555273136, | |
| "grad_norm": 10.88451862335205, | |
| "learning_rate": 1.844757276129825e-06, | |
| "loss": 1.3278, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.07792915150795701, | |
| "grad_norm": 11.730112075805664, | |
| "learning_rate": 1.8441986481202167e-06, | |
| "loss": 1.2865, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.07820846746318266, | |
| "grad_norm": 11.872193336486816, | |
| "learning_rate": 1.8436400201106082e-06, | |
| "loss": 1.2728, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.07848778341840831, | |
| "grad_norm": 13.440178871154785, | |
| "learning_rate": 1.8430813921009998e-06, | |
| "loss": 1.3169, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.07876709937363396, | |
| "grad_norm": 10.802016258239746, | |
| "learning_rate": 1.8425227640913914e-06, | |
| "loss": 1.2491, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.07904641532885963, | |
| "grad_norm": 11.56015396118164, | |
| "learning_rate": 1.841964136081783e-06, | |
| "loss": 1.313, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.07932573128408528, | |
| "grad_norm": 11.145283699035645, | |
| "learning_rate": 1.8414055080721745e-06, | |
| "loss": 1.293, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.07960504723931093, | |
| "grad_norm": 10.63716983795166, | |
| "learning_rate": 1.8408468800625663e-06, | |
| "loss": 1.3308, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.07988436319453658, | |
| "grad_norm": 11.486001968383789, | |
| "learning_rate": 1.840288252052958e-06, | |
| "loss": 1.3047, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.08016367914976223, | |
| "grad_norm": 10.340072631835938, | |
| "learning_rate": 1.8397296240433495e-06, | |
| "loss": 1.2763, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.08044299510498788, | |
| "grad_norm": 11.177892684936523, | |
| "learning_rate": 1.839170996033741e-06, | |
| "loss": 1.3218, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.08072231106021353, | |
| "grad_norm": 11.822985649108887, | |
| "learning_rate": 1.8386123680241326e-06, | |
| "loss": 1.3039, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.08100162701543918, | |
| "grad_norm": 13.245485305786133, | |
| "learning_rate": 1.8380537400145242e-06, | |
| "loss": 1.281, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.08128094297066484, | |
| "grad_norm": 11.78788948059082, | |
| "learning_rate": 1.8374951120049158e-06, | |
| "loss": 1.2176, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.0815602589258905, | |
| "grad_norm": 11.278291702270508, | |
| "learning_rate": 1.8369364839953074e-06, | |
| "loss": 1.2972, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.08183957488111615, | |
| "grad_norm": 11.119109153747559, | |
| "learning_rate": 1.836377855985699e-06, | |
| "loss": 1.2689, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.0821188908363418, | |
| "grad_norm": 11.489620208740234, | |
| "learning_rate": 1.8358192279760907e-06, | |
| "loss": 1.3288, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.08239820679156745, | |
| "grad_norm": 9.556941032409668, | |
| "learning_rate": 1.8352605999664823e-06, | |
| "loss": 1.3035, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.0826775227467931, | |
| "grad_norm": 11.121188163757324, | |
| "learning_rate": 1.8347019719568739e-06, | |
| "loss": 1.2944, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.08295683870201875, | |
| "grad_norm": 12.729305267333984, | |
| "learning_rate": 1.8341433439472653e-06, | |
| "loss": 1.3125, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.0832361546572444, | |
| "grad_norm": 11.878944396972656, | |
| "learning_rate": 1.833584715937657e-06, | |
| "loss": 1.2959, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.08351547061247006, | |
| "grad_norm": 11.5958833694458, | |
| "learning_rate": 1.8330260879280486e-06, | |
| "loss": 1.3453, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.08379478656769572, | |
| "grad_norm": 12.451947212219238, | |
| "learning_rate": 1.8324674599184402e-06, | |
| "loss": 1.2819, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.0839623761408311, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.3420253992080688, | |
| "eval_runtime": 33.8789, | |
| "eval_samples_per_second": 14.758, | |
| "eval_steps_per_second": 1.86, | |
| "step": 3006 | |
| }, | |
| { | |
| "epoch": 0.08407410252292137, | |
| "grad_norm": 12.807692527770996, | |
| "learning_rate": 1.8319088319088318e-06, | |
| "loss": 1.3238, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.08435341847814702, | |
| "grad_norm": 10.1639404296875, | |
| "learning_rate": 1.8313502038992234e-06, | |
| "loss": 1.2694, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.08463273443337267, | |
| "grad_norm": 11.123089790344238, | |
| "learning_rate": 1.8307915758896151e-06, | |
| "loss": 1.2404, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.08491205038859832, | |
| "grad_norm": 11.976441383361816, | |
| "learning_rate": 1.8302329478800067e-06, | |
| "loss": 1.3319, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.08519136634382397, | |
| "grad_norm": 11.400232315063477, | |
| "learning_rate": 1.829674319870398e-06, | |
| "loss": 1.2899, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.08547068229904962, | |
| "grad_norm": 9.668082237243652, | |
| "learning_rate": 1.8291156918607897e-06, | |
| "loss": 1.3343, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.08574999825427528, | |
| "grad_norm": 9.114018440246582, | |
| "learning_rate": 1.8285570638511814e-06, | |
| "loss": 1.2822, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.08602931420950093, | |
| "grad_norm": 11.763662338256836, | |
| "learning_rate": 1.827998435841573e-06, | |
| "loss": 1.303, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.08630863016472659, | |
| "grad_norm": 12.478301048278809, | |
| "learning_rate": 1.8274398078319646e-06, | |
| "loss": 1.3204, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.08658794611995224, | |
| "grad_norm": 13.733002662658691, | |
| "learning_rate": 1.8268811798223562e-06, | |
| "loss": 1.2763, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.08686726207517789, | |
| "grad_norm": 11.211143493652344, | |
| "learning_rate": 1.8263225518127478e-06, | |
| "loss": 1.3059, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.08714657803040354, | |
| "grad_norm": 10.02708911895752, | |
| "learning_rate": 1.8257639238031395e-06, | |
| "loss": 1.2576, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.0874258939856292, | |
| "grad_norm": 10.271854400634766, | |
| "learning_rate": 1.8252052957935311e-06, | |
| "loss": 1.3526, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.08770520994085484, | |
| "grad_norm": 10.915563583374023, | |
| "learning_rate": 1.8246466677839225e-06, | |
| "loss": 1.2951, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.0879845258960805, | |
| "grad_norm": 12.06615161895752, | |
| "learning_rate": 1.824088039774314e-06, | |
| "loss": 1.2678, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.08826384185130615, | |
| "grad_norm": 11.441333770751953, | |
| "learning_rate": 1.8235294117647058e-06, | |
| "loss": 1.3605, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.0885431578065318, | |
| "grad_norm": 11.135004997253418, | |
| "learning_rate": 1.8229707837550974e-06, | |
| "loss": 1.3638, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.08882247376175746, | |
| "grad_norm": 10.272753715515137, | |
| "learning_rate": 1.822412155745489e-06, | |
| "loss": 1.3748, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.08910178971698311, | |
| "grad_norm": 10.645270347595215, | |
| "learning_rate": 1.8218535277358806e-06, | |
| "loss": 1.3288, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.08938110567220876, | |
| "grad_norm": 11.341635704040527, | |
| "learning_rate": 1.8212948997262722e-06, | |
| "loss": 1.3295, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.08966042162743441, | |
| "grad_norm": 11.285005569458008, | |
| "learning_rate": 1.820736271716664e-06, | |
| "loss": 1.3094, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.08993973758266006, | |
| "grad_norm": 11.092018127441406, | |
| "learning_rate": 1.8201776437070553e-06, | |
| "loss": 1.2616, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.09021905353788572, | |
| "grad_norm": 11.833807945251465, | |
| "learning_rate": 1.8196190156974469e-06, | |
| "loss": 1.2915, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.09049836949311137, | |
| "grad_norm": 11.941621780395508, | |
| "learning_rate": 1.8190603876878385e-06, | |
| "loss": 1.2984, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.09077768544833702, | |
| "grad_norm": 11.135613441467285, | |
| "learning_rate": 1.8185017596782303e-06, | |
| "loss": 1.2638, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.09105700140356268, | |
| "grad_norm": 11.356342315673828, | |
| "learning_rate": 1.8179431316686218e-06, | |
| "loss": 1.3199, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.09133631735878833, | |
| "grad_norm": 11.519587516784668, | |
| "learning_rate": 1.8173845036590134e-06, | |
| "loss": 1.288, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.09161563331401398, | |
| "grad_norm": 11.335143089294434, | |
| "learning_rate": 1.816825875649405e-06, | |
| "loss": 1.2472, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.09189494926923963, | |
| "grad_norm": 12.195459365844727, | |
| "learning_rate": 1.8162672476397966e-06, | |
| "loss": 1.2992, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.09217426522446528, | |
| "grad_norm": 12.05800724029541, | |
| "learning_rate": 1.8157086196301881e-06, | |
| "loss": 1.3177, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.09245358117969094, | |
| "grad_norm": 10.606769561767578, | |
| "learning_rate": 1.8151499916205797e-06, | |
| "loss": 1.2693, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.09273289713491659, | |
| "grad_norm": 9.679693222045898, | |
| "learning_rate": 1.8145913636109713e-06, | |
| "loss": 1.2985, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.09301221309014224, | |
| "grad_norm": 10.03492546081543, | |
| "learning_rate": 1.8140327356013629e-06, | |
| "loss": 1.2724, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.09329152904536789, | |
| "grad_norm": 10.985275268554688, | |
| "learning_rate": 1.8134741075917547e-06, | |
| "loss": 1.2551, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.09357084500059355, | |
| "grad_norm": 11.815603256225586, | |
| "learning_rate": 1.8129154795821462e-06, | |
| "loss": 1.3062, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.0938501609558192, | |
| "grad_norm": 10.699769020080566, | |
| "learning_rate": 1.8123568515725378e-06, | |
| "loss": 1.2841, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.09412947691104485, | |
| "grad_norm": 12.014618873596191, | |
| "learning_rate": 1.8117982235629294e-06, | |
| "loss": 1.3132, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.0944087928662705, | |
| "grad_norm": 11.724242210388184, | |
| "learning_rate": 1.811239595553321e-06, | |
| "loss": 1.289, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.09468810882149616, | |
| "grad_norm": 12.180294036865234, | |
| "learning_rate": 1.8106809675437125e-06, | |
| "loss": 1.3496, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.0949674247767218, | |
| "grad_norm": 10.988664627075195, | |
| "learning_rate": 1.8101223395341041e-06, | |
| "loss": 1.3244, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.09524674073194746, | |
| "grad_norm": 12.344855308532715, | |
| "learning_rate": 1.8095637115244957e-06, | |
| "loss": 1.2996, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.09552605668717311, | |
| "grad_norm": 10.685724258422852, | |
| "learning_rate": 1.8090050835148873e-06, | |
| "loss": 1.261, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.09580537264239876, | |
| "grad_norm": 12.516709327697754, | |
| "learning_rate": 1.808446455505279e-06, | |
| "loss": 1.2756, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.09608468859762442, | |
| "grad_norm": 11.27023983001709, | |
| "learning_rate": 1.8078878274956706e-06, | |
| "loss": 1.2935, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.09636400455285007, | |
| "grad_norm": 12.012152671813965, | |
| "learning_rate": 1.8073291994860622e-06, | |
| "loss": 1.317, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.09664332050807573, | |
| "grad_norm": 11.254688262939453, | |
| "learning_rate": 1.8067705714764538e-06, | |
| "loss": 1.3272, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.09692263646330138, | |
| "grad_norm": 12.010251998901367, | |
| "learning_rate": 1.8062119434668452e-06, | |
| "loss": 1.3732, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.09720195241852703, | |
| "grad_norm": 12.29020881652832, | |
| "learning_rate": 1.805653315457237e-06, | |
| "loss": 1.2978, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.09748126837375268, | |
| "grad_norm": 12.708207130432129, | |
| "learning_rate": 1.8050946874476285e-06, | |
| "loss": 1.3173, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.09776058432897833, | |
| "grad_norm": 11.069357872009277, | |
| "learning_rate": 1.80453605943802e-06, | |
| "loss": 1.3188, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.0979561054976363, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.3392640352249146, | |
| "eval_runtime": 34.0525, | |
| "eval_samples_per_second": 14.683, | |
| "eval_steps_per_second": 1.85, | |
| "step": 3507 | |
| }, | |
| { | |
| "epoch": 0.09803990028420398, | |
| "grad_norm": 13.221611976623535, | |
| "learning_rate": 1.8039774314284117e-06, | |
| "loss": 1.2449, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.09831921623942964, | |
| "grad_norm": 11.22923755645752, | |
| "learning_rate": 1.8034188034188035e-06, | |
| "loss": 1.3242, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.0985985321946553, | |
| "grad_norm": 10.731654167175293, | |
| "learning_rate": 1.802860175409195e-06, | |
| "loss": 1.363, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.09887784814988095, | |
| "grad_norm": 11.269989967346191, | |
| "learning_rate": 1.8023015473995866e-06, | |
| "loss": 1.2708, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.0991571641051066, | |
| "grad_norm": 10.26361083984375, | |
| "learning_rate": 1.801742919389978e-06, | |
| "loss": 1.3219, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.09943648006033225, | |
| "grad_norm": 10.341995239257812, | |
| "learning_rate": 1.8011842913803696e-06, | |
| "loss": 1.2953, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.0997157960155579, | |
| "grad_norm": 10.96583080291748, | |
| "learning_rate": 1.8006256633707613e-06, | |
| "loss": 1.3132, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.09999511197078355, | |
| "grad_norm": 11.878289222717285, | |
| "learning_rate": 1.800067035361153e-06, | |
| "loss": 1.3109, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.1002744279260092, | |
| "grad_norm": 9.536112785339355, | |
| "learning_rate": 1.7995084073515445e-06, | |
| "loss": 1.3468, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.10055374388123485, | |
| "grad_norm": 10.972228050231934, | |
| "learning_rate": 1.798949779341936e-06, | |
| "loss": 1.2877, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.10083305983646051, | |
| "grad_norm": 13.208352088928223, | |
| "learning_rate": 1.7983911513323279e-06, | |
| "loss": 1.3701, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.10111237579168617, | |
| "grad_norm": 11.069518089294434, | |
| "learning_rate": 1.7978325233227194e-06, | |
| "loss": 1.2269, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.10139169174691182, | |
| "grad_norm": 11.275925636291504, | |
| "learning_rate": 1.797273895313111e-06, | |
| "loss": 1.3039, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.10167100770213747, | |
| "grad_norm": 9.614294052124023, | |
| "learning_rate": 1.7967152673035024e-06, | |
| "loss": 1.2987, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.10195032365736312, | |
| "grad_norm": 11.417302131652832, | |
| "learning_rate": 1.796156639293894e-06, | |
| "loss": 1.3161, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.10222963961258877, | |
| "grad_norm": 13.481733322143555, | |
| "learning_rate": 1.7955980112842857e-06, | |
| "loss": 1.277, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.10250895556781442, | |
| "grad_norm": 12.135738372802734, | |
| "learning_rate": 1.7950393832746773e-06, | |
| "loss": 1.3031, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.10278827152304007, | |
| "grad_norm": 11.81387710571289, | |
| "learning_rate": 1.794480755265069e-06, | |
| "loss": 1.3195, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.10306758747826572, | |
| "grad_norm": 12.341436386108398, | |
| "learning_rate": 1.7939221272554605e-06, | |
| "loss": 1.344, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.10334690343349139, | |
| "grad_norm": 11.813607215881348, | |
| "learning_rate": 1.7933634992458523e-06, | |
| "loss": 1.2456, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.10362621938871704, | |
| "grad_norm": 10.025679588317871, | |
| "learning_rate": 1.7928048712362438e-06, | |
| "loss": 1.3462, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.10390553534394269, | |
| "grad_norm": 11.027300834655762, | |
| "learning_rate": 1.7922462432266352e-06, | |
| "loss": 1.248, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.10418485129916834, | |
| "grad_norm": 10.462127685546875, | |
| "learning_rate": 1.7916876152170268e-06, | |
| "loss": 1.2827, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.10446416725439399, | |
| "grad_norm": 11.07565689086914, | |
| "learning_rate": 1.7911289872074184e-06, | |
| "loss": 1.3317, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.10474348320961964, | |
| "grad_norm": 10.2979097366333, | |
| "learning_rate": 1.7905703591978101e-06, | |
| "loss": 1.2484, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.10502279916484529, | |
| "grad_norm": 11.009065628051758, | |
| "learning_rate": 1.7900117311882017e-06, | |
| "loss": 1.2882, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.10530211512007094, | |
| "grad_norm": 11.308358192443848, | |
| "learning_rate": 1.7894531031785933e-06, | |
| "loss": 1.3095, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.10558143107529659, | |
| "grad_norm": 11.058066368103027, | |
| "learning_rate": 1.7888944751689849e-06, | |
| "loss": 1.3372, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.10586074703052226, | |
| "grad_norm": 13.103239059448242, | |
| "learning_rate": 1.7883358471593767e-06, | |
| "loss": 1.3124, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.10614006298574791, | |
| "grad_norm": 10.5227689743042, | |
| "learning_rate": 1.787777219149768e-06, | |
| "loss": 1.2608, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.10641937894097356, | |
| "grad_norm": 10.993918418884277, | |
| "learning_rate": 1.7872185911401596e-06, | |
| "loss": 1.259, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.10669869489619921, | |
| "grad_norm": 11.612725257873535, | |
| "learning_rate": 1.7866599631305512e-06, | |
| "loss": 1.3046, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.10697801085142486, | |
| "grad_norm": 11.200050354003906, | |
| "learning_rate": 1.7861013351209428e-06, | |
| "loss": 1.3439, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.10725732680665051, | |
| "grad_norm": 12.19509220123291, | |
| "learning_rate": 1.7855427071113346e-06, | |
| "loss": 1.3107, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.10753664276187616, | |
| "grad_norm": 11.498516082763672, | |
| "learning_rate": 1.7849840791017261e-06, | |
| "loss": 1.3341, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.10781595871710181, | |
| "grad_norm": 12.180155754089355, | |
| "learning_rate": 1.7844254510921177e-06, | |
| "loss": 1.2753, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.10809527467232748, | |
| "grad_norm": 10.637706756591797, | |
| "learning_rate": 1.7838668230825093e-06, | |
| "loss": 1.2221, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.10837459062755313, | |
| "grad_norm": 11.029936790466309, | |
| "learning_rate": 1.783308195072901e-06, | |
| "loss": 1.3397, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.10865390658277878, | |
| "grad_norm": 9.736263275146484, | |
| "learning_rate": 1.7827495670632924e-06, | |
| "loss": 1.339, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.10893322253800443, | |
| "grad_norm": 11.16982364654541, | |
| "learning_rate": 1.782190939053684e-06, | |
| "loss": 1.3309, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.10921253849323008, | |
| "grad_norm": 10.91207218170166, | |
| "learning_rate": 1.7816323110440756e-06, | |
| "loss": 1.2543, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.10949185444845573, | |
| "grad_norm": 14.678290367126465, | |
| "learning_rate": 1.7810736830344672e-06, | |
| "loss": 1.355, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.10977117040368138, | |
| "grad_norm": 11.110123634338379, | |
| "learning_rate": 1.780515055024859e-06, | |
| "loss": 1.251, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.11005048635890703, | |
| "grad_norm": 11.788151741027832, | |
| "learning_rate": 1.7799564270152505e-06, | |
| "loss": 1.2544, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.11032980231413268, | |
| "grad_norm": 10.897525787353516, | |
| "learning_rate": 1.7793977990056421e-06, | |
| "loss": 1.2932, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.11060911826935835, | |
| "grad_norm": 12.554097175598145, | |
| "learning_rate": 1.7788391709960337e-06, | |
| "loss": 1.3412, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.110888434224584, | |
| "grad_norm": 11.195846557617188, | |
| "learning_rate": 1.7782805429864253e-06, | |
| "loss": 1.311, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.11116775017980965, | |
| "grad_norm": 11.825657844543457, | |
| "learning_rate": 1.7777219149768168e-06, | |
| "loss": 1.2449, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.1114470661350353, | |
| "grad_norm": 11.154561996459961, | |
| "learning_rate": 1.7771632869672084e-06, | |
| "loss": 1.2969, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.11172638209026095, | |
| "grad_norm": 12.427309036254883, | |
| "learning_rate": 1.7766046589576e-06, | |
| "loss": 1.3205, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.11194983485444147, | |
| "eval_complexity_accuracy": 0.0, | |
| "eval_loss": 1.337980031967163, | |
| "eval_runtime": 33.7197, | |
| "eval_samples_per_second": 14.828, | |
| "eval_steps_per_second": 1.868, | |
| "step": 4008 | |
| }, | |
| { | |
| "epoch": 0.1120056980454866, | |
| "grad_norm": 11.303837776184082, | |
| "learning_rate": 1.7760460309479916e-06, | |
| "loss": 1.2941, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.11228501400071225, | |
| "grad_norm": 10.283913612365723, | |
| "learning_rate": 1.7754874029383834e-06, | |
| "loss": 1.2637, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.1125643299559379, | |
| "grad_norm": 9.881290435791016, | |
| "learning_rate": 1.774928774928775e-06, | |
| "loss": 1.2764, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.11284364591116355, | |
| "grad_norm": 10.254637718200684, | |
| "learning_rate": 1.7743701469191665e-06, | |
| "loss": 1.3215, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.11312296186638922, | |
| "grad_norm": 11.556249618530273, | |
| "learning_rate": 1.7738115189095579e-06, | |
| "loss": 1.3581, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.11340227782161487, | |
| "grad_norm": 11.59968376159668, | |
| "learning_rate": 1.7732528908999497e-06, | |
| "loss": 1.3089, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.11368159377684052, | |
| "grad_norm": 11.252206802368164, | |
| "learning_rate": 1.7726942628903412e-06, | |
| "loss": 1.242, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.11396090973206617, | |
| "grad_norm": 10.428114891052246, | |
| "learning_rate": 1.7721356348807328e-06, | |
| "loss": 1.3395, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.11424022568729182, | |
| "grad_norm": 12.992630958557129, | |
| "learning_rate": 1.7715770068711244e-06, | |
| "loss": 1.305, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.11451954164251747, | |
| "grad_norm": 10.460079193115234, | |
| "learning_rate": 1.771018378861516e-06, | |
| "loss": 1.2225, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.11479885759774312, | |
| "grad_norm": 10.601390838623047, | |
| "learning_rate": 1.7704597508519078e-06, | |
| "loss": 1.3129, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.11507817355296877, | |
| "grad_norm": 13.683563232421875, | |
| "learning_rate": 1.7699011228422993e-06, | |
| "loss": 1.3081, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.11535748950819444, | |
| "grad_norm": 12.05490493774414, | |
| "learning_rate": 1.769342494832691e-06, | |
| "loss": 1.2893, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.11563680546342009, | |
| "grad_norm": 10.546974182128906, | |
| "learning_rate": 1.7687838668230823e-06, | |
| "loss": 1.3494, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.11591612141864574, | |
| "grad_norm": 11.625492095947266, | |
| "learning_rate": 1.768225238813474e-06, | |
| "loss": 1.2876, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.11619543737387139, | |
| "grad_norm": 11.499431610107422, | |
| "learning_rate": 1.7676666108038656e-06, | |
| "loss": 1.2298, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.11647475332909704, | |
| "grad_norm": 10.968666076660156, | |
| "learning_rate": 1.7671079827942572e-06, | |
| "loss": 1.3229, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.11675406928432269, | |
| "grad_norm": 10.56057071685791, | |
| "learning_rate": 1.7665493547846488e-06, | |
| "loss": 1.2644, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.11703338523954834, | |
| "grad_norm": 10.645150184631348, | |
| "learning_rate": 1.7659907267750404e-06, | |
| "loss": 1.3216, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.117312701194774, | |
| "grad_norm": 10.945796966552734, | |
| "learning_rate": 1.7654320987654322e-06, | |
| "loss": 1.3395, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.11759201714999964, | |
| "grad_norm": 11.30075740814209, | |
| "learning_rate": 1.7648734707558237e-06, | |
| "loss": 1.3201, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.11787133310522531, | |
| "grad_norm": 11.912382125854492, | |
| "learning_rate": 1.764314842746215e-06, | |
| "loss": 1.3076, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.11815064906045096, | |
| "grad_norm": 11.546857833862305, | |
| "learning_rate": 1.7637562147366067e-06, | |
| "loss": 1.2776, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.11842996501567661, | |
| "grad_norm": 11.775701522827148, | |
| "learning_rate": 1.7631975867269985e-06, | |
| "loss": 1.3094, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.11870928097090226, | |
| "grad_norm": 11.965110778808594, | |
| "learning_rate": 1.76263895871739e-06, | |
| "loss": 1.2815, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.11898859692612791, | |
| "grad_norm": 9.932812690734863, | |
| "learning_rate": 1.7620803307077816e-06, | |
| "loss": 1.2965, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.11926791288135356, | |
| "grad_norm": 10.788895606994629, | |
| "learning_rate": 1.7615217026981732e-06, | |
| "loss": 1.3025, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.11954722883657921, | |
| "grad_norm": 12.008225440979004, | |
| "learning_rate": 1.7609630746885648e-06, | |
| "loss": 1.2758, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.11982654479180486, | |
| "grad_norm": 11.157905578613281, | |
| "learning_rate": 1.7604044466789566e-06, | |
| "loss": 1.3369, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.12010586074703052, | |
| "grad_norm": 12.967375755310059, | |
| "learning_rate": 1.759845818669348e-06, | |
| "loss": 1.3124, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.12038517670225618, | |
| "grad_norm": 13.764420509338379, | |
| "learning_rate": 1.7592871906597395e-06, | |
| "loss": 1.3243, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.12066449265748183, | |
| "grad_norm": 11.486067771911621, | |
| "learning_rate": 1.758728562650131e-06, | |
| "loss": 1.2865, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.12094380861270748, | |
| "grad_norm": 11.377238273620605, | |
| "learning_rate": 1.7581699346405229e-06, | |
| "loss": 1.2747, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.12122312456793313, | |
| "grad_norm": 11.644318580627441, | |
| "learning_rate": 1.7576113066309144e-06, | |
| "loss": 1.2855, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.12150244052315878, | |
| "grad_norm": 11.282743453979492, | |
| "learning_rate": 1.757052678621306e-06, | |
| "loss": 1.2109, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.12178175647838443, | |
| "grad_norm": 10.718985557556152, | |
| "learning_rate": 1.7564940506116976e-06, | |
| "loss": 1.3098, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.12206107243361008, | |
| "grad_norm": 10.54099178314209, | |
| "learning_rate": 1.7559354226020892e-06, | |
| "loss": 1.2253, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.12234038838883574, | |
| "grad_norm": 10.001184463500977, | |
| "learning_rate": 1.755376794592481e-06, | |
| "loss": 1.3096, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.1226197043440614, | |
| "grad_norm": 10.10665512084961, | |
| "learning_rate": 1.7548181665828723e-06, | |
| "loss": 1.3204, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.12289902029928705, | |
| "grad_norm": 13.317100524902344, | |
| "learning_rate": 1.754259538573264e-06, | |
| "loss": 1.2701, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.1231783362545127, | |
| "grad_norm": 10.948107719421387, | |
| "learning_rate": 1.7537009105636555e-06, | |
| "loss": 1.3417, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.12345765220973835, | |
| "grad_norm": 11.12563705444336, | |
| "learning_rate": 1.7531422825540473e-06, | |
| "loss": 1.2768, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.123736968164964, | |
| "grad_norm": 11.270187377929688, | |
| "learning_rate": 1.7525836545444389e-06, | |
| "loss": 1.2476, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.12401628412018965, | |
| "grad_norm": 11.370152473449707, | |
| "learning_rate": 1.7520250265348304e-06, | |
| "loss": 1.3711, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.1242956000754153, | |
| "grad_norm": 12.357138633728027, | |
| "learning_rate": 1.751466398525222e-06, | |
| "loss": 1.2697, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.12457491603064096, | |
| "grad_norm": 10.51325511932373, | |
| "learning_rate": 1.7509077705156136e-06, | |
| "loss": 1.3495, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.1248542319858666, | |
| "grad_norm": 14.585171699523926, | |
| "learning_rate": 1.7503491425060052e-06, | |
| "loss": 1.3023, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.12513354794109227, | |
| "grad_norm": 11.234824180603027, | |
| "learning_rate": 1.7497905144963967e-06, | |
| "loss": 1.2785, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.12541286389631792, | |
| "grad_norm": 10.963340759277344, | |
| "learning_rate": 1.7492318864867883e-06, | |
| "loss": 1.3196, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.12569217985154357, | |
| "grad_norm": 10.97410774230957, | |
| "learning_rate": 1.7486732584771799e-06, | |
| "loss": 1.3396, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.12594356421124667, | |
| "eval_complexity_accuracy": 0.912, | |
| "eval_loss": 1.3366564512252808, | |
| "eval_runtime": 33.6692, | |
| "eval_samples_per_second": 14.85, | |
| "eval_steps_per_second": 1.871, | |
| "step": 4509 | |
| }, | |
| { | |
| "epoch": 0.12597149580676922, | |
| "grad_norm": 10.35742473602295, | |
| "learning_rate": 1.7481146304675715e-06, | |
| "loss": 1.3028, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.12625081176199487, | |
| "grad_norm": 11.008344650268555, | |
| "learning_rate": 1.7475560024579633e-06, | |
| "loss": 1.3369, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.12653012771722053, | |
| "grad_norm": 13.630735397338867, | |
| "learning_rate": 1.7469973744483548e-06, | |
| "loss": 1.2786, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.12680944367244618, | |
| "grad_norm": 11.712303161621094, | |
| "learning_rate": 1.7464387464387464e-06, | |
| "loss": 1.2838, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.12708875962767183, | |
| "grad_norm": 11.680615425109863, | |
| "learning_rate": 1.7458801184291378e-06, | |
| "loss": 1.3096, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.12736807558289748, | |
| "grad_norm": 9.936148643493652, | |
| "learning_rate": 1.7453214904195296e-06, | |
| "loss": 1.3508, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.12764739153812313, | |
| "grad_norm": 10.1597261428833, | |
| "learning_rate": 1.7447628624099211e-06, | |
| "loss": 1.2996, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.12792670749334878, | |
| "grad_norm": 9.299288749694824, | |
| "learning_rate": 1.7442042344003127e-06, | |
| "loss": 1.3327, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.12820602344857443, | |
| "grad_norm": 11.090012550354004, | |
| "learning_rate": 1.7436456063907043e-06, | |
| "loss": 1.3217, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.12848533940380008, | |
| "grad_norm": 10.919537544250488, | |
| "learning_rate": 1.7430869783810959e-06, | |
| "loss": 1.2738, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.12876465535902576, | |
| "grad_norm": 10.606612205505371, | |
| "learning_rate": 1.7425283503714877e-06, | |
| "loss": 1.3391, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.1290439713142514, | |
| "grad_norm": 11.103971481323242, | |
| "learning_rate": 1.7419697223618792e-06, | |
| "loss": 1.2768, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.12932328726947706, | |
| "grad_norm": 10.45857048034668, | |
| "learning_rate": 1.7414110943522708e-06, | |
| "loss": 1.292, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.1296026032247027, | |
| "grad_norm": 12.78720760345459, | |
| "learning_rate": 1.7408524663426622e-06, | |
| "loss": 1.2728, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.12988191917992836, | |
| "grad_norm": 10.232451438903809, | |
| "learning_rate": 1.740293838333054e-06, | |
| "loss": 1.305, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.130161235135154, | |
| "grad_norm": 10.413008689880371, | |
| "learning_rate": 1.7397352103234455e-06, | |
| "loss": 1.3135, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.13044055109037966, | |
| "grad_norm": 11.938608169555664, | |
| "learning_rate": 1.7391765823138371e-06, | |
| "loss": 1.3101, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.13071986704560531, | |
| "grad_norm": 10.876611709594727, | |
| "learning_rate": 1.7386179543042287e-06, | |
| "loss": 1.2753, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.13099918300083097, | |
| "grad_norm": 10.039010047912598, | |
| "learning_rate": 1.7380593262946203e-06, | |
| "loss": 1.2957, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.13127849895605662, | |
| "grad_norm": 13.189595222473145, | |
| "learning_rate": 1.737500698285012e-06, | |
| "loss": 1.2917, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.13155781491128227, | |
| "grad_norm": 11.356356620788574, | |
| "learning_rate": 1.7369420702754036e-06, | |
| "loss": 1.3185, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.13183713086650792, | |
| "grad_norm": 11.284613609313965, | |
| "learning_rate": 1.736383442265795e-06, | |
| "loss": 1.2605, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.13211644682173357, | |
| "grad_norm": 9.668716430664062, | |
| "learning_rate": 1.7358248142561866e-06, | |
| "loss": 1.3078, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.13239576277695922, | |
| "grad_norm": 12.375937461853027, | |
| "learning_rate": 1.7352661862465784e-06, | |
| "loss": 1.3748, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.13267507873218487, | |
| "grad_norm": 11.52265453338623, | |
| "learning_rate": 1.73470755823697e-06, | |
| "loss": 1.3253, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.13295439468741052, | |
| "grad_norm": 10.54103946685791, | |
| "learning_rate": 1.7341489302273615e-06, | |
| "loss": 1.2973, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.13323371064263617, | |
| "grad_norm": 11.810563087463379, | |
| "learning_rate": 1.733590302217753e-06, | |
| "loss": 1.2983, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.13351302659786185, | |
| "grad_norm": 11.471932411193848, | |
| "learning_rate": 1.7330316742081447e-06, | |
| "loss": 1.3038, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.1337923425530875, | |
| "grad_norm": 11.196157455444336, | |
| "learning_rate": 1.7324730461985365e-06, | |
| "loss": 1.339, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.13407165850831315, | |
| "grad_norm": 10.879687309265137, | |
| "learning_rate": 1.7319144181889278e-06, | |
| "loss": 1.2856, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.1343509744635388, | |
| "grad_norm": 10.327743530273438, | |
| "learning_rate": 1.7313557901793194e-06, | |
| "loss": 1.3102, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.13463029041876445, | |
| "grad_norm": 12.245965003967285, | |
| "learning_rate": 1.730797162169711e-06, | |
| "loss": 1.3179, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.1349096063739901, | |
| "grad_norm": 10.2786226272583, | |
| "learning_rate": 1.7302385341601028e-06, | |
| "loss": 1.2797, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.13518892232921575, | |
| "grad_norm": 10.446268081665039, | |
| "learning_rate": 1.7296799061504943e-06, | |
| "loss": 1.2856, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.1354682382844414, | |
| "grad_norm": 11.422130584716797, | |
| "learning_rate": 1.729121278140886e-06, | |
| "loss": 1.323, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.13574755423966706, | |
| "grad_norm": 11.97488021850586, | |
| "learning_rate": 1.7285626501312775e-06, | |
| "loss": 1.3054, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.1360268701948927, | |
| "grad_norm": 11.220852851867676, | |
| "learning_rate": 1.728004022121669e-06, | |
| "loss": 1.3171, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.13630618615011836, | |
| "grad_norm": 9.52205753326416, | |
| "learning_rate": 1.7274453941120609e-06, | |
| "loss": 1.2387, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.136585502105344, | |
| "grad_norm": 10.432751655578613, | |
| "learning_rate": 1.7268867661024522e-06, | |
| "loss": 1.2646, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.13686481806056966, | |
| "grad_norm": 11.69746208190918, | |
| "learning_rate": 1.7263281380928438e-06, | |
| "loss": 1.2954, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.1371441340157953, | |
| "grad_norm": 10.778327941894531, | |
| "learning_rate": 1.7257695100832354e-06, | |
| "loss": 1.28, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.13742344997102096, | |
| "grad_norm": 11.078811645507812, | |
| "learning_rate": 1.7252108820736272e-06, | |
| "loss": 1.2915, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.1377027659262466, | |
| "grad_norm": 11.492058753967285, | |
| "learning_rate": 1.7246522540640187e-06, | |
| "loss": 1.2967, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.13798208188147226, | |
| "grad_norm": 10.493326187133789, | |
| "learning_rate": 1.7240936260544103e-06, | |
| "loss": 1.3236, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.1382613978366979, | |
| "grad_norm": 10.878108978271484, | |
| "learning_rate": 1.723534998044802e-06, | |
| "loss": 1.2702, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.1385407137919236, | |
| "grad_norm": 11.983351707458496, | |
| "learning_rate": 1.7229763700351935e-06, | |
| "loss": 1.314, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.13882002974714924, | |
| "grad_norm": 10.559981346130371, | |
| "learning_rate": 1.722417742025585e-06, | |
| "loss": 1.3231, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.1390993457023749, | |
| "grad_norm": 12.265423774719238, | |
| "learning_rate": 1.7218591140159766e-06, | |
| "loss": 1.3418, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.13937866165760054, | |
| "grad_norm": 9.850886344909668, | |
| "learning_rate": 1.7213004860063682e-06, | |
| "loss": 1.2421, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.1396579776128262, | |
| "grad_norm": 10.524002075195312, | |
| "learning_rate": 1.7207418579967598e-06, | |
| "loss": 1.242, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.13993729356805185, | |
| "grad_norm": 12.710641860961914, | |
| "learning_rate": 1.7201832299871516e-06, | |
| "loss": 1.2802, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.13993729356805185, | |
| "eval_complexity_accuracy": 0.916, | |
| "eval_loss": 1.3355051279067993, | |
| "eval_runtime": 33.5364, | |
| "eval_samples_per_second": 14.909, | |
| "eval_steps_per_second": 1.879, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.1402166095232775, | |
| "grad_norm": 10.802959442138672, | |
| "learning_rate": 1.7196246019775432e-06, | |
| "loss": 1.2864, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.14049592547850315, | |
| "grad_norm": 10.689055442810059, | |
| "learning_rate": 1.7190659739679347e-06, | |
| "loss": 1.2735, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.1407752414337288, | |
| "grad_norm": 11.609500885009766, | |
| "learning_rate": 1.7185073459583263e-06, | |
| "loss": 1.3131, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.14105455738895445, | |
| "grad_norm": 11.694178581237793, | |
| "learning_rate": 1.7179487179487177e-06, | |
| "loss": 1.2796, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.1413338733441801, | |
| "grad_norm": 10.71261215209961, | |
| "learning_rate": 1.7173900899391095e-06, | |
| "loss": 1.2928, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.14161318929940575, | |
| "grad_norm": 11.323657989501953, | |
| "learning_rate": 1.716831461929501e-06, | |
| "loss": 1.3168, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.1418925052546314, | |
| "grad_norm": 11.165552139282227, | |
| "learning_rate": 1.7162728339198926e-06, | |
| "loss": 1.3048, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.14217182120985705, | |
| "grad_norm": 10.069772720336914, | |
| "learning_rate": 1.7157142059102842e-06, | |
| "loss": 1.3143, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.1424511371650827, | |
| "grad_norm": 11.59792709350586, | |
| "learning_rate": 1.715155577900676e-06, | |
| "loss": 1.2753, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.14273045312030835, | |
| "grad_norm": 10.197514533996582, | |
| "learning_rate": 1.7145969498910676e-06, | |
| "loss": 1.3432, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.143009769075534, | |
| "grad_norm": 10.098687171936035, | |
| "learning_rate": 1.7140383218814591e-06, | |
| "loss": 1.2387, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.14328908503075968, | |
| "grad_norm": 13.285723686218262, | |
| "learning_rate": 1.7134796938718507e-06, | |
| "loss": 1.2843, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.14356840098598533, | |
| "grad_norm": 14.88563346862793, | |
| "learning_rate": 1.712921065862242e-06, | |
| "loss": 1.3464, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.14384771694121098, | |
| "grad_norm": 10.287967681884766, | |
| "learning_rate": 1.7123624378526339e-06, | |
| "loss": 1.2919, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.14412703289643664, | |
| "grad_norm": 13.416029930114746, | |
| "learning_rate": 1.7118038098430254e-06, | |
| "loss": 1.304, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.1444063488516623, | |
| "grad_norm": 10.358808517456055, | |
| "learning_rate": 1.711245181833417e-06, | |
| "loss": 1.2667, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.14468566480688794, | |
| "grad_norm": 9.454345703125, | |
| "learning_rate": 1.7106865538238086e-06, | |
| "loss": 1.2677, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.1449649807621136, | |
| "grad_norm": 10.137917518615723, | |
| "learning_rate": 1.7101279258142004e-06, | |
| "loss": 1.246, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.14524429671733924, | |
| "grad_norm": 10.27364730834961, | |
| "learning_rate": 1.709569297804592e-06, | |
| "loss": 1.2487, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.1455236126725649, | |
| "grad_norm": 11.590679168701172, | |
| "learning_rate": 1.7090106697949835e-06, | |
| "loss": 1.3315, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.14580292862779054, | |
| "grad_norm": 12.223170280456543, | |
| "learning_rate": 1.708452041785375e-06, | |
| "loss": 1.3591, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.1460822445830162, | |
| "grad_norm": 10.8696928024292, | |
| "learning_rate": 1.7078934137757665e-06, | |
| "loss": 1.2855, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.14636156053824184, | |
| "grad_norm": 10.847172737121582, | |
| "learning_rate": 1.7073347857661583e-06, | |
| "loss": 1.2744, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.1466408764934675, | |
| "grad_norm": 11.290687561035156, | |
| "learning_rate": 1.7067761577565498e-06, | |
| "loss": 1.2815, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.14692019244869314, | |
| "grad_norm": 10.246102333068848, | |
| "learning_rate": 1.7062175297469414e-06, | |
| "loss": 1.2697, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.1471995084039188, | |
| "grad_norm": 10.220574378967285, | |
| "learning_rate": 1.705658901737333e-06, | |
| "loss": 1.274, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.14747882435914444, | |
| "grad_norm": 11.137274742126465, | |
| "learning_rate": 1.7051002737277248e-06, | |
| "loss": 1.2915, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.1477581403143701, | |
| "grad_norm": 11.349177360534668, | |
| "learning_rate": 1.7045416457181164e-06, | |
| "loss": 1.3005, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.14803745626959575, | |
| "grad_norm": 11.108057975769043, | |
| "learning_rate": 1.7039830177085077e-06, | |
| "loss": 1.2922, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.14831677222482142, | |
| "grad_norm": 10.836882591247559, | |
| "learning_rate": 1.7034243896988993e-06, | |
| "loss": 1.29, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.14859608818004708, | |
| "grad_norm": 11.927931785583496, | |
| "learning_rate": 1.7028657616892909e-06, | |
| "loss": 1.3138, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.14887540413527273, | |
| "grad_norm": 10.31083869934082, | |
| "learning_rate": 1.7023071336796827e-06, | |
| "loss": 1.3356, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.14915472009049838, | |
| "grad_norm": 12.571051597595215, | |
| "learning_rate": 1.7017485056700742e-06, | |
| "loss": 1.3247, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.14943403604572403, | |
| "grad_norm": 11.460820198059082, | |
| "learning_rate": 1.7011898776604658e-06, | |
| "loss": 1.3115, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.14971335200094968, | |
| "grad_norm": 11.103178977966309, | |
| "learning_rate": 1.7006312496508574e-06, | |
| "loss": 1.288, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.14999266795617533, | |
| "grad_norm": 11.281828880310059, | |
| "learning_rate": 1.7000726216412492e-06, | |
| "loss": 1.2285, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.15027198391140098, | |
| "grad_norm": 12.560543060302734, | |
| "learning_rate": 1.6995139936316408e-06, | |
| "loss": 1.3078, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.15055129986662663, | |
| "grad_norm": 10.196359634399414, | |
| "learning_rate": 1.6989553656220321e-06, | |
| "loss": 1.3406, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.15083061582185228, | |
| "grad_norm": 10.276470184326172, | |
| "learning_rate": 1.6983967376124237e-06, | |
| "loss": 1.3514, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.15110993177707793, | |
| "grad_norm": 10.547111511230469, | |
| "learning_rate": 1.6978381096028153e-06, | |
| "loss": 1.2428, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.15138924773230358, | |
| "grad_norm": 14.352306365966797, | |
| "learning_rate": 1.697279481593207e-06, | |
| "loss": 1.3123, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.15166856368752923, | |
| "grad_norm": 11.18830394744873, | |
| "learning_rate": 1.6967208535835986e-06, | |
| "loss": 1.2438, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.15194787964275489, | |
| "grad_norm": 10.590067863464355, | |
| "learning_rate": 1.6961622255739902e-06, | |
| "loss": 1.3224, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.15222719559798054, | |
| "grad_norm": 10.839982032775879, | |
| "learning_rate": 1.6956035975643818e-06, | |
| "loss": 1.284, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.1525065115532062, | |
| "grad_norm": 10.421679496765137, | |
| "learning_rate": 1.6950449695547736e-06, | |
| "loss": 1.2974, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.15278582750843184, | |
| "grad_norm": 10.920546531677246, | |
| "learning_rate": 1.694486341545165e-06, | |
| "loss": 1.3018, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.15306514346365752, | |
| "grad_norm": 10.71149730682373, | |
| "learning_rate": 1.6939277135355565e-06, | |
| "loss": 1.3121, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.15334445941888317, | |
| "grad_norm": 10.763243675231934, | |
| "learning_rate": 1.6933690855259481e-06, | |
| "loss": 1.2922, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.15362377537410882, | |
| "grad_norm": 12.36917781829834, | |
| "learning_rate": 1.6928104575163397e-06, | |
| "loss": 1.2845, | |
| "step": 5500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 35802, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |