diff --git "a/cost_to_drop_frequency_3591/checkpoint-40000/trainer_state.json" "b/cost_to_drop_frequency_3591/checkpoint-40000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_drop_frequency_3591/checkpoint-40000/trainer_state.json" @@ -0,0 +1,6003 @@ +{ + "best_global_step": 40000, + "best_metric": 3.5466434955596924, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_drop_frequency_3591/checkpoint-40000", + "epoch": 11.644982819870712, + "eval_steps": 1000, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014559431599790344, + "grad_norm": 1.4049561023712158, + "learning_rate": 0.000294, + "loss": 8.4124, + "step": 50 + }, + { + "epoch": 0.029118863199580687, + "grad_norm": 0.8360756635665894, + "learning_rate": 0.0005939999999999999, + "loss": 6.7276, + "step": 100 + }, + { + "epoch": 0.043678294799371034, + "grad_norm": 0.42227354645729065, + "learning_rate": 0.0005995714285714286, + "loss": 6.3402, + "step": 150 + }, + { + "epoch": 0.058237726399161374, + "grad_norm": 0.9324970841407776, + "learning_rate": 0.0005991341107871719, + "loss": 6.1609, + "step": 200 + }, + { + "epoch": 0.07279715799895171, + "grad_norm": 0.4407173991203308, + "learning_rate": 0.0005986967930029154, + "loss": 6.0089, + "step": 250 + }, + { + "epoch": 0.08735658959874207, + "grad_norm": 0.4540535807609558, + "learning_rate": 0.0005982594752186589, + "loss": 5.8627, + "step": 300 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 0.4887666404247284, + "learning_rate": 0.0005978221574344022, + "loss": 5.74, + "step": 350 + }, + { + "epoch": 0.11647545279832275, + "grad_norm": 0.5809242129325867, + "learning_rate": 0.0005973848396501457, + "loss": 5.6281, + "step": 400 + }, + { + "epoch": 0.1310348843981131, + "grad_norm": 0.4683547914028168, + "learning_rate": 0.0005969475218658892, + "loss": 5.5265, + "step": 450 + }, + { + "epoch": 0.14559431599790343, + "grad_norm": 0.4089968502521515, + "learning_rate": 0.0005965102040816326, + "loss": 5.4214, + "step": 500 + }, + { + "epoch": 0.1601537475976938, + "grad_norm": 0.5466117858886719, + "learning_rate": 0.000596072886297376, + "loss": 5.3411, + "step": 550 + }, + { + "epoch": 0.17471317919748414, + "grad_norm": 0.3846788704395294, + "learning_rate": 0.0005956355685131195, + "loss": 5.2665, + "step": 600 + }, + { + "epoch": 0.18927261079727448, + "grad_norm": 0.4610619843006134, + "learning_rate": 0.0005951982507288629, + "loss": 5.2078, + "step": 650 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 0.41991209983825684, + "learning_rate": 0.0005947609329446064, + "loss": 5.1301, + "step": 700 + }, + { + "epoch": 0.21839147399685516, + "grad_norm": 0.4753279685974121, + "learning_rate": 0.0005943236151603498, + "loss": 5.0702, + "step": 750 + }, + { + "epoch": 0.2329509055966455, + "grad_norm": 0.4781185984611511, + "learning_rate": 0.0005938862973760932, + "loss": 5.0195, + "step": 800 + }, + { + "epoch": 0.24751033719643586, + "grad_norm": 0.41803014278411865, + "learning_rate": 0.0005934489795918367, + "loss": 4.971, + "step": 850 + }, + { + "epoch": 0.2620697687962262, + "grad_norm": 0.444289892911911, + "learning_rate": 0.0005930116618075802, + "loss": 4.9305, + "step": 900 + }, + { + "epoch": 0.2766292003960165, + "grad_norm": 0.4531804025173187, + "learning_rate": 0.0005925743440233235, + "loss": 4.8862, + "step": 950 + }, + { + "epoch": 0.29118863199580686, + "grad_norm": 0.4998404085636139, + "learning_rate": 0.000592137026239067, + "loss": 4.8266, + "step": 1000 + }, + { + "epoch": 0.29118863199580686, + "eval_accuracy": 0.25396983481710367, + "eval_loss": 4.760892868041992, + "eval_runtime": 179.1934, + "eval_samples_per_second": 92.877, + "eval_steps_per_second": 5.809, + "step": 1000 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 0.44976159930229187, + "learning_rate": 0.0005916997084548104, + "loss": 4.7891, + "step": 1050 + }, + { + "epoch": 0.3203074951953876, + "grad_norm": 0.38453996181488037, + "learning_rate": 0.0005912623906705539, + "loss": 4.7294, + "step": 1100 + }, + { + "epoch": 0.33486692679517793, + "grad_norm": 0.4332706928253174, + "learning_rate": 0.0005908250728862974, + "loss": 4.7002, + "step": 1150 + }, + { + "epoch": 0.3494263583949683, + "grad_norm": 0.42371395230293274, + "learning_rate": 0.0005903877551020407, + "loss": 4.6808, + "step": 1200 + }, + { + "epoch": 0.3639857899947586, + "grad_norm": 0.45705753564834595, + "learning_rate": 0.0005899504373177842, + "loss": 4.6327, + "step": 1250 + }, + { + "epoch": 0.37854522159454895, + "grad_norm": 0.42063650488853455, + "learning_rate": 0.0005895131195335277, + "loss": 4.6117, + "step": 1300 + }, + { + "epoch": 0.3931046531943393, + "grad_norm": 0.43308427929878235, + "learning_rate": 0.0005890758017492711, + "loss": 4.5751, + "step": 1350 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 0.43480074405670166, + "learning_rate": 0.0005886384839650145, + "loss": 4.5591, + "step": 1400 + }, + { + "epoch": 0.42222351639392, + "grad_norm": 0.45868223905563354, + "learning_rate": 0.000588201166180758, + "loss": 4.5263, + "step": 1450 + }, + { + "epoch": 0.4367829479937103, + "grad_norm": 0.41984814405441284, + "learning_rate": 0.0005877638483965014, + "loss": 4.5044, + "step": 1500 + }, + { + "epoch": 0.45134237959350065, + "grad_norm": 0.4139959216117859, + "learning_rate": 0.0005873265306122449, + "loss": 4.4968, + "step": 1550 + }, + { + "epoch": 0.465901811193291, + "grad_norm": 0.38750138878822327, + "learning_rate": 0.0005868892128279882, + "loss": 4.4646, + "step": 1600 + }, + { + "epoch": 0.48046124279308133, + "grad_norm": 0.41930243372917175, + "learning_rate": 0.0005864518950437317, + "loss": 4.4529, + "step": 1650 + }, + { + "epoch": 0.49502067439287173, + "grad_norm": 0.41106143593788147, + "learning_rate": 0.0005860145772594752, + "loss": 4.4362, + "step": 1700 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 0.39897602796554565, + "learning_rate": 0.0005855772594752186, + "loss": 4.4112, + "step": 1750 + }, + { + "epoch": 0.5241395375924524, + "grad_norm": 0.4214461147785187, + "learning_rate": 0.000585139941690962, + "loss": 4.404, + "step": 1800 + }, + { + "epoch": 0.5386989691922427, + "grad_norm": 0.3887820541858673, + "learning_rate": 0.0005847026239067055, + "loss": 4.3787, + "step": 1850 + }, + { + "epoch": 0.553258400792033, + "grad_norm": 0.3768806755542755, + "learning_rate": 0.0005842653061224489, + "loss": 4.3711, + "step": 1900 + }, + { + "epoch": 0.5678178323918234, + "grad_norm": 0.3779532313346863, + "learning_rate": 0.0005838279883381924, + "loss": 4.3456, + "step": 1950 + }, + { + "epoch": 0.5823772639916137, + "grad_norm": 0.3921726942062378, + "learning_rate": 0.0005833906705539359, + "loss": 4.3399, + "step": 2000 + }, + { + "epoch": 0.5823772639916137, + "eval_accuracy": 0.2996934707950652, + "eval_loss": 4.28386926651001, + "eval_runtime": 179.6428, + "eval_samples_per_second": 92.645, + "eval_steps_per_second": 5.795, + "step": 2000 + }, + { + "epoch": 0.5969366955914042, + "grad_norm": 0.38071900606155396, + "learning_rate": 0.0005829533527696792, + "loss": 4.3206, + "step": 2050 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 0.4333866536617279, + "learning_rate": 0.0005825160349854227, + "loss": 4.316, + "step": 2100 + }, + { + "epoch": 0.6260555587909848, + "grad_norm": 0.3910558223724365, + "learning_rate": 0.0005820787172011661, + "loss": 4.2961, + "step": 2150 + }, + { + "epoch": 0.6406149903907752, + "grad_norm": 0.3819257318973541, + "learning_rate": 0.0005816413994169096, + "loss": 4.2951, + "step": 2200 + }, + { + "epoch": 0.6551744219905655, + "grad_norm": 0.4080394506454468, + "learning_rate": 0.000581204081632653, + "loss": 4.2756, + "step": 2250 + }, + { + "epoch": 0.6697338535903559, + "grad_norm": 0.37072518467903137, + "learning_rate": 0.0005807667638483965, + "loss": 4.2638, + "step": 2300 + }, + { + "epoch": 0.6842932851901462, + "grad_norm": 0.3981825113296509, + "learning_rate": 0.0005803294460641399, + "loss": 4.2662, + "step": 2350 + }, + { + "epoch": 0.6988527167899365, + "grad_norm": 0.384818971157074, + "learning_rate": 0.0005798921282798834, + "loss": 4.2509, + "step": 2400 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 0.43530362844467163, + "learning_rate": 0.0005794548104956267, + "loss": 4.2352, + "step": 2450 + }, + { + "epoch": 0.7279715799895172, + "grad_norm": 0.3544856607913971, + "learning_rate": 0.0005790174927113702, + "loss": 4.2268, + "step": 2500 + }, + { + "epoch": 0.7425310115893076, + "grad_norm": 0.38703247904777527, + "learning_rate": 0.0005785801749271137, + "loss": 4.2107, + "step": 2550 + }, + { + "epoch": 0.7570904431890979, + "grad_norm": 0.37904635071754456, + "learning_rate": 0.000578142857142857, + "loss": 4.1982, + "step": 2600 + }, + { + "epoch": 0.7716498747888882, + "grad_norm": 0.41309526562690735, + "learning_rate": 0.0005777055393586005, + "loss": 4.1833, + "step": 2650 + }, + { + "epoch": 0.7862093063886786, + "grad_norm": 0.42821475863456726, + "learning_rate": 0.000577268221574344, + "loss": 4.1892, + "step": 2700 + }, + { + "epoch": 0.8007687379884689, + "grad_norm": 0.4209707975387573, + "learning_rate": 0.0005768309037900874, + "loss": 4.1834, + "step": 2750 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 0.3531130254268646, + "learning_rate": 0.0005763935860058308, + "loss": 4.1801, + "step": 2800 + }, + { + "epoch": 0.8298876011880496, + "grad_norm": 0.34633395075798035, + "learning_rate": 0.0005759562682215744, + "loss": 4.1681, + "step": 2850 + }, + { + "epoch": 0.84444703278784, + "grad_norm": 0.3938649892807007, + "learning_rate": 0.0005755189504373177, + "loss": 4.1636, + "step": 2900 + }, + { + "epoch": 0.8590064643876303, + "grad_norm": 0.3613823652267456, + "learning_rate": 0.0005750816326530612, + "loss": 4.1578, + "step": 2950 + }, + { + "epoch": 0.8735658959874206, + "grad_norm": 0.3491958677768707, + "learning_rate": 0.0005746443148688046, + "loss": 4.1452, + "step": 3000 + }, + { + "epoch": 0.8735658959874206, + "eval_accuracy": 0.31544864157201075, + "eval_loss": 4.095163822174072, + "eval_runtime": 179.6171, + "eval_samples_per_second": 92.658, + "eval_steps_per_second": 5.796, + "step": 3000 + }, + { + "epoch": 0.888125327587211, + "grad_norm": 0.3552567958831787, + "learning_rate": 0.000574206997084548, + "loss": 4.1285, + "step": 3050 + }, + { + "epoch": 0.9026847591870013, + "grad_norm": 0.35991519689559937, + "learning_rate": 0.0005737696793002915, + "loss": 4.132, + "step": 3100 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 0.3861224949359894, + "learning_rate": 0.000573332361516035, + "loss": 4.1214, + "step": 3150 + }, + { + "epoch": 0.931803622386582, + "grad_norm": 0.3921383023262024, + "learning_rate": 0.0005728950437317784, + "loss": 4.1157, + "step": 3200 + }, + { + "epoch": 0.9463630539863723, + "grad_norm": 0.3566656708717346, + "learning_rate": 0.0005724577259475218, + "loss": 4.1088, + "step": 3250 + }, + { + "epoch": 0.9609224855861627, + "grad_norm": 0.3769164979457855, + "learning_rate": 0.0005720204081632652, + "loss": 4.0963, + "step": 3300 + }, + { + "epoch": 0.975481917185953, + "grad_norm": 0.3577769100666046, + "learning_rate": 0.0005715830903790087, + "loss": 4.1067, + "step": 3350 + }, + { + "epoch": 0.9900413487857435, + "grad_norm": 0.35588538646698, + "learning_rate": 0.0005711457725947522, + "loss": 4.0912, + "step": 3400 + }, + { + "epoch": 1.0043678294799372, + "grad_norm": 0.3580274283885956, + "learning_rate": 0.0005707084548104955, + "loss": 4.0849, + "step": 3450 + }, + { + "epoch": 1.0189272610797275, + "grad_norm": 0.3520485758781433, + "learning_rate": 0.000570271137026239, + "loss": 4.0188, + "step": 3500 + }, + { + "epoch": 1.0334866926795179, + "grad_norm": 0.3421690762042999, + "learning_rate": 0.0005698338192419825, + "loss": 4.0128, + "step": 3550 + }, + { + "epoch": 1.0480461242793082, + "grad_norm": 0.3418625593185425, + "learning_rate": 0.0005693965014577259, + "loss": 4.0056, + "step": 3600 + }, + { + "epoch": 1.0626055558790986, + "grad_norm": 0.34526926279067993, + "learning_rate": 0.0005689591836734693, + "loss": 4.0084, + "step": 3650 + }, + { + "epoch": 1.077164987478889, + "grad_norm": 0.35390642285346985, + "learning_rate": 0.0005685218658892128, + "loss": 4.0061, + "step": 3700 + }, + { + "epoch": 1.0917244190786792, + "grad_norm": 0.3431430459022522, + "learning_rate": 0.0005680845481049562, + "loss": 3.9994, + "step": 3750 + }, + { + "epoch": 1.1062838506784696, + "grad_norm": 0.357334166765213, + "learning_rate": 0.0005676472303206997, + "loss": 4.0071, + "step": 3800 + }, + { + "epoch": 1.12084328227826, + "grad_norm": 0.3587090075016022, + "learning_rate": 0.000567209912536443, + "loss": 3.985, + "step": 3850 + }, + { + "epoch": 1.1354027138780503, + "grad_norm": 0.3586151599884033, + "learning_rate": 0.0005667725947521865, + "loss": 4.0047, + "step": 3900 + }, + { + "epoch": 1.1499621454778406, + "grad_norm": 0.37636685371398926, + "learning_rate": 0.00056633527696793, + "loss": 3.9987, + "step": 3950 + }, + { + "epoch": 1.164521577077631, + "grad_norm": 0.35518568754196167, + "learning_rate": 0.0005658979591836735, + "loss": 3.9904, + "step": 4000 + }, + { + "epoch": 1.164521577077631, + "eval_accuracy": 0.32538388464653073, + "eval_loss": 3.9894351959228516, + "eval_runtime": 179.567, + "eval_samples_per_second": 92.684, + "eval_steps_per_second": 5.797, + "step": 4000 + }, + { + "epoch": 1.1790810086774213, + "grad_norm": 0.3445068895816803, + "learning_rate": 0.0005654606413994169, + "loss": 3.9831, + "step": 4050 + }, + { + "epoch": 1.1936404402772116, + "grad_norm": 0.3411754369735718, + "learning_rate": 0.0005650233236151603, + "loss": 3.9741, + "step": 4100 + }, + { + "epoch": 1.208199871877002, + "grad_norm": 0.3622643053531647, + "learning_rate": 0.0005645860058309037, + "loss": 3.9812, + "step": 4150 + }, + { + "epoch": 1.2227593034767923, + "grad_norm": 0.35340210795402527, + "learning_rate": 0.0005641486880466472, + "loss": 3.9853, + "step": 4200 + }, + { + "epoch": 1.2373187350765826, + "grad_norm": 0.34644776582717896, + "learning_rate": 0.0005637113702623907, + "loss": 3.9733, + "step": 4250 + }, + { + "epoch": 1.251878166676373, + "grad_norm": 0.33221983909606934, + "learning_rate": 0.000563274052478134, + "loss": 3.9601, + "step": 4300 + }, + { + "epoch": 1.2664375982761633, + "grad_norm": 0.3372167646884918, + "learning_rate": 0.0005628367346938775, + "loss": 3.9708, + "step": 4350 + }, + { + "epoch": 1.2809970298759537, + "grad_norm": 0.3629266321659088, + "learning_rate": 0.0005623994169096209, + "loss": 3.9556, + "step": 4400 + }, + { + "epoch": 1.295556461475744, + "grad_norm": 0.31815558671951294, + "learning_rate": 0.0005619620991253644, + "loss": 3.9644, + "step": 4450 + }, + { + "epoch": 1.3101158930755343, + "grad_norm": 0.3518199622631073, + "learning_rate": 0.0005615247813411078, + "loss": 3.9551, + "step": 4500 + }, + { + "epoch": 1.3246753246753247, + "grad_norm": 0.3197888135910034, + "learning_rate": 0.0005610874635568513, + "loss": 3.9556, + "step": 4550 + }, + { + "epoch": 1.339234756275115, + "grad_norm": 0.35236433148384094, + "learning_rate": 0.0005606501457725947, + "loss": 3.9573, + "step": 4600 + }, + { + "epoch": 1.3537941878749054, + "grad_norm": 0.3366566002368927, + "learning_rate": 0.0005602128279883382, + "loss": 3.9619, + "step": 4650 + }, + { + "epoch": 1.3683536194746957, + "grad_norm": 0.3635067939758301, + "learning_rate": 0.0005597755102040816, + "loss": 3.9568, + "step": 4700 + }, + { + "epoch": 1.382913051074486, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.000559338192419825, + "loss": 3.935, + "step": 4750 + }, + { + "epoch": 1.3974724826742764, + "grad_norm": 0.34598347544670105, + "learning_rate": 0.0005589008746355685, + "loss": 3.9463, + "step": 4800 + }, + { + "epoch": 1.4120319142740667, + "grad_norm": 0.32707110047340393, + "learning_rate": 0.0005584635568513118, + "loss": 3.9388, + "step": 4850 + }, + { + "epoch": 1.426591345873857, + "grad_norm": 0.35207509994506836, + "learning_rate": 0.0005580262390670554, + "loss": 3.9363, + "step": 4900 + }, + { + "epoch": 1.4411507774736474, + "grad_norm": 0.33082953095436096, + "learning_rate": 0.0005575889212827988, + "loss": 3.9443, + "step": 4950 + }, + { + "epoch": 1.4557102090734377, + "grad_norm": 0.36195048689842224, + "learning_rate": 0.0005571516034985422, + "loss": 3.934, + "step": 5000 + }, + { + "epoch": 1.4557102090734377, + "eval_accuracy": 0.3320231362585752, + "eval_loss": 3.9117023944854736, + "eval_runtime": 179.5118, + "eval_samples_per_second": 92.713, + "eval_steps_per_second": 5.799, + "step": 5000 + }, + { + "epoch": 1.470269640673228, + "grad_norm": 0.3603370487689972, + "learning_rate": 0.0005567142857142856, + "loss": 3.9232, + "step": 5050 + }, + { + "epoch": 1.4848290722730184, + "grad_norm": 0.3303501307964325, + "learning_rate": 0.0005562769679300292, + "loss": 3.929, + "step": 5100 + }, + { + "epoch": 1.4993885038728088, + "grad_norm": 0.34812071919441223, + "learning_rate": 0.0005558396501457725, + "loss": 3.9186, + "step": 5150 + }, + { + "epoch": 1.5139479354725993, + "grad_norm": 0.3245297372341156, + "learning_rate": 0.000555402332361516, + "loss": 3.9281, + "step": 5200 + }, + { + "epoch": 1.5285073670723897, + "grad_norm": 0.32848072052001953, + "learning_rate": 0.0005549650145772595, + "loss": 3.9081, + "step": 5250 + }, + { + "epoch": 1.54306679867218, + "grad_norm": 0.3524268865585327, + "learning_rate": 0.0005545276967930028, + "loss": 3.9169, + "step": 5300 + }, + { + "epoch": 1.5576262302719703, + "grad_norm": 0.3273775279521942, + "learning_rate": 0.0005540903790087463, + "loss": 3.9057, + "step": 5350 + }, + { + "epoch": 1.5721856618717607, + "grad_norm": 0.33142444491386414, + "learning_rate": 0.0005536530612244898, + "loss": 3.9117, + "step": 5400 + }, + { + "epoch": 1.586745093471551, + "grad_norm": 0.35404613614082336, + "learning_rate": 0.0005532157434402332, + "loss": 3.9, + "step": 5450 + }, + { + "epoch": 1.6013045250713414, + "grad_norm": 0.3326050341129303, + "learning_rate": 0.0005527784256559766, + "loss": 3.9023, + "step": 5500 + }, + { + "epoch": 1.6158639566711317, + "grad_norm": 0.32253944873809814, + "learning_rate": 0.00055234110787172, + "loss": 3.9036, + "step": 5550 + }, + { + "epoch": 1.630423388270922, + "grad_norm": 0.40896502137184143, + "learning_rate": 0.0005519037900874635, + "loss": 3.892, + "step": 5600 + }, + { + "epoch": 1.6449828198707124, + "grad_norm": 0.33099985122680664, + "learning_rate": 0.000551466472303207, + "loss": 3.8921, + "step": 5650 + }, + { + "epoch": 1.6595422514705027, + "grad_norm": 0.3134934902191162, + "learning_rate": 0.0005510291545189503, + "loss": 3.8986, + "step": 5700 + }, + { + "epoch": 1.674101683070293, + "grad_norm": 0.32286426424980164, + "learning_rate": 0.0005505918367346938, + "loss": 3.8705, + "step": 5750 + }, + { + "epoch": 1.6886611146700834, + "grad_norm": 0.3152390122413635, + "learning_rate": 0.0005501545189504373, + "loss": 3.8843, + "step": 5800 + }, + { + "epoch": 1.7032205462698737, + "grad_norm": 0.3241208493709564, + "learning_rate": 0.0005497172011661807, + "loss": 3.8915, + "step": 5850 + }, + { + "epoch": 1.717779977869664, + "grad_norm": 0.3297117054462433, + "learning_rate": 0.0005492798833819241, + "loss": 3.8959, + "step": 5900 + }, + { + "epoch": 1.7323394094694544, + "grad_norm": 0.34585368633270264, + "learning_rate": 0.0005488425655976676, + "loss": 3.8631, + "step": 5950 + }, + { + "epoch": 1.7468988410692448, + "grad_norm": 0.32093173265457153, + "learning_rate": 0.000548405247813411, + "loss": 3.8774, + "step": 6000 + }, + { + "epoch": 1.7468988410692448, + "eval_accuracy": 0.3372265721042079, + "eval_loss": 3.8566107749938965, + "eval_runtime": 179.5862, + "eval_samples_per_second": 92.674, + "eval_steps_per_second": 5.797, + "step": 6000 + }, + { + "epoch": 1.761458272669035, + "grad_norm": 0.3342028260231018, + "learning_rate": 0.0005479679300291545, + "loss": 3.8767, + "step": 6050 + }, + { + "epoch": 1.7760177042688254, + "grad_norm": 0.331476628780365, + "learning_rate": 0.000547530612244898, + "loss": 3.8741, + "step": 6100 + }, + { + "epoch": 1.7905771358686158, + "grad_norm": 0.3178947865962982, + "learning_rate": 0.0005470932944606413, + "loss": 3.8753, + "step": 6150 + }, + { + "epoch": 1.8051365674684061, + "grad_norm": 0.33139607310295105, + "learning_rate": 0.0005466559766763848, + "loss": 3.8686, + "step": 6200 + }, + { + "epoch": 1.8196959990681965, + "grad_norm": 0.35270482301712036, + "learning_rate": 0.0005462186588921283, + "loss": 3.8577, + "step": 6250 + }, + { + "epoch": 1.8342554306679868, + "grad_norm": 0.3247964382171631, + "learning_rate": 0.0005457813411078717, + "loss": 3.8574, + "step": 6300 + }, + { + "epoch": 1.8488148622677771, + "grad_norm": 0.33985435962677, + "learning_rate": 0.0005453440233236151, + "loss": 3.8546, + "step": 6350 + }, + { + "epoch": 1.8633742938675675, + "grad_norm": 0.33400237560272217, + "learning_rate": 0.0005449067055393585, + "loss": 3.8636, + "step": 6400 + }, + { + "epoch": 1.8779337254673578, + "grad_norm": 0.3367692232131958, + "learning_rate": 0.0005444693877551019, + "loss": 3.8718, + "step": 6450 + }, + { + "epoch": 1.8924931570671482, + "grad_norm": 0.3267197608947754, + "learning_rate": 0.0005440320699708455, + "loss": 3.8507, + "step": 6500 + }, + { + "epoch": 1.9070525886669385, + "grad_norm": 0.3389538824558258, + "learning_rate": 0.0005435947521865888, + "loss": 3.8546, + "step": 6550 + }, + { + "epoch": 1.9216120202667288, + "grad_norm": 0.32694804668426514, + "learning_rate": 0.0005431574344023323, + "loss": 3.8391, + "step": 6600 + }, + { + "epoch": 1.9361714518665192, + "grad_norm": 0.3353123366832733, + "learning_rate": 0.0005427201166180758, + "loss": 3.8435, + "step": 6650 + }, + { + "epoch": 1.9507308834663095, + "grad_norm": 0.32406482100486755, + "learning_rate": 0.0005422827988338192, + "loss": 3.8409, + "step": 6700 + }, + { + "epoch": 1.9652903150660999, + "grad_norm": 0.3334747850894928, + "learning_rate": 0.0005418454810495626, + "loss": 3.8506, + "step": 6750 + }, + { + "epoch": 1.9798497466658902, + "grad_norm": 0.33217740058898926, + "learning_rate": 0.0005414081632653061, + "loss": 3.8396, + "step": 6800 + }, + { + "epoch": 1.9944091782656805, + "grad_norm": 0.33468008041381836, + "learning_rate": 0.0005409708454810495, + "loss": 3.8407, + "step": 6850 + }, + { + "epoch": 2.0087356589598744, + "grad_norm": 0.3196060359477997, + "learning_rate": 0.0005405335276967929, + "loss": 3.7913, + "step": 6900 + }, + { + "epoch": 2.0232950905596647, + "grad_norm": 0.3573300540447235, + "learning_rate": 0.0005400962099125365, + "loss": 3.7409, + "step": 6950 + }, + { + "epoch": 2.037854522159455, + "grad_norm": 0.3402981460094452, + "learning_rate": 0.0005396588921282798, + "loss": 3.7556, + "step": 7000 + }, + { + "epoch": 2.037854522159455, + "eval_accuracy": 0.34190017535271905, + "eval_loss": 3.809979200363159, + "eval_runtime": 179.6501, + "eval_samples_per_second": 92.641, + "eval_steps_per_second": 5.795, + "step": 7000 + }, + { + "epoch": 2.0524139537592454, + "grad_norm": 0.3510541319847107, + "learning_rate": 0.0005392215743440233, + "loss": 3.7422, + "step": 7050 + }, + { + "epoch": 2.0669733853590357, + "grad_norm": 0.31116750836372375, + "learning_rate": 0.0005387842565597666, + "loss": 3.7475, + "step": 7100 + }, + { + "epoch": 2.081532816958826, + "grad_norm": 0.3254874050617218, + "learning_rate": 0.0005383469387755102, + "loss": 3.7546, + "step": 7150 + }, + { + "epoch": 2.0960922485586164, + "grad_norm": 0.3147241771221161, + "learning_rate": 0.0005379096209912536, + "loss": 3.7518, + "step": 7200 + }, + { + "epoch": 2.1106516801584068, + "grad_norm": 0.3199782073497772, + "learning_rate": 0.000537472303206997, + "loss": 3.7659, + "step": 7250 + }, + { + "epoch": 2.125211111758197, + "grad_norm": 0.3094785809516907, + "learning_rate": 0.0005370349854227405, + "loss": 3.7481, + "step": 7300 + }, + { + "epoch": 2.1397705433579874, + "grad_norm": 0.3172190189361572, + "learning_rate": 0.0005365976676384839, + "loss": 3.7408, + "step": 7350 + }, + { + "epoch": 2.154329974957778, + "grad_norm": 0.3381129801273346, + "learning_rate": 0.0005361603498542273, + "loss": 3.7448, + "step": 7400 + }, + { + "epoch": 2.168889406557568, + "grad_norm": 0.3302014470100403, + "learning_rate": 0.0005357230320699708, + "loss": 3.7451, + "step": 7450 + }, + { + "epoch": 2.1834488381573585, + "grad_norm": 0.34532982110977173, + "learning_rate": 0.0005352857142857143, + "loss": 3.7459, + "step": 7500 + }, + { + "epoch": 2.198008269757149, + "grad_norm": 0.3262939751148224, + "learning_rate": 0.0005348483965014576, + "loss": 3.7466, + "step": 7550 + }, + { + "epoch": 2.212567701356939, + "grad_norm": 0.33892711997032166, + "learning_rate": 0.0005344110787172011, + "loss": 3.7505, + "step": 7600 + }, + { + "epoch": 2.2271271329567295, + "grad_norm": 0.3445602059364319, + "learning_rate": 0.0005339737609329446, + "loss": 3.7429, + "step": 7650 + }, + { + "epoch": 2.24168656455652, + "grad_norm": 0.3161507248878479, + "learning_rate": 0.000533536443148688, + "loss": 3.7541, + "step": 7700 + }, + { + "epoch": 2.25624599615631, + "grad_norm": 0.31178775429725647, + "learning_rate": 0.0005330991253644314, + "loss": 3.7447, + "step": 7750 + }, + { + "epoch": 2.2708054277561005, + "grad_norm": 0.3178870975971222, + "learning_rate": 0.0005326618075801749, + "loss": 3.7506, + "step": 7800 + }, + { + "epoch": 2.285364859355891, + "grad_norm": 0.3333457112312317, + "learning_rate": 0.0005322244897959183, + "loss": 3.7494, + "step": 7850 + }, + { + "epoch": 2.299924290955681, + "grad_norm": 0.3204410672187805, + "learning_rate": 0.0005317871720116618, + "loss": 3.7474, + "step": 7900 + }, + { + "epoch": 2.3144837225554715, + "grad_norm": 0.31767410039901733, + "learning_rate": 0.0005313498542274051, + "loss": 3.7368, + "step": 7950 + }, + { + "epoch": 2.329043154155262, + "grad_norm": 0.33374062180519104, + "learning_rate": 0.0005309125364431486, + "loss": 3.7524, + "step": 8000 + }, + { + "epoch": 2.329043154155262, + "eval_accuracy": 0.34463388108962084, + "eval_loss": 3.7798807621002197, + "eval_runtime": 179.8167, + "eval_samples_per_second": 92.555, + "eval_steps_per_second": 5.789, + "step": 8000 + }, + { + "epoch": 2.343602585755052, + "grad_norm": 0.32286617159843445, + "learning_rate": 0.0005304752186588921, + "loss": 3.7492, + "step": 8050 + }, + { + "epoch": 2.3581620173548425, + "grad_norm": 0.33228906989097595, + "learning_rate": 0.0005300379008746355, + "loss": 3.764, + "step": 8100 + }, + { + "epoch": 2.372721448954633, + "grad_norm": 0.33857783675193787, + "learning_rate": 0.000529600583090379, + "loss": 3.759, + "step": 8150 + }, + { + "epoch": 2.3872808805544232, + "grad_norm": 0.3177933394908905, + "learning_rate": 0.0005291632653061224, + "loss": 3.7536, + "step": 8200 + }, + { + "epoch": 2.4018403121542136, + "grad_norm": 0.3171054720878601, + "learning_rate": 0.0005287259475218658, + "loss": 3.7403, + "step": 8250 + }, + { + "epoch": 2.416399743754004, + "grad_norm": 0.32724741101264954, + "learning_rate": 0.0005282886297376093, + "loss": 3.7446, + "step": 8300 + }, + { + "epoch": 2.4309591753537942, + "grad_norm": 0.3406330347061157, + "learning_rate": 0.0005278513119533528, + "loss": 3.7441, + "step": 8350 + }, + { + "epoch": 2.4455186069535846, + "grad_norm": 0.3245644271373749, + "learning_rate": 0.0005274139941690961, + "loss": 3.7317, + "step": 8400 + }, + { + "epoch": 2.460078038553375, + "grad_norm": 0.3408276438713074, + "learning_rate": 0.0005269766763848396, + "loss": 3.7373, + "step": 8450 + }, + { + "epoch": 2.4746374701531653, + "grad_norm": 0.31394264101982117, + "learning_rate": 0.0005265393586005831, + "loss": 3.732, + "step": 8500 + }, + { + "epoch": 2.4891969017529556, + "grad_norm": 0.3347412645816803, + "learning_rate": 0.0005261020408163265, + "loss": 3.7266, + "step": 8550 + }, + { + "epoch": 2.503756333352746, + "grad_norm": 0.32223114371299744, + "learning_rate": 0.0005256647230320699, + "loss": 3.7293, + "step": 8600 + }, + { + "epoch": 2.5183157649525363, + "grad_norm": 0.3145173490047455, + "learning_rate": 0.0005252274052478134, + "loss": 3.7471, + "step": 8650 + }, + { + "epoch": 2.5328751965523266, + "grad_norm": 0.31143006682395935, + "learning_rate": 0.0005247900874635568, + "loss": 3.7394, + "step": 8700 + }, + { + "epoch": 2.547434628152117, + "grad_norm": 0.3238007724285126, + "learning_rate": 0.0005243527696793003, + "loss": 3.7283, + "step": 8750 + }, + { + "epoch": 2.5619940597519073, + "grad_norm": 0.3301667869091034, + "learning_rate": 0.0005239154518950436, + "loss": 3.7463, + "step": 8800 + }, + { + "epoch": 2.5765534913516976, + "grad_norm": 0.32847797870635986, + "learning_rate": 0.0005234781341107871, + "loss": 3.7397, + "step": 8850 + }, + { + "epoch": 2.591112922951488, + "grad_norm": 0.32561489939689636, + "learning_rate": 0.0005230408163265306, + "loss": 3.7437, + "step": 8900 + }, + { + "epoch": 2.6056723545512783, + "grad_norm": 0.30937111377716064, + "learning_rate": 0.000522603498542274, + "loss": 3.7399, + "step": 8950 + }, + { + "epoch": 2.6202317861510687, + "grad_norm": 0.32154905796051025, + "learning_rate": 0.0005221661807580175, + "loss": 3.7339, + "step": 9000 + }, + { + "epoch": 2.6202317861510687, + "eval_accuracy": 0.3472925683629266, + "eval_loss": 3.7506699562072754, + "eval_runtime": 179.8206, + "eval_samples_per_second": 92.553, + "eval_steps_per_second": 5.789, + "step": 9000 + }, + { + "epoch": 2.634791217750859, + "grad_norm": 0.317490816116333, + "learning_rate": 0.0005217288629737609, + "loss": 3.7263, + "step": 9050 + }, + { + "epoch": 2.6493506493506493, + "grad_norm": 0.32750970125198364, + "learning_rate": 0.0005212915451895043, + "loss": 3.7324, + "step": 9100 + }, + { + "epoch": 2.6639100809504397, + "grad_norm": 0.3290070593357086, + "learning_rate": 0.0005208542274052477, + "loss": 3.7314, + "step": 9150 + }, + { + "epoch": 2.67846951255023, + "grad_norm": 0.34482887387275696, + "learning_rate": 0.0005204169096209913, + "loss": 3.7192, + "step": 9200 + }, + { + "epoch": 2.6930289441500204, + "grad_norm": 0.31812381744384766, + "learning_rate": 0.0005199795918367346, + "loss": 3.7308, + "step": 9250 + }, + { + "epoch": 2.7075883757498107, + "grad_norm": 0.33570706844329834, + "learning_rate": 0.0005195422740524781, + "loss": 3.7338, + "step": 9300 + }, + { + "epoch": 2.722147807349601, + "grad_norm": 0.3004995584487915, + "learning_rate": 0.0005191049562682216, + "loss": 3.7224, + "step": 9350 + }, + { + "epoch": 2.7367072389493914, + "grad_norm": 0.3277261555194855, + "learning_rate": 0.000518667638483965, + "loss": 3.7313, + "step": 9400 + }, + { + "epoch": 2.7512666705491817, + "grad_norm": 0.3260866701602936, + "learning_rate": 0.0005182303206997084, + "loss": 3.7252, + "step": 9450 + }, + { + "epoch": 2.765826102148972, + "grad_norm": 0.30772513151168823, + "learning_rate": 0.0005177930029154519, + "loss": 3.7263, + "step": 9500 + }, + { + "epoch": 2.7803855337487624, + "grad_norm": 0.3158465027809143, + "learning_rate": 0.0005173556851311953, + "loss": 3.728, + "step": 9550 + }, + { + "epoch": 2.7949449653485527, + "grad_norm": 0.31197673082351685, + "learning_rate": 0.0005169183673469387, + "loss": 3.7135, + "step": 9600 + }, + { + "epoch": 2.809504396948343, + "grad_norm": 0.33720263838768005, + "learning_rate": 0.0005164810495626821, + "loss": 3.7205, + "step": 9650 + }, + { + "epoch": 2.8240638285481334, + "grad_norm": 0.3222922086715698, + "learning_rate": 0.0005160437317784256, + "loss": 3.7212, + "step": 9700 + }, + { + "epoch": 2.8386232601479238, + "grad_norm": 0.32163000106811523, + "learning_rate": 0.0005156064139941691, + "loss": 3.7303, + "step": 9750 + }, + { + "epoch": 2.853182691747714, + "grad_norm": 0.29815468192100525, + "learning_rate": 0.0005151690962099124, + "loss": 3.7143, + "step": 9800 + }, + { + "epoch": 2.8677421233475044, + "grad_norm": 0.3258896768093109, + "learning_rate": 0.000514731778425656, + "loss": 3.7076, + "step": 9850 + }, + { + "epoch": 2.882301554947295, + "grad_norm": 0.32969552278518677, + "learning_rate": 0.0005142944606413994, + "loss": 3.7269, + "step": 9900 + }, + { + "epoch": 2.896860986547085, + "grad_norm": 0.31835922598838806, + "learning_rate": 0.0005138571428571428, + "loss": 3.7207, + "step": 9950 + }, + { + "epoch": 2.9114204181468755, + "grad_norm": 0.3245142102241516, + "learning_rate": 0.0005134198250728862, + "loss": 3.7167, + "step": 10000 + }, + { + "epoch": 2.9114204181468755, + "eval_accuracy": 0.3495557037372717, + "eval_loss": 3.7238857746124268, + "eval_runtime": 179.8397, + "eval_samples_per_second": 92.544, + "eval_steps_per_second": 5.788, + "step": 10000 + }, + { + "epoch": 2.925979849746666, + "grad_norm": 0.32630476355552673, + "learning_rate": 0.0005129825072886297, + "loss": 3.7083, + "step": 10050 + }, + { + "epoch": 2.940539281346456, + "grad_norm": 0.3315964341163635, + "learning_rate": 0.0005125451895043731, + "loss": 3.7064, + "step": 10100 + }, + { + "epoch": 2.9550987129462465, + "grad_norm": 0.31410086154937744, + "learning_rate": 0.0005121078717201166, + "loss": 3.7236, + "step": 10150 + }, + { + "epoch": 2.969658144546037, + "grad_norm": 0.33839717507362366, + "learning_rate": 0.0005116705539358601, + "loss": 3.7078, + "step": 10200 + }, + { + "epoch": 2.984217576145827, + "grad_norm": 0.32319313287734985, + "learning_rate": 0.0005112332361516034, + "loss": 3.7141, + "step": 10250 + }, + { + "epoch": 2.9987770077456175, + "grad_norm": 0.3235074579715729, + "learning_rate": 0.0005107959183673469, + "loss": 3.7063, + "step": 10300 + }, + { + "epoch": 3.0131034884398114, + "grad_norm": 0.314828097820282, + "learning_rate": 0.0005103586005830903, + "loss": 3.6245, + "step": 10350 + }, + { + "epoch": 3.0276629200396017, + "grad_norm": 0.31607604026794434, + "learning_rate": 0.0005099212827988338, + "loss": 3.6112, + "step": 10400 + }, + { + "epoch": 3.042222351639392, + "grad_norm": 0.35359737277030945, + "learning_rate": 0.0005094839650145772, + "loss": 3.609, + "step": 10450 + }, + { + "epoch": 3.0567817832391824, + "grad_norm": 0.32654085755348206, + "learning_rate": 0.0005090466472303206, + "loss": 3.6166, + "step": 10500 + }, + { + "epoch": 3.0713412148389727, + "grad_norm": 0.3420456051826477, + "learning_rate": 0.0005086093294460641, + "loss": 3.6039, + "step": 10550 + }, + { + "epoch": 3.085900646438763, + "grad_norm": 0.32927215099334717, + "learning_rate": 0.0005081720116618076, + "loss": 3.6076, + "step": 10600 + }, + { + "epoch": 3.1004600780385534, + "grad_norm": 0.32174116373062134, + "learning_rate": 0.0005077346938775509, + "loss": 3.62, + "step": 10650 + }, + { + "epoch": 3.1150195096383437, + "grad_norm": 0.32081031799316406, + "learning_rate": 0.0005072973760932944, + "loss": 3.6198, + "step": 10700 + }, + { + "epoch": 3.129578941238134, + "grad_norm": 0.3233294188976288, + "learning_rate": 0.0005068600583090379, + "loss": 3.6221, + "step": 10750 + }, + { + "epoch": 3.1441383728379244, + "grad_norm": 0.3179484009742737, + "learning_rate": 0.0005064227405247813, + "loss": 3.6265, + "step": 10800 + }, + { + "epoch": 3.1586978044377148, + "grad_norm": 0.3125128746032715, + "learning_rate": 0.0005059854227405247, + "loss": 3.6316, + "step": 10850 + }, + { + "epoch": 3.173257236037505, + "grad_norm": 0.32463568449020386, + "learning_rate": 0.0005055481049562682, + "loss": 3.6245, + "step": 10900 + }, + { + "epoch": 3.1878166676372954, + "grad_norm": 0.31310543417930603, + "learning_rate": 0.0005051107871720116, + "loss": 3.6185, + "step": 10950 + }, + { + "epoch": 3.2023760992370858, + "grad_norm": 0.3464823067188263, + "learning_rate": 0.0005046734693877551, + "loss": 3.6204, + "step": 11000 + }, + { + "epoch": 3.2023760992370858, + "eval_accuracy": 0.3516197697403503, + "eval_loss": 3.7100203037261963, + "eval_runtime": 180.2504, + "eval_samples_per_second": 92.333, + "eval_steps_per_second": 5.775, + "step": 11000 + }, + { + "epoch": 3.216935530836876, + "grad_norm": 0.3277588486671448, + "learning_rate": 0.0005042361516034986, + "loss": 3.6268, + "step": 11050 + }, + { + "epoch": 3.2314949624366665, + "grad_norm": 0.32706061005592346, + "learning_rate": 0.0005037988338192419, + "loss": 3.6153, + "step": 11100 + }, + { + "epoch": 3.246054394036457, + "grad_norm": 0.31766435503959656, + "learning_rate": 0.0005033615160349854, + "loss": 3.6336, + "step": 11150 + }, + { + "epoch": 3.260613825636247, + "grad_norm": 0.3006264269351959, + "learning_rate": 0.0005029241982507288, + "loss": 3.6275, + "step": 11200 + }, + { + "epoch": 3.2751732572360375, + "grad_norm": 0.32919037342071533, + "learning_rate": 0.0005024868804664723, + "loss": 3.6301, + "step": 11250 + }, + { + "epoch": 3.289732688835828, + "grad_norm": 0.3155740797519684, + "learning_rate": 0.0005020495626822157, + "loss": 3.6203, + "step": 11300 + }, + { + "epoch": 3.304292120435618, + "grad_norm": 0.3527681529521942, + "learning_rate": 0.0005016122448979591, + "loss": 3.6288, + "step": 11350 + }, + { + "epoch": 3.3188515520354085, + "grad_norm": 0.3135804831981659, + "learning_rate": 0.0005011749271137026, + "loss": 3.6415, + "step": 11400 + }, + { + "epoch": 3.333410983635199, + "grad_norm": 0.3078667223453522, + "learning_rate": 0.0005007376093294461, + "loss": 3.6284, + "step": 11450 + }, + { + "epoch": 3.347970415234989, + "grad_norm": 0.319755494594574, + "learning_rate": 0.0005003002915451894, + "loss": 3.6314, + "step": 11500 + }, + { + "epoch": 3.3625298468347795, + "grad_norm": 0.32641854882240295, + "learning_rate": 0.0004998629737609329, + "loss": 3.629, + "step": 11550 + }, + { + "epoch": 3.37708927843457, + "grad_norm": 0.3268803060054779, + "learning_rate": 0.0004994256559766764, + "loss": 3.6372, + "step": 11600 + }, + { + "epoch": 3.39164871003436, + "grad_norm": 0.32382065057754517, + "learning_rate": 0.0004989883381924198, + "loss": 3.6286, + "step": 11650 + }, + { + "epoch": 3.4062081416341505, + "grad_norm": 0.3158361613750458, + "learning_rate": 0.0004985510204081632, + "loss": 3.6329, + "step": 11700 + }, + { + "epoch": 3.420767573233941, + "grad_norm": 0.31245240569114685, + "learning_rate": 0.0004981137026239067, + "loss": 3.6428, + "step": 11750 + }, + { + "epoch": 3.435327004833731, + "grad_norm": 0.3362303078174591, + "learning_rate": 0.0004976763848396501, + "loss": 3.6369, + "step": 11800 + }, + { + "epoch": 3.4498864364335216, + "grad_norm": 0.3208737373352051, + "learning_rate": 0.0004972390670553935, + "loss": 3.6428, + "step": 11850 + }, + { + "epoch": 3.464445868033312, + "grad_norm": 0.3163570761680603, + "learning_rate": 0.000496801749271137, + "loss": 3.6239, + "step": 11900 + }, + { + "epoch": 3.4790052996331022, + "grad_norm": 0.3181529641151428, + "learning_rate": 0.0004963644314868804, + "loss": 3.6303, + "step": 11950 + }, + { + "epoch": 3.4935647312328926, + "grad_norm": 0.33231833577156067, + "learning_rate": 0.0004959271137026239, + "loss": 3.6358, + "step": 12000 + }, + { + "epoch": 3.4935647312328926, + "eval_accuracy": 0.35356460577150667, + "eval_loss": 3.6901774406433105, + "eval_runtime": 180.1137, + "eval_samples_per_second": 92.403, + "eval_steps_per_second": 5.78, + "step": 12000 + }, + { + "epoch": 3.508124162832683, + "grad_norm": 0.3368173837661743, + "learning_rate": 0.0004954897959183672, + "loss": 3.6389, + "step": 12050 + }, + { + "epoch": 3.5226835944324733, + "grad_norm": 0.33402830362319946, + "learning_rate": 0.0004950524781341108, + "loss": 3.645, + "step": 12100 + }, + { + "epoch": 3.5372430260322636, + "grad_norm": 0.33064502477645874, + "learning_rate": 0.0004946151603498542, + "loss": 3.6336, + "step": 12150 + }, + { + "epoch": 3.551802457632054, + "grad_norm": 0.31694450974464417, + "learning_rate": 0.0004941778425655976, + "loss": 3.6325, + "step": 12200 + }, + { + "epoch": 3.5663618892318443, + "grad_norm": 0.3069068193435669, + "learning_rate": 0.0004937405247813411, + "loss": 3.6246, + "step": 12250 + }, + { + "epoch": 3.5809213208316346, + "grad_norm": 0.3142222464084625, + "learning_rate": 0.0004933032069970845, + "loss": 3.6453, + "step": 12300 + }, + { + "epoch": 3.595480752431425, + "grad_norm": 0.3237994909286499, + "learning_rate": 0.0004928658892128279, + "loss": 3.6295, + "step": 12350 + }, + { + "epoch": 3.6100401840312153, + "grad_norm": 0.30255311727523804, + "learning_rate": 0.0004924285714285714, + "loss": 3.6468, + "step": 12400 + }, + { + "epoch": 3.6245996156310056, + "grad_norm": 0.3128635883331299, + "learning_rate": 0.0004919912536443149, + "loss": 3.6346, + "step": 12450 + }, + { + "epoch": 3.639159047230796, + "grad_norm": 0.31057000160217285, + "learning_rate": 0.0004915539358600582, + "loss": 3.622, + "step": 12500 + }, + { + "epoch": 3.6537184788305863, + "grad_norm": 0.3453236520290375, + "learning_rate": 0.0004911166180758017, + "loss": 3.6354, + "step": 12550 + }, + { + "epoch": 3.6682779104303767, + "grad_norm": 0.3358878493309021, + "learning_rate": 0.0004906793002915452, + "loss": 3.6276, + "step": 12600 + }, + { + "epoch": 3.682837342030167, + "grad_norm": 0.3207370638847351, + "learning_rate": 0.0004902419825072886, + "loss": 3.6358, + "step": 12650 + }, + { + "epoch": 3.6973967736299573, + "grad_norm": 0.31057843565940857, + "learning_rate": 0.000489804664723032, + "loss": 3.6433, + "step": 12700 + }, + { + "epoch": 3.7119562052297477, + "grad_norm": 0.32829779386520386, + "learning_rate": 0.0004893673469387754, + "loss": 3.6282, + "step": 12750 + }, + { + "epoch": 3.726515636829538, + "grad_norm": 0.32469305396080017, + "learning_rate": 0.0004889300291545189, + "loss": 3.6353, + "step": 12800 + }, + { + "epoch": 3.7410750684293284, + "grad_norm": 0.32968953251838684, + "learning_rate": 0.0004884927113702624, + "loss": 3.6307, + "step": 12850 + }, + { + "epoch": 3.755634500029119, + "grad_norm": 0.3125181198120117, + "learning_rate": 0.0004880553935860058, + "loss": 3.6321, + "step": 12900 + }, + { + "epoch": 3.770193931628909, + "grad_norm": 0.31494152545928955, + "learning_rate": 0.0004876180758017492, + "loss": 3.6359, + "step": 12950 + }, + { + "epoch": 3.7847533632287, + "grad_norm": 0.32235443592071533, + "learning_rate": 0.0004871807580174927, + "loss": 3.6319, + "step": 13000 + }, + { + "epoch": 3.7847533632287, + "eval_accuracy": 0.35539115250113085, + "eval_loss": 3.6755480766296387, + "eval_runtime": 179.9398, + "eval_samples_per_second": 92.492, + "eval_steps_per_second": 5.785, + "step": 13000 + }, + { + "epoch": 3.7993127948284897, + "grad_norm": 0.3097991645336151, + "learning_rate": 0.00048674344023323613, + "loss": 3.6393, + "step": 13050 + }, + { + "epoch": 3.8138722264282805, + "grad_norm": 0.3186699450016022, + "learning_rate": 0.00048630612244897955, + "loss": 3.6318, + "step": 13100 + }, + { + "epoch": 3.8284316580280704, + "grad_norm": 0.3037383556365967, + "learning_rate": 0.00048586880466472296, + "loss": 3.6293, + "step": 13150 + }, + { + "epoch": 3.842991089627861, + "grad_norm": 0.32788893580436707, + "learning_rate": 0.0004854314868804664, + "loss": 3.6152, + "step": 13200 + }, + { + "epoch": 3.857550521227651, + "grad_norm": 0.3229829967021942, + "learning_rate": 0.0004849941690962099, + "loss": 3.6483, + "step": 13250 + }, + { + "epoch": 3.872109952827442, + "grad_norm": 0.3292683959007263, + "learning_rate": 0.0004845568513119533, + "loss": 3.6381, + "step": 13300 + }, + { + "epoch": 3.8866693844272318, + "grad_norm": 0.3210625648498535, + "learning_rate": 0.00048411953352769677, + "loss": 3.6269, + "step": 13350 + }, + { + "epoch": 3.9012288160270225, + "grad_norm": 0.31549862027168274, + "learning_rate": 0.0004836822157434402, + "loss": 3.6213, + "step": 13400 + }, + { + "epoch": 3.9157882476268124, + "grad_norm": 0.30793866515159607, + "learning_rate": 0.00048324489795918365, + "loss": 3.6309, + "step": 13450 + }, + { + "epoch": 3.930347679226603, + "grad_norm": 0.3356075882911682, + "learning_rate": 0.00048280758017492706, + "loss": 3.6262, + "step": 13500 + }, + { + "epoch": 3.944907110826393, + "grad_norm": 0.32913827896118164, + "learning_rate": 0.00048237026239067053, + "loss": 3.6213, + "step": 13550 + }, + { + "epoch": 3.959466542426184, + "grad_norm": 0.3327690362930298, + "learning_rate": 0.00048193294460641394, + "loss": 3.6438, + "step": 13600 + }, + { + "epoch": 3.974025974025974, + "grad_norm": 0.3101835250854492, + "learning_rate": 0.00048149562682215735, + "loss": 3.6296, + "step": 13650 + }, + { + "epoch": 3.9885854056257646, + "grad_norm": 0.327761709690094, + "learning_rate": 0.0004810583090379009, + "loss": 3.6235, + "step": 13700 + }, + { + "epoch": 4.002911886319958, + "grad_norm": 0.32982325553894043, + "learning_rate": 0.0004806209912536443, + "loss": 3.6143, + "step": 13750 + }, + { + "epoch": 4.017471317919749, + "grad_norm": 0.32604551315307617, + "learning_rate": 0.0004801836734693877, + "loss": 3.511, + "step": 13800 + }, + { + "epoch": 4.032030749519539, + "grad_norm": 0.3375633955001831, + "learning_rate": 0.0004797463556851311, + "loss": 3.5208, + "step": 13850 + }, + { + "epoch": 4.046590181119329, + "grad_norm": 0.3174295127391815, + "learning_rate": 0.00047930903790087463, + "loss": 3.5217, + "step": 13900 + }, + { + "epoch": 4.061149612719119, + "grad_norm": 0.3297431170940399, + "learning_rate": 0.00047887172011661805, + "loss": 3.5218, + "step": 13950 + }, + { + "epoch": 4.07570904431891, + "grad_norm": 0.33495378494262695, + "learning_rate": 0.00047843440233236146, + "loss": 3.5213, + "step": 14000 + }, + { + "epoch": 4.07570904431891, + "eval_accuracy": 0.35668316328168387, + "eval_loss": 3.6667861938476562, + "eval_runtime": 179.8727, + "eval_samples_per_second": 92.527, + "eval_steps_per_second": 5.787, + "step": 14000 + }, + { + "epoch": 4.0902684759187, + "grad_norm": 0.3054860234260559, + "learning_rate": 0.0004779970845481049, + "loss": 3.5277, + "step": 14050 + }, + { + "epoch": 4.104827907518491, + "grad_norm": 0.3263727128505707, + "learning_rate": 0.00047755976676384834, + "loss": 3.5327, + "step": 14100 + }, + { + "epoch": 4.119387339118281, + "grad_norm": 0.3170093894004822, + "learning_rate": 0.0004771224489795918, + "loss": 3.5418, + "step": 14150 + }, + { + "epoch": 4.1339467707180715, + "grad_norm": 0.33194735646247864, + "learning_rate": 0.00047668513119533527, + "loss": 3.5364, + "step": 14200 + }, + { + "epoch": 4.148506202317861, + "grad_norm": 0.32043886184692383, + "learning_rate": 0.0004762478134110787, + "loss": 3.544, + "step": 14250 + }, + { + "epoch": 4.163065633917652, + "grad_norm": 0.32483235001564026, + "learning_rate": 0.0004758104956268221, + "loss": 3.5385, + "step": 14300 + }, + { + "epoch": 4.177625065517442, + "grad_norm": 0.3203752934932709, + "learning_rate": 0.0004753731778425656, + "loss": 3.5636, + "step": 14350 + }, + { + "epoch": 4.192184497117233, + "grad_norm": 0.3080170452594757, + "learning_rate": 0.00047493586005830903, + "loss": 3.5453, + "step": 14400 + }, + { + "epoch": 4.206743928717023, + "grad_norm": 0.3298153877258301, + "learning_rate": 0.00047449854227405244, + "loss": 3.5491, + "step": 14450 + }, + { + "epoch": 4.2213033603168135, + "grad_norm": 0.3529611825942993, + "learning_rate": 0.00047406122448979585, + "loss": 3.544, + "step": 14500 + }, + { + "epoch": 4.235862791916603, + "grad_norm": 0.3273563086986542, + "learning_rate": 0.00047362390670553926, + "loss": 3.5498, + "step": 14550 + }, + { + "epoch": 4.250422223516394, + "grad_norm": 0.313999205827713, + "learning_rate": 0.0004731865889212828, + "loss": 3.5526, + "step": 14600 + }, + { + "epoch": 4.264981655116184, + "grad_norm": 0.30790430307388306, + "learning_rate": 0.0004727492711370262, + "loss": 3.5544, + "step": 14650 + }, + { + "epoch": 4.279541086715975, + "grad_norm": 0.33186236023902893, + "learning_rate": 0.0004723119533527696, + "loss": 3.5506, + "step": 14700 + }, + { + "epoch": 4.294100518315765, + "grad_norm": 0.32786890864372253, + "learning_rate": 0.0004718746355685131, + "loss": 3.5475, + "step": 14750 + }, + { + "epoch": 4.308659949915556, + "grad_norm": 0.3234544098377228, + "learning_rate": 0.0004714373177842565, + "loss": 3.5549, + "step": 14800 + }, + { + "epoch": 4.3232193815153455, + "grad_norm": 0.31056949496269226, + "learning_rate": 0.00047099999999999996, + "loss": 3.5447, + "step": 14850 + }, + { + "epoch": 4.337778813115136, + "grad_norm": 0.3284071087837219, + "learning_rate": 0.0004705626822157434, + "loss": 3.5585, + "step": 14900 + }, + { + "epoch": 4.352338244714926, + "grad_norm": 0.32166486978530884, + "learning_rate": 0.00047012536443148683, + "loss": 3.5546, + "step": 14950 + }, + { + "epoch": 4.366897676314717, + "grad_norm": 0.3296414613723755, + "learning_rate": 0.00046968804664723025, + "loss": 3.5562, + "step": 15000 + }, + { + "epoch": 4.366897676314717, + "eval_accuracy": 0.3575879706129867, + "eval_loss": 3.6574151515960693, + "eval_runtime": 179.9318, + "eval_samples_per_second": 92.496, + "eval_steps_per_second": 5.786, + "step": 15000 + }, + { + "epoch": 4.381457107914507, + "grad_norm": 0.31862205266952515, + "learning_rate": 0.00046925072886297377, + "loss": 3.5609, + "step": 15050 + }, + { + "epoch": 4.396016539514298, + "grad_norm": 0.321135938167572, + "learning_rate": 0.0004688134110787172, + "loss": 3.5592, + "step": 15100 + }, + { + "epoch": 4.4105759711140875, + "grad_norm": 0.34049704670906067, + "learning_rate": 0.0004683760932944606, + "loss": 3.5666, + "step": 15150 + }, + { + "epoch": 4.425135402713878, + "grad_norm": 0.32759514451026917, + "learning_rate": 0.000467938775510204, + "loss": 3.5645, + "step": 15200 + }, + { + "epoch": 4.439694834313668, + "grad_norm": 0.31559038162231445, + "learning_rate": 0.00046750145772594747, + "loss": 3.5424, + "step": 15250 + }, + { + "epoch": 4.454254265913459, + "grad_norm": 0.31429657340049744, + "learning_rate": 0.00046706413994169094, + "loss": 3.5577, + "step": 15300 + }, + { + "epoch": 4.468813697513249, + "grad_norm": 0.32119688391685486, + "learning_rate": 0.00046662682215743435, + "loss": 3.5645, + "step": 15350 + }, + { + "epoch": 4.48337312911304, + "grad_norm": 0.32725510001182556, + "learning_rate": 0.0004661895043731778, + "loss": 3.558, + "step": 15400 + }, + { + "epoch": 4.4979325607128295, + "grad_norm": 0.3302425742149353, + "learning_rate": 0.00046575218658892123, + "loss": 3.5645, + "step": 15450 + }, + { + "epoch": 4.51249199231262, + "grad_norm": 0.33752188086509705, + "learning_rate": 0.0004653148688046647, + "loss": 3.5654, + "step": 15500 + }, + { + "epoch": 4.52705142391241, + "grad_norm": 0.3348866105079651, + "learning_rate": 0.0004648775510204081, + "loss": 3.5587, + "step": 15550 + }, + { + "epoch": 4.541610855512201, + "grad_norm": 0.33069008588790894, + "learning_rate": 0.0004644402332361516, + "loss": 3.5564, + "step": 15600 + }, + { + "epoch": 4.556170287111991, + "grad_norm": 0.36258620023727417, + "learning_rate": 0.000464002915451895, + "loss": 3.5586, + "step": 15650 + }, + { + "epoch": 4.570729718711782, + "grad_norm": 0.3146510422229767, + "learning_rate": 0.0004635655976676384, + "loss": 3.5612, + "step": 15700 + }, + { + "epoch": 4.585289150311572, + "grad_norm": 0.3268812298774719, + "learning_rate": 0.0004631282798833819, + "loss": 3.5536, + "step": 15750 + }, + { + "epoch": 4.599848581911362, + "grad_norm": 0.31493905186653137, + "learning_rate": 0.00046269096209912533, + "loss": 3.5717, + "step": 15800 + }, + { + "epoch": 4.614408013511152, + "grad_norm": 0.3173486590385437, + "learning_rate": 0.00046225364431486875, + "loss": 3.5678, + "step": 15850 + }, + { + "epoch": 4.628967445110943, + "grad_norm": 0.32398083806037903, + "learning_rate": 0.00046181632653061216, + "loss": 3.557, + "step": 15900 + }, + { + "epoch": 4.643526876710733, + "grad_norm": 0.31683549284935, + "learning_rate": 0.0004613790087463557, + "loss": 3.5652, + "step": 15950 + }, + { + "epoch": 4.658086308310524, + "grad_norm": 0.3226284682750702, + "learning_rate": 0.0004609416909620991, + "loss": 3.5583, + "step": 16000 + }, + { + "epoch": 4.658086308310524, + "eval_accuracy": 0.35886363724551484, + "eval_loss": 3.641108989715576, + "eval_runtime": 179.9045, + "eval_samples_per_second": 92.51, + "eval_steps_per_second": 5.786, + "step": 16000 + }, + { + "epoch": 4.672645739910314, + "grad_norm": 0.3244362771511078, + "learning_rate": 0.0004605043731778425, + "loss": 3.5653, + "step": 16050 + }, + { + "epoch": 4.687205171510104, + "grad_norm": 0.3218280076980591, + "learning_rate": 0.00046006705539358597, + "loss": 3.5573, + "step": 16100 + }, + { + "epoch": 4.701764603109894, + "grad_norm": 0.31557270884513855, + "learning_rate": 0.0004596297376093294, + "loss": 3.5697, + "step": 16150 + }, + { + "epoch": 4.716324034709685, + "grad_norm": 0.32409724593162537, + "learning_rate": 0.00045919241982507285, + "loss": 3.5727, + "step": 16200 + }, + { + "epoch": 4.730883466309475, + "grad_norm": 0.32196715474128723, + "learning_rate": 0.0004587551020408163, + "loss": 3.5696, + "step": 16250 + }, + { + "epoch": 4.745442897909266, + "grad_norm": 0.3190127909183502, + "learning_rate": 0.00045831778425655973, + "loss": 3.5589, + "step": 16300 + }, + { + "epoch": 4.760002329509056, + "grad_norm": 0.3492906391620636, + "learning_rate": 0.00045788046647230314, + "loss": 3.576, + "step": 16350 + }, + { + "epoch": 4.7745617611088464, + "grad_norm": 0.3227944076061249, + "learning_rate": 0.00045744314868804666, + "loss": 3.5511, + "step": 16400 + }, + { + "epoch": 4.789121192708636, + "grad_norm": 0.3169122040271759, + "learning_rate": 0.0004570058309037901, + "loss": 3.5684, + "step": 16450 + }, + { + "epoch": 4.803680624308427, + "grad_norm": 0.31213343143463135, + "learning_rate": 0.0004565685131195335, + "loss": 3.5667, + "step": 16500 + }, + { + "epoch": 4.818240055908217, + "grad_norm": 0.32593971490859985, + "learning_rate": 0.0004561311953352769, + "loss": 3.5661, + "step": 16550 + }, + { + "epoch": 4.832799487508008, + "grad_norm": 0.33425310254096985, + "learning_rate": 0.0004556938775510203, + "loss": 3.5688, + "step": 16600 + }, + { + "epoch": 4.847358919107798, + "grad_norm": 0.32003140449523926, + "learning_rate": 0.00045525655976676383, + "loss": 3.5601, + "step": 16650 + }, + { + "epoch": 4.8619183507075885, + "grad_norm": 0.3596481382846832, + "learning_rate": 0.00045481924198250724, + "loss": 3.5688, + "step": 16700 + }, + { + "epoch": 4.876477782307378, + "grad_norm": 0.3375333547592163, + "learning_rate": 0.00045438192419825066, + "loss": 3.5685, + "step": 16750 + }, + { + "epoch": 4.891037213907169, + "grad_norm": 0.31676721572875977, + "learning_rate": 0.0004539446064139941, + "loss": 3.5581, + "step": 16800 + }, + { + "epoch": 4.905596645506959, + "grad_norm": 0.3257509469985962, + "learning_rate": 0.0004535072886297376, + "loss": 3.5537, + "step": 16850 + }, + { + "epoch": 4.92015607710675, + "grad_norm": 0.3176610767841339, + "learning_rate": 0.000453069970845481, + "loss": 3.5678, + "step": 16900 + }, + { + "epoch": 4.93471550870654, + "grad_norm": 0.3168198764324188, + "learning_rate": 0.00045263265306122447, + "loss": 3.5499, + "step": 16950 + }, + { + "epoch": 4.9492749403063305, + "grad_norm": 0.31883013248443604, + "learning_rate": 0.0004521953352769679, + "loss": 3.5668, + "step": 17000 + }, + { + "epoch": 4.9492749403063305, + "eval_accuracy": 0.360385171601208, + "eval_loss": 3.626793146133423, + "eval_runtime": 180.0874, + "eval_samples_per_second": 92.416, + "eval_steps_per_second": 5.781, + "step": 17000 + }, + { + "epoch": 4.96383437190612, + "grad_norm": 0.3429825007915497, + "learning_rate": 0.0004517580174927113, + "loss": 3.5693, + "step": 17050 + }, + { + "epoch": 4.978393803505911, + "grad_norm": 0.31468144059181213, + "learning_rate": 0.0004513206997084548, + "loss": 3.5648, + "step": 17100 + }, + { + "epoch": 4.992953235105701, + "grad_norm": 0.3186092972755432, + "learning_rate": 0.0004508833819241982, + "loss": 3.5611, + "step": 17150 + }, + { + "epoch": 5.007279715799895, + "grad_norm": 0.32911449670791626, + "learning_rate": 0.00045044606413994164, + "loss": 3.5003, + "step": 17200 + }, + { + "epoch": 5.021839147399685, + "grad_norm": 0.32932335138320923, + "learning_rate": 0.00045000874635568505, + "loss": 3.4462, + "step": 17250 + }, + { + "epoch": 5.036398578999476, + "grad_norm": 0.3199908435344696, + "learning_rate": 0.00044957142857142857, + "loss": 3.4569, + "step": 17300 + }, + { + "epoch": 5.050958010599266, + "grad_norm": 0.33716824650764465, + "learning_rate": 0.000449134110787172, + "loss": 3.4669, + "step": 17350 + }, + { + "epoch": 5.065517442199057, + "grad_norm": 0.32985949516296387, + "learning_rate": 0.0004486967930029154, + "loss": 3.4787, + "step": 17400 + }, + { + "epoch": 5.080076873798847, + "grad_norm": 0.3227981925010681, + "learning_rate": 0.00044825947521865886, + "loss": 3.46, + "step": 17450 + }, + { + "epoch": 5.094636305398637, + "grad_norm": 0.32830196619033813, + "learning_rate": 0.0004478221574344023, + "loss": 3.4714, + "step": 17500 + }, + { + "epoch": 5.109195736998427, + "grad_norm": 0.33184128999710083, + "learning_rate": 0.00044738483965014574, + "loss": 3.4636, + "step": 17550 + }, + { + "epoch": 5.123755168598218, + "grad_norm": 0.3285403251647949, + "learning_rate": 0.00044694752186588915, + "loss": 3.4711, + "step": 17600 + }, + { + "epoch": 5.138314600198008, + "grad_norm": 0.3541177809238434, + "learning_rate": 0.0004465102040816326, + "loss": 3.4806, + "step": 17650 + }, + { + "epoch": 5.152874031797799, + "grad_norm": 0.3223034143447876, + "learning_rate": 0.00044607288629737603, + "loss": 3.4813, + "step": 17700 + }, + { + "epoch": 5.167433463397589, + "grad_norm": 0.3218257427215576, + "learning_rate": 0.0004456355685131195, + "loss": 3.4826, + "step": 17750 + }, + { + "epoch": 5.1819928949973795, + "grad_norm": 0.3309643566608429, + "learning_rate": 0.00044519825072886297, + "loss": 3.4888, + "step": 17800 + }, + { + "epoch": 5.196552326597169, + "grad_norm": 0.3264036774635315, + "learning_rate": 0.0004447609329446064, + "loss": 3.483, + "step": 17850 + }, + { + "epoch": 5.21111175819696, + "grad_norm": 0.324790358543396, + "learning_rate": 0.0004443236151603498, + "loss": 3.4896, + "step": 17900 + }, + { + "epoch": 5.22567118979675, + "grad_norm": 0.3223564922809601, + "learning_rate": 0.0004438862973760932, + "loss": 3.4711, + "step": 17950 + }, + { + "epoch": 5.240230621396541, + "grad_norm": 0.33800962567329407, + "learning_rate": 0.0004434489795918367, + "loss": 3.4813, + "step": 18000 + }, + { + "epoch": 5.240230621396541, + "eval_accuracy": 0.36069406424049744, + "eval_loss": 3.6294045448303223, + "eval_runtime": 179.9519, + "eval_samples_per_second": 92.486, + "eval_steps_per_second": 5.785, + "step": 18000 + }, + { + "epoch": 5.254790052996331, + "grad_norm": 0.3145381510257721, + "learning_rate": 0.00044301166180758014, + "loss": 3.4916, + "step": 18050 + }, + { + "epoch": 5.2693494845961215, + "grad_norm": 0.33484500646591187, + "learning_rate": 0.00044257434402332355, + "loss": 3.5041, + "step": 18100 + }, + { + "epoch": 5.283908916195911, + "grad_norm": 0.3385532796382904, + "learning_rate": 0.000442137026239067, + "loss": 3.4968, + "step": 18150 + }, + { + "epoch": 5.298468347795702, + "grad_norm": 0.32390275597572327, + "learning_rate": 0.0004416997084548105, + "loss": 3.4945, + "step": 18200 + }, + { + "epoch": 5.313027779395492, + "grad_norm": 0.3384283483028412, + "learning_rate": 0.0004412623906705539, + "loss": 3.4902, + "step": 18250 + }, + { + "epoch": 5.327587210995283, + "grad_norm": 0.3334660530090332, + "learning_rate": 0.00044082507288629736, + "loss": 3.4961, + "step": 18300 + }, + { + "epoch": 5.342146642595073, + "grad_norm": 0.32754096388816833, + "learning_rate": 0.0004403877551020408, + "loss": 3.5051, + "step": 18350 + }, + { + "epoch": 5.3567060741948636, + "grad_norm": 0.3279802203178406, + "learning_rate": 0.0004399504373177842, + "loss": 3.5005, + "step": 18400 + }, + { + "epoch": 5.3712655057946534, + "grad_norm": 0.3342030346393585, + "learning_rate": 0.0004395131195335277, + "loss": 3.4931, + "step": 18450 + }, + { + "epoch": 5.385824937394444, + "grad_norm": 0.3162689805030823, + "learning_rate": 0.0004390758017492711, + "loss": 3.4952, + "step": 18500 + }, + { + "epoch": 5.400384368994234, + "grad_norm": 0.32114139199256897, + "learning_rate": 0.00043863848396501453, + "loss": 3.4999, + "step": 18550 + }, + { + "epoch": 5.414943800594025, + "grad_norm": 0.32540494203567505, + "learning_rate": 0.00043820116618075794, + "loss": 3.5038, + "step": 18600 + }, + { + "epoch": 5.429503232193815, + "grad_norm": 0.33477944135665894, + "learning_rate": 0.00043776384839650147, + "loss": 3.4941, + "step": 18650 + }, + { + "epoch": 5.444062663793606, + "grad_norm": 0.3286316990852356, + "learning_rate": 0.0004373265306122449, + "loss": 3.502, + "step": 18700 + }, + { + "epoch": 5.4586220953933955, + "grad_norm": 0.3312956690788269, + "learning_rate": 0.0004368892128279883, + "loss": 3.5054, + "step": 18750 + }, + { + "epoch": 5.473181526993186, + "grad_norm": 0.33420413732528687, + "learning_rate": 0.0004364518950437317, + "loss": 3.5176, + "step": 18800 + }, + { + "epoch": 5.487740958592976, + "grad_norm": 0.312959223985672, + "learning_rate": 0.00043601457725947517, + "loss": 3.5112, + "step": 18850 + }, + { + "epoch": 5.502300390192767, + "grad_norm": 0.31764182448387146, + "learning_rate": 0.00043557725947521864, + "loss": 3.4987, + "step": 18900 + }, + { + "epoch": 5.516859821792557, + "grad_norm": 0.3139015734195709, + "learning_rate": 0.00043513994169096205, + "loss": 3.4989, + "step": 18950 + }, + { + "epoch": 5.531419253392348, + "grad_norm": 0.32444003224372864, + "learning_rate": 0.0004347026239067055, + "loss": 3.4996, + "step": 19000 + }, + { + "epoch": 5.531419253392348, + "eval_accuracy": 0.3617237455660619, + "eval_loss": 3.6197915077209473, + "eval_runtime": 180.0468, + "eval_samples_per_second": 92.437, + "eval_steps_per_second": 5.782, + "step": 19000 + }, + { + "epoch": 5.5459786849921375, + "grad_norm": 0.32666343450546265, + "learning_rate": 0.0004342653061224489, + "loss": 3.5084, + "step": 19050 + }, + { + "epoch": 5.560538116591928, + "grad_norm": 0.32281461358070374, + "learning_rate": 0.0004338279883381924, + "loss": 3.5143, + "step": 19100 + }, + { + "epoch": 5.575097548191718, + "grad_norm": 0.3272330164909363, + "learning_rate": 0.00043339067055393586, + "loss": 3.5108, + "step": 19150 + }, + { + "epoch": 5.589656979791509, + "grad_norm": 0.31538012623786926, + "learning_rate": 0.00043295335276967927, + "loss": 3.504, + "step": 19200 + }, + { + "epoch": 5.604216411391299, + "grad_norm": 0.34619444608688354, + "learning_rate": 0.0004325160349854227, + "loss": 3.4962, + "step": 19250 + }, + { + "epoch": 5.61877584299109, + "grad_norm": 0.33601802587509155, + "learning_rate": 0.0004320787172011661, + "loss": 3.5107, + "step": 19300 + }, + { + "epoch": 5.6333352745908805, + "grad_norm": 0.32709893584251404, + "learning_rate": 0.0004316413994169096, + "loss": 3.5085, + "step": 19350 + }, + { + "epoch": 5.64789470619067, + "grad_norm": 0.332736611366272, + "learning_rate": 0.00043120408163265303, + "loss": 3.5027, + "step": 19400 + }, + { + "epoch": 5.66245413779046, + "grad_norm": 0.32013025879859924, + "learning_rate": 0.00043076676384839644, + "loss": 3.507, + "step": 19450 + }, + { + "epoch": 5.677013569390251, + "grad_norm": 0.34380871057510376, + "learning_rate": 0.0004303294460641399, + "loss": 3.5091, + "step": 19500 + }, + { + "epoch": 5.691573000990042, + "grad_norm": 0.3146701455116272, + "learning_rate": 0.0004298921282798834, + "loss": 3.5085, + "step": 19550 + }, + { + "epoch": 5.706132432589832, + "grad_norm": 0.3258221447467804, + "learning_rate": 0.0004294548104956268, + "loss": 3.513, + "step": 19600 + }, + { + "epoch": 5.720691864189622, + "grad_norm": 0.3335384726524353, + "learning_rate": 0.0004290174927113702, + "loss": 3.5065, + "step": 19650 + }, + { + "epoch": 5.735251295789412, + "grad_norm": 0.333322674036026, + "learning_rate": 0.00042858017492711367, + "loss": 3.5061, + "step": 19700 + }, + { + "epoch": 5.749810727389203, + "grad_norm": 0.3227587342262268, + "learning_rate": 0.0004281428571428571, + "loss": 3.5212, + "step": 19750 + }, + { + "epoch": 5.764370158988993, + "grad_norm": 0.3334672152996063, + "learning_rate": 0.00042770553935860055, + "loss": 3.4976, + "step": 19800 + }, + { + "epoch": 5.778929590588783, + "grad_norm": 0.3109551966190338, + "learning_rate": 0.000427268221574344, + "loss": 3.5159, + "step": 19850 + }, + { + "epoch": 5.793489022188574, + "grad_norm": 0.3229271471500397, + "learning_rate": 0.0004268309037900874, + "loss": 3.511, + "step": 19900 + }, + { + "epoch": 5.8080484537883645, + "grad_norm": 0.31595003604888916, + "learning_rate": 0.00042639358600583084, + "loss": 3.5083, + "step": 19950 + }, + { + "epoch": 5.822607885388154, + "grad_norm": 0.3313562273979187, + "learning_rate": 0.00042595626822157436, + "loss": 3.5032, + "step": 20000 + }, + { + "epoch": 5.822607885388154, + "eval_accuracy": 0.3630212827851557, + "eval_loss": 3.6066231727600098, + "eval_runtime": 180.0689, + "eval_samples_per_second": 92.426, + "eval_steps_per_second": 5.781, + "step": 20000 + }, + { + "epoch": 5.837167316987944, + "grad_norm": 0.32517609000205994, + "learning_rate": 0.00042551895043731777, + "loss": 3.5075, + "step": 20050 + }, + { + "epoch": 5.851726748587735, + "grad_norm": 0.3312103748321533, + "learning_rate": 0.0004250816326530612, + "loss": 3.5038, + "step": 20100 + }, + { + "epoch": 5.866286180187526, + "grad_norm": 0.3302570879459381, + "learning_rate": 0.0004246443148688046, + "loss": 3.5185, + "step": 20150 + }, + { + "epoch": 5.880845611787316, + "grad_norm": 0.3184104561805725, + "learning_rate": 0.00042420699708454806, + "loss": 3.5069, + "step": 20200 + }, + { + "epoch": 5.895405043387106, + "grad_norm": 0.31885817646980286, + "learning_rate": 0.00042376967930029153, + "loss": 3.5122, + "step": 20250 + }, + { + "epoch": 5.9099644749868965, + "grad_norm": 0.3231607973575592, + "learning_rate": 0.00042333236151603494, + "loss": 3.522, + "step": 20300 + }, + { + "epoch": 5.924523906586687, + "grad_norm": 0.3280011713504791, + "learning_rate": 0.0004228950437317784, + "loss": 3.5191, + "step": 20350 + }, + { + "epoch": 5.939083338186477, + "grad_norm": 0.32695943117141724, + "learning_rate": 0.0004224577259475218, + "loss": 3.5189, + "step": 20400 + }, + { + "epoch": 5.953642769786267, + "grad_norm": 0.31571418046951294, + "learning_rate": 0.0004220204081632653, + "loss": 3.506, + "step": 20450 + }, + { + "epoch": 5.968202201386058, + "grad_norm": 0.3223441243171692, + "learning_rate": 0.0004215830903790087, + "loss": 3.5298, + "step": 20500 + }, + { + "epoch": 5.982761632985849, + "grad_norm": 0.3090570569038391, + "learning_rate": 0.00042114577259475217, + "loss": 3.5086, + "step": 20550 + }, + { + "epoch": 5.9973210645856385, + "grad_norm": 0.32136136293411255, + "learning_rate": 0.0004207084548104956, + "loss": 3.5194, + "step": 20600 + }, + { + "epoch": 6.011647545279832, + "grad_norm": 0.33823925256729126, + "learning_rate": 0.000420271137026239, + "loss": 3.4225, + "step": 20650 + }, + { + "epoch": 6.026206976879623, + "grad_norm": 0.31168079376220703, + "learning_rate": 0.0004198338192419825, + "loss": 3.4097, + "step": 20700 + }, + { + "epoch": 6.040766408479413, + "grad_norm": 0.33379727602005005, + "learning_rate": 0.0004193965014577259, + "loss": 3.3974, + "step": 20750 + }, + { + "epoch": 6.055325840079203, + "grad_norm": 0.3196876645088196, + "learning_rate": 0.00041895918367346934, + "loss": 3.4086, + "step": 20800 + }, + { + "epoch": 6.069885271678993, + "grad_norm": 0.3263348937034607, + "learning_rate": 0.00041852186588921275, + "loss": 3.4206, + "step": 20850 + }, + { + "epoch": 6.084444703278784, + "grad_norm": 0.3413217067718506, + "learning_rate": 0.00041808454810495627, + "loss": 3.4143, + "step": 20900 + }, + { + "epoch": 6.099004134878574, + "grad_norm": 0.3205811381340027, + "learning_rate": 0.0004176472303206997, + "loss": 3.4236, + "step": 20950 + }, + { + "epoch": 6.113563566478365, + "grad_norm": 0.3402191996574402, + "learning_rate": 0.0004172099125364431, + "loss": 3.4196, + "step": 21000 + }, + { + "epoch": 6.113563566478365, + "eval_accuracy": 0.36323646113684954, + "eval_loss": 3.610100030899048, + "eval_runtime": 179.9878, + "eval_samples_per_second": 92.467, + "eval_steps_per_second": 5.784, + "step": 21000 + }, + { + "epoch": 6.128122998078155, + "grad_norm": 0.35015061497688293, + "learning_rate": 0.00041677259475218656, + "loss": 3.4323, + "step": 21050 + }, + { + "epoch": 6.142682429677945, + "grad_norm": 0.3365619480609894, + "learning_rate": 0.00041633527696792997, + "loss": 3.4316, + "step": 21100 + }, + { + "epoch": 6.157241861277735, + "grad_norm": 0.32558462023735046, + "learning_rate": 0.00041589795918367344, + "loss": 3.4261, + "step": 21150 + }, + { + "epoch": 6.171801292877526, + "grad_norm": 0.3229493498802185, + "learning_rate": 0.0004154606413994169, + "loss": 3.4273, + "step": 21200 + }, + { + "epoch": 6.186360724477316, + "grad_norm": 0.3373366594314575, + "learning_rate": 0.0004150233236151603, + "loss": 3.4241, + "step": 21250 + }, + { + "epoch": 6.200920156077107, + "grad_norm": 0.33470067381858826, + "learning_rate": 0.00041458600583090373, + "loss": 3.436, + "step": 21300 + }, + { + "epoch": 6.215479587676897, + "grad_norm": 0.33129194378852844, + "learning_rate": 0.00041414868804664725, + "loss": 3.4464, + "step": 21350 + }, + { + "epoch": 6.2300390192766875, + "grad_norm": 0.3305993676185608, + "learning_rate": 0.00041371137026239066, + "loss": 3.44, + "step": 21400 + }, + { + "epoch": 6.244598450876477, + "grad_norm": 0.3288079500198364, + "learning_rate": 0.0004132740524781341, + "loss": 3.4417, + "step": 21450 + }, + { + "epoch": 6.259157882476268, + "grad_norm": 0.33732712268829346, + "learning_rate": 0.0004128367346938775, + "loss": 3.445, + "step": 21500 + }, + { + "epoch": 6.273717314076059, + "grad_norm": 0.3398957848548889, + "learning_rate": 0.0004123994169096209, + "loss": 3.4473, + "step": 21550 + }, + { + "epoch": 6.288276745675849, + "grad_norm": 0.3353675305843353, + "learning_rate": 0.0004119620991253644, + "loss": 3.4428, + "step": 21600 + }, + { + "epoch": 6.302836177275639, + "grad_norm": 0.3312719464302063, + "learning_rate": 0.00041152478134110783, + "loss": 3.4346, + "step": 21650 + }, + { + "epoch": 6.3173956088754295, + "grad_norm": 0.32870662212371826, + "learning_rate": 0.00041108746355685125, + "loss": 3.4397, + "step": 21700 + }, + { + "epoch": 6.33195504047522, + "grad_norm": 0.3326077461242676, + "learning_rate": 0.0004106501457725947, + "loss": 3.4494, + "step": 21750 + }, + { + "epoch": 6.34651447207501, + "grad_norm": 0.32431626319885254, + "learning_rate": 0.0004102128279883382, + "loss": 3.435, + "step": 21800 + }, + { + "epoch": 6.3610739036748, + "grad_norm": 0.32606053352355957, + "learning_rate": 0.0004097755102040816, + "loss": 3.4515, + "step": 21850 + }, + { + "epoch": 6.375633335274591, + "grad_norm": 0.33837705850601196, + "learning_rate": 0.00040933819241982506, + "loss": 3.4578, + "step": 21900 + }, + { + "epoch": 6.390192766874382, + "grad_norm": 0.35296231508255005, + "learning_rate": 0.00040890087463556847, + "loss": 3.4563, + "step": 21950 + }, + { + "epoch": 6.4047521984741715, + "grad_norm": 0.3277094066143036, + "learning_rate": 0.0004084635568513119, + "loss": 3.4499, + "step": 22000 + }, + { + "epoch": 6.4047521984741715, + "eval_accuracy": 0.3639197405913266, + "eval_loss": 3.6030333042144775, + "eval_runtime": 180.1326, + "eval_samples_per_second": 92.393, + "eval_steps_per_second": 5.779, + "step": 22000 + }, + { + "epoch": 6.419311630073962, + "grad_norm": 0.3193458020687103, + "learning_rate": 0.0004080262390670554, + "loss": 3.4559, + "step": 22050 + }, + { + "epoch": 6.433871061673752, + "grad_norm": 0.3288237452507019, + "learning_rate": 0.0004075889212827988, + "loss": 3.4597, + "step": 22100 + }, + { + "epoch": 6.448430493273543, + "grad_norm": 0.3396027088165283, + "learning_rate": 0.00040715160349854223, + "loss": 3.4572, + "step": 22150 + }, + { + "epoch": 6.462989924873333, + "grad_norm": 0.3147648572921753, + "learning_rate": 0.00040671428571428564, + "loss": 3.4541, + "step": 22200 + }, + { + "epoch": 6.477549356473124, + "grad_norm": 0.31667134165763855, + "learning_rate": 0.00040627696793002916, + "loss": 3.4621, + "step": 22250 + }, + { + "epoch": 6.492108788072914, + "grad_norm": 0.34657663106918335, + "learning_rate": 0.0004058396501457726, + "loss": 3.4604, + "step": 22300 + }, + { + "epoch": 6.506668219672704, + "grad_norm": 0.32261285185813904, + "learning_rate": 0.000405402332361516, + "loss": 3.4612, + "step": 22350 + }, + { + "epoch": 6.521227651272494, + "grad_norm": 0.3422505855560303, + "learning_rate": 0.00040496501457725945, + "loss": 3.4559, + "step": 22400 + }, + { + "epoch": 6.535787082872285, + "grad_norm": 0.3316167891025543, + "learning_rate": 0.00040452769679300287, + "loss": 3.4496, + "step": 22450 + }, + { + "epoch": 6.550346514472075, + "grad_norm": 0.3352113962173462, + "learning_rate": 0.00040409037900874633, + "loss": 3.4656, + "step": 22500 + }, + { + "epoch": 6.564905946071866, + "grad_norm": 0.3302208185195923, + "learning_rate": 0.00040365306122448974, + "loss": 3.444, + "step": 22550 + }, + { + "epoch": 6.579465377671656, + "grad_norm": 0.3382601737976074, + "learning_rate": 0.0004032157434402332, + "loss": 3.4749, + "step": 22600 + }, + { + "epoch": 6.594024809271446, + "grad_norm": 0.32733502984046936, + "learning_rate": 0.0004027784256559766, + "loss": 3.4676, + "step": 22650 + }, + { + "epoch": 6.608584240871236, + "grad_norm": 0.3271522521972656, + "learning_rate": 0.0004023411078717201, + "loss": 3.4625, + "step": 22700 + }, + { + "epoch": 6.623143672471027, + "grad_norm": 0.35525378584861755, + "learning_rate": 0.00040190379008746356, + "loss": 3.47, + "step": 22750 + }, + { + "epoch": 6.637703104070817, + "grad_norm": 0.34130969643592834, + "learning_rate": 0.00040146647230320697, + "loss": 3.4531, + "step": 22800 + }, + { + "epoch": 6.652262535670608, + "grad_norm": 0.3281981647014618, + "learning_rate": 0.0004010291545189504, + "loss": 3.4675, + "step": 22850 + }, + { + "epoch": 6.666821967270398, + "grad_norm": 0.3403642475605011, + "learning_rate": 0.0004005918367346938, + "loss": 3.4779, + "step": 22900 + }, + { + "epoch": 6.6813813988701884, + "grad_norm": 0.35402730107307434, + "learning_rate": 0.0004001545189504373, + "loss": 3.4748, + "step": 22950 + }, + { + "epoch": 6.695940830469978, + "grad_norm": 0.3444899618625641, + "learning_rate": 0.0003997172011661807, + "loss": 3.4726, + "step": 23000 + }, + { + "epoch": 6.695940830469978, + "eval_accuracy": 0.36462571371896035, + "eval_loss": 3.593517541885376, + "eval_runtime": 180.0181, + "eval_samples_per_second": 92.452, + "eval_steps_per_second": 5.783, + "step": 23000 + }, + { + "epoch": 6.710500262069769, + "grad_norm": 0.3251613676548004, + "learning_rate": 0.00039927988338192414, + "loss": 3.4531, + "step": 23050 + }, + { + "epoch": 6.725059693669559, + "grad_norm": 0.3224335312843323, + "learning_rate": 0.0003988425655976676, + "loss": 3.4575, + "step": 23100 + }, + { + "epoch": 6.73961912526935, + "grad_norm": 0.3301301598548889, + "learning_rate": 0.00039840524781341107, + "loss": 3.4608, + "step": 23150 + }, + { + "epoch": 6.75417855686914, + "grad_norm": 0.3267367482185364, + "learning_rate": 0.0003979679300291545, + "loss": 3.4722, + "step": 23200 + }, + { + "epoch": 6.7687379884689305, + "grad_norm": 0.34791019558906555, + "learning_rate": 0.00039753061224489795, + "loss": 3.4737, + "step": 23250 + }, + { + "epoch": 6.78329742006872, + "grad_norm": 0.3289180099964142, + "learning_rate": 0.00039709329446064136, + "loss": 3.4727, + "step": 23300 + }, + { + "epoch": 6.797856851668511, + "grad_norm": 0.326933890581131, + "learning_rate": 0.0003966559766763848, + "loss": 3.4499, + "step": 23350 + }, + { + "epoch": 6.812416283268301, + "grad_norm": 0.3207686245441437, + "learning_rate": 0.0003962186588921283, + "loss": 3.4833, + "step": 23400 + }, + { + "epoch": 6.826975714868092, + "grad_norm": 0.34312567114830017, + "learning_rate": 0.0003957813411078717, + "loss": 3.4743, + "step": 23450 + }, + { + "epoch": 6.841535146467882, + "grad_norm": 0.32261767983436584, + "learning_rate": 0.0003953440233236151, + "loss": 3.4717, + "step": 23500 + }, + { + "epoch": 6.8560945780676725, + "grad_norm": 0.3239823579788208, + "learning_rate": 0.00039490670553935853, + "loss": 3.4745, + "step": 23550 + }, + { + "epoch": 6.870654009667462, + "grad_norm": 0.3287251889705658, + "learning_rate": 0.00039446938775510195, + "loss": 3.4706, + "step": 23600 + }, + { + "epoch": 6.885213441267253, + "grad_norm": 0.33744481205940247, + "learning_rate": 0.00039403206997084547, + "loss": 3.4712, + "step": 23650 + }, + { + "epoch": 6.899772872867043, + "grad_norm": 0.33784157037734985, + "learning_rate": 0.0003935947521865889, + "loss": 3.4744, + "step": 23700 + }, + { + "epoch": 6.914332304466834, + "grad_norm": 0.33039554953575134, + "learning_rate": 0.0003931574344023323, + "loss": 3.4635, + "step": 23750 + }, + { + "epoch": 6.928891736066624, + "grad_norm": 0.3252994418144226, + "learning_rate": 0.00039272011661807576, + "loss": 3.4699, + "step": 23800 + }, + { + "epoch": 6.943451167666415, + "grad_norm": 0.33139488101005554, + "learning_rate": 0.0003922827988338192, + "loss": 3.4763, + "step": 23850 + }, + { + "epoch": 6.9580105992662045, + "grad_norm": 0.3324434757232666, + "learning_rate": 0.00039184548104956264, + "loss": 3.467, + "step": 23900 + }, + { + "epoch": 6.972570030865995, + "grad_norm": 0.32796579599380493, + "learning_rate": 0.0003914081632653061, + "loss": 3.4705, + "step": 23950 + }, + { + "epoch": 6.987129462465785, + "grad_norm": 0.32173773646354675, + "learning_rate": 0.0003909708454810495, + "loss": 3.4734, + "step": 24000 + }, + { + "epoch": 6.987129462465785, + "eval_accuracy": 0.36566327315904046, + "eval_loss": 3.5826520919799805, + "eval_runtime": 179.8608, + "eval_samples_per_second": 92.533, + "eval_steps_per_second": 5.788, + "step": 24000 + }, + { + "epoch": 7.001455943159979, + "grad_norm": 0.3372614085674286, + "learning_rate": 0.00039053352769679293, + "loss": 3.4591, + "step": 24050 + }, + { + "epoch": 7.016015374759769, + "grad_norm": 0.3403565585613251, + "learning_rate": 0.00039009620991253645, + "loss": 3.3491, + "step": 24100 + }, + { + "epoch": 7.03057480635956, + "grad_norm": 0.34883126616477966, + "learning_rate": 0.00038965889212827986, + "loss": 3.3559, + "step": 24150 + }, + { + "epoch": 7.04513423795935, + "grad_norm": 0.3384236693382263, + "learning_rate": 0.0003892215743440233, + "loss": 3.3671, + "step": 24200 + }, + { + "epoch": 7.059693669559141, + "grad_norm": 0.33237436413764954, + "learning_rate": 0.0003887842565597667, + "loss": 3.3692, + "step": 24250 + }, + { + "epoch": 7.074253101158931, + "grad_norm": 0.35221633315086365, + "learning_rate": 0.0003883469387755102, + "loss": 3.3795, + "step": 24300 + }, + { + "epoch": 7.0888125327587215, + "grad_norm": 0.33727243542671204, + "learning_rate": 0.0003879096209912536, + "loss": 3.3848, + "step": 24350 + }, + { + "epoch": 7.103371964358511, + "grad_norm": 0.34708696603775024, + "learning_rate": 0.00038747230320699703, + "loss": 3.3819, + "step": 24400 + }, + { + "epoch": 7.117931395958302, + "grad_norm": 0.3307049572467804, + "learning_rate": 0.0003870349854227405, + "loss": 3.3914, + "step": 24450 + }, + { + "epoch": 7.132490827558092, + "grad_norm": 0.3246367871761322, + "learning_rate": 0.0003865976676384839, + "loss": 3.3867, + "step": 24500 + }, + { + "epoch": 7.147050259157883, + "grad_norm": 0.33385294675827026, + "learning_rate": 0.0003861603498542274, + "loss": 3.3878, + "step": 24550 + }, + { + "epoch": 7.161609690757673, + "grad_norm": 0.35358157753944397, + "learning_rate": 0.0003857230320699708, + "loss": 3.3848, + "step": 24600 + }, + { + "epoch": 7.1761691223574635, + "grad_norm": 0.3381134271621704, + "learning_rate": 0.00038528571428571426, + "loss": 3.381, + "step": 24650 + }, + { + "epoch": 7.190728553957253, + "grad_norm": 0.33539456129074097, + "learning_rate": 0.00038484839650145767, + "loss": 3.391, + "step": 24700 + }, + { + "epoch": 7.205287985557044, + "grad_norm": 0.3288535475730896, + "learning_rate": 0.00038441107871720114, + "loss": 3.3846, + "step": 24750 + }, + { + "epoch": 7.219847417156834, + "grad_norm": 0.3503969609737396, + "learning_rate": 0.0003839737609329446, + "loss": 3.3942, + "step": 24800 + }, + { + "epoch": 7.234406848756625, + "grad_norm": 0.34089216589927673, + "learning_rate": 0.000383536443148688, + "loss": 3.4039, + "step": 24850 + }, + { + "epoch": 7.248966280356415, + "grad_norm": 0.33822911977767944, + "learning_rate": 0.00038309912536443143, + "loss": 3.3938, + "step": 24900 + }, + { + "epoch": 7.2635257119562056, + "grad_norm": 0.34553372859954834, + "learning_rate": 0.00038266180758017484, + "loss": 3.4007, + "step": 24950 + }, + { + "epoch": 7.2780851435559955, + "grad_norm": 0.34171566367149353, + "learning_rate": 0.00038222448979591836, + "loss": 3.4074, + "step": 25000 + }, + { + "epoch": 7.2780851435559955, + "eval_accuracy": 0.3654761973352454, + "eval_loss": 3.590606212615967, + "eval_runtime": 179.9086, + "eval_samples_per_second": 92.508, + "eval_steps_per_second": 5.786, + "step": 25000 + }, + { + "epoch": 7.292644575155786, + "grad_norm": 0.33524560928344727, + "learning_rate": 0.00038178717201166177, + "loss": 3.4011, + "step": 25050 + }, + { + "epoch": 7.307204006755576, + "grad_norm": 0.3223832845687866, + "learning_rate": 0.0003813498542274052, + "loss": 3.4039, + "step": 25100 + }, + { + "epoch": 7.321763438355367, + "grad_norm": 0.33097726106643677, + "learning_rate": 0.00038091253644314865, + "loss": 3.4045, + "step": 25150 + }, + { + "epoch": 7.336322869955157, + "grad_norm": 0.3478914201259613, + "learning_rate": 0.0003804752186588921, + "loss": 3.4058, + "step": 25200 + }, + { + "epoch": 7.350882301554948, + "grad_norm": 0.36241742968559265, + "learning_rate": 0.00038003790087463553, + "loss": 3.4074, + "step": 25250 + }, + { + "epoch": 7.3654417331547375, + "grad_norm": 0.3258671164512634, + "learning_rate": 0.000379600583090379, + "loss": 3.4111, + "step": 25300 + }, + { + "epoch": 7.380001164754528, + "grad_norm": 0.34953588247299194, + "learning_rate": 0.0003791632653061224, + "loss": 3.4106, + "step": 25350 + }, + { + "epoch": 7.394560596354318, + "grad_norm": 0.33248066902160645, + "learning_rate": 0.0003787259475218658, + "loss": 3.4044, + "step": 25400 + }, + { + "epoch": 7.409120027954109, + "grad_norm": 0.3584959805011749, + "learning_rate": 0.00037828862973760934, + "loss": 3.4181, + "step": 25450 + }, + { + "epoch": 7.423679459553899, + "grad_norm": 0.3388174772262573, + "learning_rate": 0.00037785131195335276, + "loss": 3.4183, + "step": 25500 + }, + { + "epoch": 7.43823889115369, + "grad_norm": 0.3261428773403168, + "learning_rate": 0.00037741399416909617, + "loss": 3.421, + "step": 25550 + }, + { + "epoch": 7.4527983227534795, + "grad_norm": 0.33287757635116577, + "learning_rate": 0.0003769766763848396, + "loss": 3.4161, + "step": 25600 + }, + { + "epoch": 7.46735775435327, + "grad_norm": 0.3487517237663269, + "learning_rate": 0.0003765393586005831, + "loss": 3.4124, + "step": 25650 + }, + { + "epoch": 7.48191718595306, + "grad_norm": 0.3360964357852936, + "learning_rate": 0.0003761020408163265, + "loss": 3.4114, + "step": 25700 + }, + { + "epoch": 7.496476617552851, + "grad_norm": 0.339276522397995, + "learning_rate": 0.0003756647230320699, + "loss": 3.4243, + "step": 25750 + }, + { + "epoch": 7.511036049152641, + "grad_norm": 0.34367257356643677, + "learning_rate": 0.00037522740524781334, + "loss": 3.422, + "step": 25800 + }, + { + "epoch": 7.525595480752432, + "grad_norm": 0.33407968282699585, + "learning_rate": 0.0003747900874635568, + "loss": 3.4223, + "step": 25850 + }, + { + "epoch": 7.540154912352222, + "grad_norm": 0.3418211042881012, + "learning_rate": 0.00037435276967930027, + "loss": 3.4312, + "step": 25900 + }, + { + "epoch": 7.554714343952012, + "grad_norm": 0.3525235950946808, + "learning_rate": 0.0003739154518950437, + "loss": 3.436, + "step": 25950 + }, + { + "epoch": 7.569273775551802, + "grad_norm": 0.35437750816345215, + "learning_rate": 0.00037347813411078715, + "loss": 3.4173, + "step": 26000 + }, + { + "epoch": 7.569273775551802, + "eval_accuracy": 0.36623061498796, + "eval_loss": 3.5847508907318115, + "eval_runtime": 179.8238, + "eval_samples_per_second": 92.552, + "eval_steps_per_second": 5.789, + "step": 26000 + }, + { + "epoch": 7.583833207151593, + "grad_norm": 0.36680832505226135, + "learning_rate": 0.00037304081632653056, + "loss": 3.4282, + "step": 26050 + }, + { + "epoch": 7.598392638751383, + "grad_norm": 0.32975292205810547, + "learning_rate": 0.00037260349854227403, + "loss": 3.4187, + "step": 26100 + }, + { + "epoch": 7.612952070351174, + "grad_norm": 0.33258336782455444, + "learning_rate": 0.0003721661807580175, + "loss": 3.419, + "step": 26150 + }, + { + "epoch": 7.627511501950964, + "grad_norm": 0.3520626723766327, + "learning_rate": 0.0003717288629737609, + "loss": 3.4317, + "step": 26200 + }, + { + "epoch": 7.642070933550754, + "grad_norm": 0.34615185856819153, + "learning_rate": 0.0003712915451895043, + "loss": 3.4397, + "step": 26250 + }, + { + "epoch": 7.656630365150544, + "grad_norm": 0.3472108542919159, + "learning_rate": 0.00037085422740524773, + "loss": 3.4183, + "step": 26300 + }, + { + "epoch": 7.671189796750335, + "grad_norm": 0.3401790261268616, + "learning_rate": 0.00037041690962099125, + "loss": 3.4274, + "step": 26350 + }, + { + "epoch": 7.685749228350125, + "grad_norm": 0.34616005420684814, + "learning_rate": 0.00036997959183673467, + "loss": 3.432, + "step": 26400 + }, + { + "epoch": 7.700308659949916, + "grad_norm": 0.35238298773765564, + "learning_rate": 0.0003695422740524781, + "loss": 3.4308, + "step": 26450 + }, + { + "epoch": 7.714868091549706, + "grad_norm": 0.3595859408378601, + "learning_rate": 0.00036910495626822154, + "loss": 3.4368, + "step": 26500 + }, + { + "epoch": 7.729427523149496, + "grad_norm": 0.3455177843570709, + "learning_rate": 0.000368667638483965, + "loss": 3.4389, + "step": 26550 + }, + { + "epoch": 7.743986954749286, + "grad_norm": 0.3548458516597748, + "learning_rate": 0.0003682303206997084, + "loss": 3.427, + "step": 26600 + }, + { + "epoch": 7.758546386349077, + "grad_norm": 0.34287944436073303, + "learning_rate": 0.00036779300291545184, + "loss": 3.4268, + "step": 26650 + }, + { + "epoch": 7.773105817948867, + "grad_norm": 0.3392084836959839, + "learning_rate": 0.0003673556851311953, + "loss": 3.4348, + "step": 26700 + }, + { + "epoch": 7.787665249548658, + "grad_norm": 0.3404330015182495, + "learning_rate": 0.0003669183673469387, + "loss": 3.4162, + "step": 26750 + }, + { + "epoch": 7.802224681148448, + "grad_norm": 0.34505772590637207, + "learning_rate": 0.0003664810495626822, + "loss": 3.4267, + "step": 26800 + }, + { + "epoch": 7.8167841127482385, + "grad_norm": 0.34470146894454956, + "learning_rate": 0.00036604373177842565, + "loss": 3.444, + "step": 26850 + }, + { + "epoch": 7.831343544348028, + "grad_norm": 0.33952096104621887, + "learning_rate": 0.00036560641399416906, + "loss": 3.4317, + "step": 26900 + }, + { + "epoch": 7.845902975947819, + "grad_norm": 0.35519036650657654, + "learning_rate": 0.0003651690962099125, + "loss": 3.4335, + "step": 26950 + }, + { + "epoch": 7.860462407547609, + "grad_norm": 0.3539314568042755, + "learning_rate": 0.000364731778425656, + "loss": 3.4318, + "step": 27000 + }, + { + "epoch": 7.860462407547609, + "eval_accuracy": 0.36703947393949116, + "eval_loss": 3.572652578353882, + "eval_runtime": 179.9477, + "eval_samples_per_second": 92.488, + "eval_steps_per_second": 5.785, + "step": 27000 + }, + { + "epoch": 7.8750218391474, + "grad_norm": 0.3434327244758606, + "learning_rate": 0.0003642944606413994, + "loss": 3.4372, + "step": 27050 + }, + { + "epoch": 7.88958127074719, + "grad_norm": 0.3285406231880188, + "learning_rate": 0.0003638571428571428, + "loss": 3.4238, + "step": 27100 + }, + { + "epoch": 7.9041407023469805, + "grad_norm": 0.3453764021396637, + "learning_rate": 0.00036341982507288623, + "loss": 3.4369, + "step": 27150 + }, + { + "epoch": 7.91870013394677, + "grad_norm": 0.32807591557502747, + "learning_rate": 0.0003629825072886297, + "loss": 3.4333, + "step": 27200 + }, + { + "epoch": 7.933259565546561, + "grad_norm": 0.33627721667289734, + "learning_rate": 0.00036254518950437316, + "loss": 3.4378, + "step": 27250 + }, + { + "epoch": 7.947818997146351, + "grad_norm": 0.3372686207294464, + "learning_rate": 0.0003621078717201166, + "loss": 3.4311, + "step": 27300 + }, + { + "epoch": 7.962378428746142, + "grad_norm": 0.3440007269382477, + "learning_rate": 0.00036167055393586004, + "loss": 3.4387, + "step": 27350 + }, + { + "epoch": 7.976937860345932, + "grad_norm": 0.3503531813621521, + "learning_rate": 0.00036123323615160346, + "loss": 3.435, + "step": 27400 + }, + { + "epoch": 7.991497291945723, + "grad_norm": 0.32801827788352966, + "learning_rate": 0.0003607959183673469, + "loss": 3.4472, + "step": 27450 + }, + { + "epoch": 8.005823772639916, + "grad_norm": 0.35048505663871765, + "learning_rate": 0.0003603586005830904, + "loss": 3.3812, + "step": 27500 + }, + { + "epoch": 8.020383204239707, + "grad_norm": 0.3340410888195038, + "learning_rate": 0.0003599212827988338, + "loss": 3.3375, + "step": 27550 + }, + { + "epoch": 8.034942635839498, + "grad_norm": 0.34667739272117615, + "learning_rate": 0.0003594839650145772, + "loss": 3.3225, + "step": 27600 + }, + { + "epoch": 8.049502067439287, + "grad_norm": 0.3594329059123993, + "learning_rate": 0.0003590466472303206, + "loss": 3.335, + "step": 27650 + }, + { + "epoch": 8.064061499039077, + "grad_norm": 0.3574943244457245, + "learning_rate": 0.00035860932944606415, + "loss": 3.3357, + "step": 27700 + }, + { + "epoch": 8.078620930638868, + "grad_norm": 0.3481893539428711, + "learning_rate": 0.00035817201166180756, + "loss": 3.3318, + "step": 27750 + }, + { + "epoch": 8.093180362238659, + "grad_norm": 0.3395717144012451, + "learning_rate": 0.00035773469387755097, + "loss": 3.3532, + "step": 27800 + }, + { + "epoch": 8.107739793838448, + "grad_norm": 0.33678963780403137, + "learning_rate": 0.0003572973760932944, + "loss": 3.3457, + "step": 27850 + }, + { + "epoch": 8.122299225438239, + "grad_norm": 0.3517054617404938, + "learning_rate": 0.0003568600583090379, + "loss": 3.3469, + "step": 27900 + }, + { + "epoch": 8.13685865703803, + "grad_norm": 0.3345564603805542, + "learning_rate": 0.0003564227405247813, + "loss": 3.3386, + "step": 27950 + }, + { + "epoch": 8.15141808863782, + "grad_norm": 0.3494488596916199, + "learning_rate": 0.00035598542274052473, + "loss": 3.3639, + "step": 28000 + }, + { + "epoch": 8.15141808863782, + "eval_accuracy": 0.3670094900708125, + "eval_loss": 3.580695867538452, + "eval_runtime": 179.627, + "eval_samples_per_second": 92.653, + "eval_steps_per_second": 5.795, + "step": 28000 + }, + { + "epoch": 8.16597752023761, + "grad_norm": 0.33706334233283997, + "learning_rate": 0.0003555481049562682, + "loss": 3.369, + "step": 28050 + }, + { + "epoch": 8.1805369518374, + "grad_norm": 0.34689971804618835, + "learning_rate": 0.0003551107871720116, + "loss": 3.3533, + "step": 28100 + }, + { + "epoch": 8.19509638343719, + "grad_norm": 0.34161534905433655, + "learning_rate": 0.0003546734693877551, + "loss": 3.375, + "step": 28150 + }, + { + "epoch": 8.209655815036982, + "grad_norm": 0.36119794845581055, + "learning_rate": 0.00035423615160349854, + "loss": 3.3611, + "step": 28200 + }, + { + "epoch": 8.22421524663677, + "grad_norm": 0.3473355174064636, + "learning_rate": 0.00035379883381924195, + "loss": 3.3731, + "step": 28250 + }, + { + "epoch": 8.238774678236561, + "grad_norm": 0.33798742294311523, + "learning_rate": 0.00035336151603498537, + "loss": 3.3692, + "step": 28300 + }, + { + "epoch": 8.253334109836352, + "grad_norm": 0.3432019352912903, + "learning_rate": 0.0003529241982507289, + "loss": 3.3687, + "step": 28350 + }, + { + "epoch": 8.267893541436143, + "grad_norm": 0.35700732469558716, + "learning_rate": 0.0003524868804664723, + "loss": 3.3723, + "step": 28400 + }, + { + "epoch": 8.282452973035932, + "grad_norm": 0.348431795835495, + "learning_rate": 0.0003520495626822157, + "loss": 3.3593, + "step": 28450 + }, + { + "epoch": 8.297012404635723, + "grad_norm": 0.34419500827789307, + "learning_rate": 0.0003516122448979591, + "loss": 3.3778, + "step": 28500 + }, + { + "epoch": 8.311571836235514, + "grad_norm": 0.34864479303359985, + "learning_rate": 0.0003511749271137026, + "loss": 3.3809, + "step": 28550 + }, + { + "epoch": 8.326131267835304, + "grad_norm": 0.35667717456817627, + "learning_rate": 0.00035073760932944606, + "loss": 3.363, + "step": 28600 + }, + { + "epoch": 8.340690699435093, + "grad_norm": 0.3501654863357544, + "learning_rate": 0.00035030029154518947, + "loss": 3.378, + "step": 28650 + }, + { + "epoch": 8.355250131034884, + "grad_norm": 0.3490404486656189, + "learning_rate": 0.0003498629737609329, + "loss": 3.3855, + "step": 28700 + }, + { + "epoch": 8.369809562634675, + "grad_norm": 0.358019083738327, + "learning_rate": 0.00034942565597667635, + "loss": 3.3784, + "step": 28750 + }, + { + "epoch": 8.384368994234466, + "grad_norm": 0.33226025104522705, + "learning_rate": 0.0003489883381924198, + "loss": 3.3714, + "step": 28800 + }, + { + "epoch": 8.398928425834255, + "grad_norm": 0.3402322828769684, + "learning_rate": 0.00034855102040816323, + "loss": 3.3785, + "step": 28850 + }, + { + "epoch": 8.413487857434045, + "grad_norm": 0.36141642928123474, + "learning_rate": 0.0003481137026239067, + "loss": 3.3745, + "step": 28900 + }, + { + "epoch": 8.428047289033836, + "grad_norm": 0.36371850967407227, + "learning_rate": 0.0003476763848396501, + "loss": 3.3874, + "step": 28950 + }, + { + "epoch": 8.442606720633627, + "grad_norm": 0.3497146666049957, + "learning_rate": 0.0003472390670553935, + "loss": 3.3846, + "step": 29000 + }, + { + "epoch": 8.442606720633627, + "eval_accuracy": 0.3678150566759789, + "eval_loss": 3.572382688522339, + "eval_runtime": 179.6973, + "eval_samples_per_second": 92.617, + "eval_steps_per_second": 5.793, + "step": 29000 + }, + { + "epoch": 8.457166152233416, + "grad_norm": 0.3523021936416626, + "learning_rate": 0.00034680174927113704, + "loss": 3.3833, + "step": 29050 + }, + { + "epoch": 8.471725583833207, + "grad_norm": 0.3318672180175781, + "learning_rate": 0.00034636443148688045, + "loss": 3.3856, + "step": 29100 + }, + { + "epoch": 8.486285015432998, + "grad_norm": 0.34436580538749695, + "learning_rate": 0.00034592711370262386, + "loss": 3.392, + "step": 29150 + }, + { + "epoch": 8.500844447032788, + "grad_norm": 0.3374488651752472, + "learning_rate": 0.0003454897959183673, + "loss": 3.3784, + "step": 29200 + }, + { + "epoch": 8.515403878632577, + "grad_norm": 0.3651833236217499, + "learning_rate": 0.0003450524781341108, + "loss": 3.3695, + "step": 29250 + }, + { + "epoch": 8.529963310232368, + "grad_norm": 0.33650752902030945, + "learning_rate": 0.0003446151603498542, + "loss": 3.3818, + "step": 29300 + }, + { + "epoch": 8.544522741832159, + "grad_norm": 0.3391404449939728, + "learning_rate": 0.0003441778425655976, + "loss": 3.3997, + "step": 29350 + }, + { + "epoch": 8.55908217343195, + "grad_norm": 0.3535376787185669, + "learning_rate": 0.0003437405247813411, + "loss": 3.3867, + "step": 29400 + }, + { + "epoch": 8.573641605031739, + "grad_norm": 0.3420208692550659, + "learning_rate": 0.0003433032069970845, + "loss": 3.3834, + "step": 29450 + }, + { + "epoch": 8.58820103663153, + "grad_norm": 0.3331069052219391, + "learning_rate": 0.00034286588921282797, + "loss": 3.3903, + "step": 29500 + }, + { + "epoch": 8.60276046823132, + "grad_norm": 0.3587231934070587, + "learning_rate": 0.00034242857142857143, + "loss": 3.3888, + "step": 29550 + }, + { + "epoch": 8.617319899831111, + "grad_norm": 0.35839417576789856, + "learning_rate": 0.00034199125364431485, + "loss": 3.404, + "step": 29600 + }, + { + "epoch": 8.6318793314309, + "grad_norm": 0.3896600902080536, + "learning_rate": 0.00034155393586005826, + "loss": 3.3906, + "step": 29650 + }, + { + "epoch": 8.646438763030691, + "grad_norm": 0.35471850633621216, + "learning_rate": 0.0003411166180758017, + "loss": 3.3922, + "step": 29700 + }, + { + "epoch": 8.660998194630482, + "grad_norm": 0.3513423800468445, + "learning_rate": 0.0003406793002915452, + "loss": 3.3873, + "step": 29750 + }, + { + "epoch": 8.675557626230272, + "grad_norm": 0.34752732515335083, + "learning_rate": 0.0003402419825072886, + "loss": 3.3855, + "step": 29800 + }, + { + "epoch": 8.690117057830061, + "grad_norm": 0.32745492458343506, + "learning_rate": 0.000339804664723032, + "loss": 3.3934, + "step": 29850 + }, + { + "epoch": 8.704676489429852, + "grad_norm": 0.3485073745250702, + "learning_rate": 0.00033936734693877543, + "loss": 3.3878, + "step": 29900 + }, + { + "epoch": 8.719235921029643, + "grad_norm": 0.3374342620372772, + "learning_rate": 0.00033893002915451895, + "loss": 3.388, + "step": 29950 + }, + { + "epoch": 8.733795352629434, + "grad_norm": 0.3508179187774658, + "learning_rate": 0.00033849271137026236, + "loss": 3.3893, + "step": 30000 + }, + { + "epoch": 8.733795352629434, + "eval_accuracy": 0.3683345418988114, + "eval_loss": 3.5642998218536377, + "eval_runtime": 179.3326, + "eval_samples_per_second": 92.805, + "eval_steps_per_second": 5.805, + "step": 30000 + }, + { + "epoch": 8.748354784229225, + "grad_norm": 0.3522128760814667, + "learning_rate": 0.0003380553935860058, + "loss": 3.4013, + "step": 30050 + }, + { + "epoch": 8.762914215829014, + "grad_norm": 0.3406279385089874, + "learning_rate": 0.00033761807580174924, + "loss": 3.4014, + "step": 30100 + }, + { + "epoch": 8.777473647428804, + "grad_norm": 0.33040550351142883, + "learning_rate": 0.0003371807580174927, + "loss": 3.392, + "step": 30150 + }, + { + "epoch": 8.792033079028595, + "grad_norm": 0.35470637679100037, + "learning_rate": 0.0003367434402332361, + "loss": 3.3986, + "step": 30200 + }, + { + "epoch": 8.806592510628384, + "grad_norm": 0.35664665699005127, + "learning_rate": 0.0003363061224489796, + "loss": 3.4054, + "step": 30250 + }, + { + "epoch": 8.821151942228175, + "grad_norm": 0.35443365573883057, + "learning_rate": 0.000335868804664723, + "loss": 3.3916, + "step": 30300 + }, + { + "epoch": 8.835711373827966, + "grad_norm": 0.3552112579345703, + "learning_rate": 0.0003354314868804664, + "loss": 3.4106, + "step": 30350 + }, + { + "epoch": 8.850270805427757, + "grad_norm": 0.3517363667488098, + "learning_rate": 0.00033499416909620993, + "loss": 3.3959, + "step": 30400 + }, + { + "epoch": 8.864830237027547, + "grad_norm": 0.3412357568740845, + "learning_rate": 0.00033455685131195335, + "loss": 3.3977, + "step": 30450 + }, + { + "epoch": 8.879389668627336, + "grad_norm": 0.3659086227416992, + "learning_rate": 0.00033411953352769676, + "loss": 3.4041, + "step": 30500 + }, + { + "epoch": 8.893949100227127, + "grad_norm": 0.3394777476787567, + "learning_rate": 0.00033368221574344017, + "loss": 3.3925, + "step": 30550 + }, + { + "epoch": 8.908508531826918, + "grad_norm": 0.3358438014984131, + "learning_rate": 0.0003332448979591837, + "loss": 3.3954, + "step": 30600 + }, + { + "epoch": 8.923067963426707, + "grad_norm": 0.3618221879005432, + "learning_rate": 0.0003328075801749271, + "loss": 3.3963, + "step": 30650 + }, + { + "epoch": 8.937627395026498, + "grad_norm": 0.35156282782554626, + "learning_rate": 0.0003323702623906705, + "loss": 3.3973, + "step": 30700 + }, + { + "epoch": 8.952186826626289, + "grad_norm": 0.3404799997806549, + "learning_rate": 0.00033193294460641393, + "loss": 3.4064, + "step": 30750 + }, + { + "epoch": 8.96674625822608, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0003314956268221574, + "loss": 3.3962, + "step": 30800 + }, + { + "epoch": 8.98130568982587, + "grad_norm": 0.3326402008533478, + "learning_rate": 0.00033105830903790086, + "loss": 3.3904, + "step": 30850 + }, + { + "epoch": 8.995865121425659, + "grad_norm": 0.3333438038825989, + "learning_rate": 0.0003306209912536443, + "loss": 3.4072, + "step": 30900 + }, + { + "epoch": 9.010191602119853, + "grad_norm": 0.35445913672447205, + "learning_rate": 0.00033018367346938774, + "loss": 3.3354, + "step": 30950 + }, + { + "epoch": 9.024751033719644, + "grad_norm": 0.3602832555770874, + "learning_rate": 0.00032974635568513115, + "loss": 3.2997, + "step": 31000 + }, + { + "epoch": 9.024751033719644, + "eval_accuracy": 0.3683871018568481, + "eval_loss": 3.56929087638855, + "eval_runtime": 180.7113, + "eval_samples_per_second": 92.097, + "eval_steps_per_second": 5.761, + "step": 31000 + }, + { + "epoch": 9.039310465319433, + "grad_norm": 0.33829203248023987, + "learning_rate": 0.0003293090379008746, + "loss": 3.2994, + "step": 31050 + }, + { + "epoch": 9.053869896919224, + "grad_norm": 0.36634117364883423, + "learning_rate": 0.0003288717201166181, + "loss": 3.3039, + "step": 31100 + }, + { + "epoch": 9.068429328519015, + "grad_norm": 0.34743866324424744, + "learning_rate": 0.0003284344023323615, + "loss": 3.3133, + "step": 31150 + }, + { + "epoch": 9.082988760118806, + "grad_norm": 0.3573026657104492, + "learning_rate": 0.0003279970845481049, + "loss": 3.3067, + "step": 31200 + }, + { + "epoch": 9.097548191718595, + "grad_norm": 0.3499259650707245, + "learning_rate": 0.0003275597667638483, + "loss": 3.311, + "step": 31250 + }, + { + "epoch": 9.112107623318385, + "grad_norm": 0.3550528287887573, + "learning_rate": 0.00032712244897959184, + "loss": 3.3146, + "step": 31300 + }, + { + "epoch": 9.126667054918176, + "grad_norm": 0.3766951262950897, + "learning_rate": 0.00032668513119533526, + "loss": 3.3203, + "step": 31350 + }, + { + "epoch": 9.141226486517967, + "grad_norm": 0.3506350517272949, + "learning_rate": 0.00032624781341107867, + "loss": 3.3383, + "step": 31400 + }, + { + "epoch": 9.155785918117756, + "grad_norm": 0.36587440967559814, + "learning_rate": 0.00032581049562682213, + "loss": 3.3249, + "step": 31450 + }, + { + "epoch": 9.170345349717547, + "grad_norm": 0.3548264503479004, + "learning_rate": 0.0003253731778425656, + "loss": 3.3173, + "step": 31500 + }, + { + "epoch": 9.184904781317337, + "grad_norm": 0.3574599325656891, + "learning_rate": 0.000324935860058309, + "loss": 3.3277, + "step": 31550 + }, + { + "epoch": 9.199464212917128, + "grad_norm": 0.3559187948703766, + "learning_rate": 0.0003244985422740524, + "loss": 3.3302, + "step": 31600 + }, + { + "epoch": 9.214023644516917, + "grad_norm": 0.3626471757888794, + "learning_rate": 0.0003240612244897959, + "loss": 3.3249, + "step": 31650 + }, + { + "epoch": 9.228583076116708, + "grad_norm": 0.34642550349235535, + "learning_rate": 0.0003236239067055393, + "loss": 3.3288, + "step": 31700 + }, + { + "epoch": 9.243142507716499, + "grad_norm": 0.3562052249908447, + "learning_rate": 0.00032318658892128277, + "loss": 3.3381, + "step": 31750 + }, + { + "epoch": 9.25770193931629, + "grad_norm": 0.35299643874168396, + "learning_rate": 0.00032274927113702624, + "loss": 3.3398, + "step": 31800 + }, + { + "epoch": 9.272261370916079, + "grad_norm": 0.3579034209251404, + "learning_rate": 0.00032231195335276965, + "loss": 3.3376, + "step": 31850 + }, + { + "epoch": 9.28682080251587, + "grad_norm": 0.3582768738269806, + "learning_rate": 0.00032187463556851306, + "loss": 3.3417, + "step": 31900 + }, + { + "epoch": 9.30138023411566, + "grad_norm": 0.3462630808353424, + "learning_rate": 0.0003214373177842565, + "loss": 3.3388, + "step": 31950 + }, + { + "epoch": 9.315939665715451, + "grad_norm": 0.35994312167167664, + "learning_rate": 0.000321, + "loss": 3.3417, + "step": 32000 + }, + { + "epoch": 9.315939665715451, + "eval_accuracy": 0.3688413280713799, + "eval_loss": 3.5689337253570557, + "eval_runtime": 181.1092, + "eval_samples_per_second": 91.895, + "eval_steps_per_second": 5.748, + "step": 32000 + }, + { + "epoch": 9.33049909731524, + "grad_norm": 0.34968388080596924, + "learning_rate": 0.0003205626822157434, + "loss": 3.3376, + "step": 32050 + }, + { + "epoch": 9.34505852891503, + "grad_norm": 0.35291755199432373, + "learning_rate": 0.0003201253644314868, + "loss": 3.3412, + "step": 32100 + }, + { + "epoch": 9.359617960514822, + "grad_norm": 0.3643549978733063, + "learning_rate": 0.0003196880466472303, + "loss": 3.3411, + "step": 32150 + }, + { + "epoch": 9.374177392114612, + "grad_norm": 0.3537770211696625, + "learning_rate": 0.00031925072886297375, + "loss": 3.3642, + "step": 32200 + }, + { + "epoch": 9.388736823714403, + "grad_norm": 0.3553234338760376, + "learning_rate": 0.00031881341107871717, + "loss": 3.353, + "step": 32250 + }, + { + "epoch": 9.403296255314192, + "grad_norm": 0.35173216462135315, + "learning_rate": 0.00031837609329446063, + "loss": 3.3433, + "step": 32300 + }, + { + "epoch": 9.417855686913983, + "grad_norm": 0.3561984598636627, + "learning_rate": 0.00031793877551020405, + "loss": 3.3459, + "step": 32350 + }, + { + "epoch": 9.432415118513774, + "grad_norm": 0.3734908699989319, + "learning_rate": 0.00031750145772594746, + "loss": 3.3495, + "step": 32400 + }, + { + "epoch": 9.446974550113563, + "grad_norm": 0.3848966658115387, + "learning_rate": 0.000317064139941691, + "loss": 3.3486, + "step": 32450 + }, + { + "epoch": 9.461533981713353, + "grad_norm": 0.36939284205436707, + "learning_rate": 0.0003166268221574344, + "loss": 3.3527, + "step": 32500 + }, + { + "epoch": 9.476093413313144, + "grad_norm": 0.3429546654224396, + "learning_rate": 0.0003161895043731778, + "loss": 3.3402, + "step": 32550 + }, + { + "epoch": 9.490652844912935, + "grad_norm": 0.34233972430229187, + "learning_rate": 0.0003157521865889212, + "loss": 3.3508, + "step": 32600 + }, + { + "epoch": 9.505212276512726, + "grad_norm": 0.3572950065135956, + "learning_rate": 0.00031531486880466474, + "loss": 3.3599, + "step": 32650 + }, + { + "epoch": 9.519771708112515, + "grad_norm": 0.34846094250679016, + "learning_rate": 0.00031487755102040815, + "loss": 3.3585, + "step": 32700 + }, + { + "epoch": 9.534331139712306, + "grad_norm": 0.3666765093803406, + "learning_rate": 0.00031444023323615156, + "loss": 3.3535, + "step": 32750 + }, + { + "epoch": 9.548890571312096, + "grad_norm": 0.3483474850654602, + "learning_rate": 0.000314002915451895, + "loss": 3.3492, + "step": 32800 + }, + { + "epoch": 9.563450002911885, + "grad_norm": 0.3478499948978424, + "learning_rate": 0.00031356559766763844, + "loss": 3.3664, + "step": 32850 + }, + { + "epoch": 9.578009434511676, + "grad_norm": 0.3615437150001526, + "learning_rate": 0.0003131282798833819, + "loss": 3.3739, + "step": 32900 + }, + { + "epoch": 9.592568866111467, + "grad_norm": 0.35250964760780334, + "learning_rate": 0.0003126909620991253, + "loss": 3.3745, + "step": 32950 + }, + { + "epoch": 9.607128297711258, + "grad_norm": 0.35164180397987366, + "learning_rate": 0.0003122536443148688, + "loss": 3.3562, + "step": 33000 + }, + { + "epoch": 9.607128297711258, + "eval_accuracy": 0.3692384085597243, + "eval_loss": 3.5602471828460693, + "eval_runtime": 180.7731, + "eval_samples_per_second": 92.066, + "eval_steps_per_second": 5.759, + "step": 33000 + }, + { + "epoch": 9.621687729311049, + "grad_norm": 0.37672215700149536, + "learning_rate": 0.0003118163265306122, + "loss": 3.3735, + "step": 33050 + }, + { + "epoch": 9.636247160910838, + "grad_norm": 0.36633849143981934, + "learning_rate": 0.00031137900874635566, + "loss": 3.3498, + "step": 33100 + }, + { + "epoch": 9.650806592510628, + "grad_norm": 0.3514011800289154, + "learning_rate": 0.00031094169096209913, + "loss": 3.3604, + "step": 33150 + }, + { + "epoch": 9.66536602411042, + "grad_norm": 0.35586225986480713, + "learning_rate": 0.00031050437317784254, + "loss": 3.3574, + "step": 33200 + }, + { + "epoch": 9.67992545571021, + "grad_norm": 0.33317190408706665, + "learning_rate": 0.00031006705539358596, + "loss": 3.3546, + "step": 33250 + }, + { + "epoch": 9.694484887309999, + "grad_norm": 0.35271352529525757, + "learning_rate": 0.00030962973760932937, + "loss": 3.3632, + "step": 33300 + }, + { + "epoch": 9.70904431890979, + "grad_norm": 0.3521358370780945, + "learning_rate": 0.0003091924198250729, + "loss": 3.3584, + "step": 33350 + }, + { + "epoch": 9.72360375050958, + "grad_norm": 0.3574683666229248, + "learning_rate": 0.0003087551020408163, + "loss": 3.3581, + "step": 33400 + }, + { + "epoch": 9.738163182109371, + "grad_norm": 0.3643791377544403, + "learning_rate": 0.0003083177842565597, + "loss": 3.3691, + "step": 33450 + }, + { + "epoch": 9.75272261370916, + "grad_norm": 0.35385361313819885, + "learning_rate": 0.0003078804664723032, + "loss": 3.3547, + "step": 33500 + }, + { + "epoch": 9.767282045308951, + "grad_norm": 0.35955286026000977, + "learning_rate": 0.00030744314868804665, + "loss": 3.3496, + "step": 33550 + }, + { + "epoch": 9.781841476908742, + "grad_norm": 0.3493342697620392, + "learning_rate": 0.00030700583090379006, + "loss": 3.3629, + "step": 33600 + }, + { + "epoch": 9.796400908508533, + "grad_norm": 0.3883078396320343, + "learning_rate": 0.00030656851311953347, + "loss": 3.3643, + "step": 33650 + }, + { + "epoch": 9.810960340108322, + "grad_norm": 0.34926533699035645, + "learning_rate": 0.00030613119533527694, + "loss": 3.3662, + "step": 33700 + }, + { + "epoch": 9.825519771708112, + "grad_norm": 0.37770354747772217, + "learning_rate": 0.00030569387755102035, + "loss": 3.3813, + "step": 33750 + }, + { + "epoch": 9.840079203307903, + "grad_norm": 0.3666662275791168, + "learning_rate": 0.0003052565597667638, + "loss": 3.3669, + "step": 33800 + }, + { + "epoch": 9.854638634907694, + "grad_norm": 0.3690825402736664, + "learning_rate": 0.0003048192419825073, + "loss": 3.3718, + "step": 33850 + }, + { + "epoch": 9.869198066507483, + "grad_norm": 0.3668816387653351, + "learning_rate": 0.0003043819241982507, + "loss": 3.3572, + "step": 33900 + }, + { + "epoch": 9.883757498107274, + "grad_norm": 0.35026848316192627, + "learning_rate": 0.0003039446064139941, + "loss": 3.3714, + "step": 33950 + }, + { + "epoch": 9.898316929707065, + "grad_norm": 0.36591610312461853, + "learning_rate": 0.00030350728862973763, + "loss": 3.3759, + "step": 34000 + }, + { + "epoch": 9.898316929707065, + "eval_accuracy": 0.36981339333556196, + "eval_loss": 3.553095817565918, + "eval_runtime": 180.8581, + "eval_samples_per_second": 92.022, + "eval_steps_per_second": 5.756, + "step": 34000 + }, + { + "epoch": 9.912876361306855, + "grad_norm": 0.3765810430049896, + "learning_rate": 0.00030306997084548104, + "loss": 3.363, + "step": 34050 + }, + { + "epoch": 9.927435792906644, + "grad_norm": 0.3594549000263214, + "learning_rate": 0.00030263265306122445, + "loss": 3.3671, + "step": 34100 + }, + { + "epoch": 9.941995224506435, + "grad_norm": 0.35946381092071533, + "learning_rate": 0.00030219533527696787, + "loss": 3.3735, + "step": 34150 + }, + { + "epoch": 9.956554656106226, + "grad_norm": 0.37179645895957947, + "learning_rate": 0.00030175801749271133, + "loss": 3.3874, + "step": 34200 + }, + { + "epoch": 9.971114087706017, + "grad_norm": 0.36117124557495117, + "learning_rate": 0.0003013206997084548, + "loss": 3.3806, + "step": 34250 + }, + { + "epoch": 9.985673519305806, + "grad_norm": 0.34759020805358887, + "learning_rate": 0.0003008833819241982, + "loss": 3.3681, + "step": 34300 + }, + { + "epoch": 10.0, + "grad_norm": Infinity, + "learning_rate": 0.0003004460641399417, + "loss": 3.3643, + "step": 34350 + }, + { + "epoch": 10.01455943159979, + "grad_norm": 0.35527414083480835, + "learning_rate": 0.0003000087463556851, + "loss": 3.2614, + "step": 34400 + }, + { + "epoch": 10.029118863199582, + "grad_norm": 0.3797459304332733, + "learning_rate": 0.00029957142857142856, + "loss": 3.269, + "step": 34450 + }, + { + "epoch": 10.04367829479937, + "grad_norm": 0.36752596497535706, + "learning_rate": 0.000299134110787172, + "loss": 3.2838, + "step": 34500 + }, + { + "epoch": 10.058237726399161, + "grad_norm": 0.34516459703445435, + "learning_rate": 0.00029869679300291544, + "loss": 3.272, + "step": 34550 + }, + { + "epoch": 10.072797157998952, + "grad_norm": 0.3728445768356323, + "learning_rate": 0.00029825947521865885, + "loss": 3.2696, + "step": 34600 + }, + { + "epoch": 10.087356589598743, + "grad_norm": 0.3747389018535614, + "learning_rate": 0.0002978221574344023, + "loss": 3.2888, + "step": 34650 + }, + { + "epoch": 10.101916021198532, + "grad_norm": 0.34447789192199707, + "learning_rate": 0.00029738483965014573, + "loss": 3.2916, + "step": 34700 + }, + { + "epoch": 10.116475452798323, + "grad_norm": 0.35870856046676636, + "learning_rate": 0.0002969475218658892, + "loss": 3.2913, + "step": 34750 + }, + { + "epoch": 10.131034884398114, + "grad_norm": 0.35672426223754883, + "learning_rate": 0.0002965102040816326, + "loss": 3.2963, + "step": 34800 + }, + { + "epoch": 10.145594315997904, + "grad_norm": 0.36722877621650696, + "learning_rate": 0.0002960728862973761, + "loss": 3.2886, + "step": 34850 + }, + { + "epoch": 10.160153747597693, + "grad_norm": 0.3597167432308197, + "learning_rate": 0.0002956355685131195, + "loss": 3.3118, + "step": 34900 + }, + { + "epoch": 10.174713179197484, + "grad_norm": 0.3561251759529114, + "learning_rate": 0.00029519825072886295, + "loss": 3.2997, + "step": 34950 + }, + { + "epoch": 10.189272610797275, + "grad_norm": 0.37824273109436035, + "learning_rate": 0.00029476093294460637, + "loss": 3.3017, + "step": 35000 + }, + { + "epoch": 10.189272610797275, + "eval_accuracy": 0.3693485845791435, + "eval_loss": 3.5645644664764404, + "eval_runtime": 180.1256, + "eval_samples_per_second": 92.397, + "eval_steps_per_second": 5.779, + "step": 35000 + }, + { + "epoch": 10.203832042397066, + "grad_norm": 0.37217044830322266, + "learning_rate": 0.00029432361516034983, + "loss": 3.3035, + "step": 35050 + }, + { + "epoch": 10.218391473996855, + "grad_norm": 0.3471571207046509, + "learning_rate": 0.0002938862973760933, + "loss": 3.3116, + "step": 35100 + }, + { + "epoch": 10.232950905596645, + "grad_norm": 0.3539142310619354, + "learning_rate": 0.0002934489795918367, + "loss": 3.296, + "step": 35150 + }, + { + "epoch": 10.247510337196436, + "grad_norm": 0.36773473024368286, + "learning_rate": 0.0002930116618075802, + "loss": 3.3028, + "step": 35200 + }, + { + "epoch": 10.262069768796227, + "grad_norm": 0.3689476549625397, + "learning_rate": 0.0002925743440233236, + "loss": 3.3111, + "step": 35250 + }, + { + "epoch": 10.276629200396016, + "grad_norm": 0.3640798032283783, + "learning_rate": 0.00029213702623906706, + "loss": 3.3157, + "step": 35300 + }, + { + "epoch": 10.291188631995807, + "grad_norm": 0.3602818250656128, + "learning_rate": 0.00029169970845481047, + "loss": 3.3138, + "step": 35350 + }, + { + "epoch": 10.305748063595598, + "grad_norm": 0.38390350341796875, + "learning_rate": 0.00029126239067055394, + "loss": 3.3077, + "step": 35400 + }, + { + "epoch": 10.320307495195388, + "grad_norm": 0.36689597368240356, + "learning_rate": 0.00029082507288629735, + "loss": 3.309, + "step": 35450 + }, + { + "epoch": 10.334866926795177, + "grad_norm": 0.3611031770706177, + "learning_rate": 0.00029038775510204076, + "loss": 3.3119, + "step": 35500 + }, + { + "epoch": 10.349426358394968, + "grad_norm": 0.36774659156799316, + "learning_rate": 0.0002899504373177842, + "loss": 3.3189, + "step": 35550 + }, + { + "epoch": 10.363985789994759, + "grad_norm": 0.36395514011383057, + "learning_rate": 0.00028951311953352764, + "loss": 3.3109, + "step": 35600 + }, + { + "epoch": 10.37854522159455, + "grad_norm": 0.362166166305542, + "learning_rate": 0.0002890758017492711, + "loss": 3.3192, + "step": 35650 + }, + { + "epoch": 10.393104653194339, + "grad_norm": 0.3618522882461548, + "learning_rate": 0.0002886384839650145, + "loss": 3.3183, + "step": 35700 + }, + { + "epoch": 10.40766408479413, + "grad_norm": 0.3681625723838806, + "learning_rate": 0.000288201166180758, + "loss": 3.318, + "step": 35750 + }, + { + "epoch": 10.42222351639392, + "grad_norm": 0.3899301588535309, + "learning_rate": 0.00028776384839650145, + "loss": 3.3182, + "step": 35800 + }, + { + "epoch": 10.436782947993711, + "grad_norm": 0.35318905115127563, + "learning_rate": 0.00028732653061224486, + "loss": 3.3284, + "step": 35850 + }, + { + "epoch": 10.4513423795935, + "grad_norm": 0.38061952590942383, + "learning_rate": 0.00028688921282798833, + "loss": 3.3173, + "step": 35900 + }, + { + "epoch": 10.46590181119329, + "grad_norm": 0.3645211160182953, + "learning_rate": 0.00028645189504373174, + "loss": 3.3272, + "step": 35950 + }, + { + "epoch": 10.480461242793082, + "grad_norm": 0.36433538794517517, + "learning_rate": 0.0002860145772594752, + "loss": 3.3164, + "step": 36000 + }, + { + "epoch": 10.480461242793082, + "eval_accuracy": 0.37010747041621017, + "eval_loss": 3.556314706802368, + "eval_runtime": 180.1676, + "eval_samples_per_second": 92.375, + "eval_steps_per_second": 5.778, + "step": 36000 + }, + { + "epoch": 10.495020674392872, + "grad_norm": 0.36834511160850525, + "learning_rate": 0.0002855772594752186, + "loss": 3.3291, + "step": 36050 + }, + { + "epoch": 10.509580105992661, + "grad_norm": 0.3711186945438385, + "learning_rate": 0.0002851399416909621, + "loss": 3.3161, + "step": 36100 + }, + { + "epoch": 10.524139537592452, + "grad_norm": 0.356585294008255, + "learning_rate": 0.0002847026239067055, + "loss": 3.3329, + "step": 36150 + }, + { + "epoch": 10.538698969192243, + "grad_norm": 0.36765870451927185, + "learning_rate": 0.00028426530612244897, + "loss": 3.3352, + "step": 36200 + }, + { + "epoch": 10.553258400792034, + "grad_norm": 0.3481246531009674, + "learning_rate": 0.0002838279883381924, + "loss": 3.3271, + "step": 36250 + }, + { + "epoch": 10.567817832391823, + "grad_norm": 0.35420429706573486, + "learning_rate": 0.00028339067055393585, + "loss": 3.3268, + "step": 36300 + }, + { + "epoch": 10.582377263991614, + "grad_norm": 0.3609519302845001, + "learning_rate": 0.00028295335276967926, + "loss": 3.3295, + "step": 36350 + }, + { + "epoch": 10.596936695591404, + "grad_norm": 0.3677191138267517, + "learning_rate": 0.0002825160349854227, + "loss": 3.3333, + "step": 36400 + }, + { + "epoch": 10.611496127191195, + "grad_norm": 0.37628525495529175, + "learning_rate": 0.00028207871720116614, + "loss": 3.3399, + "step": 36450 + }, + { + "epoch": 10.626055558790984, + "grad_norm": 0.3637225925922394, + "learning_rate": 0.0002816413994169096, + "loss": 3.3349, + "step": 36500 + }, + { + "epoch": 10.640614990390775, + "grad_norm": 0.3519335091114044, + "learning_rate": 0.00028120408163265307, + "loss": 3.3346, + "step": 36550 + }, + { + "epoch": 10.655174421990566, + "grad_norm": 0.348203182220459, + "learning_rate": 0.0002807667638483965, + "loss": 3.3289, + "step": 36600 + }, + { + "epoch": 10.669733853590357, + "grad_norm": 0.36233091354370117, + "learning_rate": 0.00028032944606413995, + "loss": 3.3264, + "step": 36650 + }, + { + "epoch": 10.684293285190146, + "grad_norm": 0.3718380630016327, + "learning_rate": 0.00027989212827988336, + "loss": 3.3317, + "step": 36700 + }, + { + "epoch": 10.698852716789936, + "grad_norm": 0.35991501808166504, + "learning_rate": 0.00027945481049562683, + "loss": 3.3365, + "step": 36750 + }, + { + "epoch": 10.713412148389727, + "grad_norm": 0.37417152523994446, + "learning_rate": 0.00027901749271137024, + "loss": 3.3266, + "step": 36800 + }, + { + "epoch": 10.727971579989518, + "grad_norm": 0.3618806004524231, + "learning_rate": 0.00027858017492711365, + "loss": 3.3338, + "step": 36850 + }, + { + "epoch": 10.742531011589307, + "grad_norm": 0.3808761239051819, + "learning_rate": 0.0002781428571428571, + "loss": 3.3322, + "step": 36900 + }, + { + "epoch": 10.757090443189098, + "grad_norm": 0.35829290747642517, + "learning_rate": 0.00027770553935860053, + "loss": 3.3405, + "step": 36950 + }, + { + "epoch": 10.771649874788888, + "grad_norm": 0.35556626319885254, + "learning_rate": 0.000277268221574344, + "loss": 3.3349, + "step": 37000 + }, + { + "epoch": 10.771649874788888, + "eval_accuracy": 0.3708527165326231, + "eval_loss": 3.548063039779663, + "eval_runtime": 180.2516, + "eval_samples_per_second": 92.332, + "eval_steps_per_second": 5.775, + "step": 37000 + }, + { + "epoch": 10.78620930638868, + "grad_norm": 0.36781635880470276, + "learning_rate": 0.0002768309037900874, + "loss": 3.3393, + "step": 37050 + }, + { + "epoch": 10.800768737988468, + "grad_norm": 0.3739968538284302, + "learning_rate": 0.0002763935860058309, + "loss": 3.3362, + "step": 37100 + }, + { + "epoch": 10.815328169588259, + "grad_norm": 0.37725409865379333, + "learning_rate": 0.0002759562682215743, + "loss": 3.343, + "step": 37150 + }, + { + "epoch": 10.82988760118805, + "grad_norm": 0.3467895984649658, + "learning_rate": 0.00027551895043731776, + "loss": 3.3424, + "step": 37200 + }, + { + "epoch": 10.84444703278784, + "grad_norm": 0.3589009940624237, + "learning_rate": 0.0002750816326530612, + "loss": 3.3149, + "step": 37250 + }, + { + "epoch": 10.85900646438763, + "grad_norm": 0.36413517594337463, + "learning_rate": 0.00027464431486880464, + "loss": 3.3448, + "step": 37300 + }, + { + "epoch": 10.87356589598742, + "grad_norm": 0.3594954311847687, + "learning_rate": 0.0002742069970845481, + "loss": 3.3452, + "step": 37350 + }, + { + "epoch": 10.888125327587211, + "grad_norm": 0.36977705359458923, + "learning_rate": 0.0002737696793002915, + "loss": 3.3387, + "step": 37400 + }, + { + "epoch": 10.902684759187002, + "grad_norm": 0.3728332817554474, + "learning_rate": 0.000273332361516035, + "loss": 3.3554, + "step": 37450 + }, + { + "epoch": 10.917244190786791, + "grad_norm": 0.3603312075138092, + "learning_rate": 0.0002728950437317784, + "loss": 3.3495, + "step": 37500 + }, + { + "epoch": 10.931803622386582, + "grad_norm": 0.37357112765312195, + "learning_rate": 0.00027245772594752186, + "loss": 3.3509, + "step": 37550 + }, + { + "epoch": 10.946363053986373, + "grad_norm": 0.3870396316051483, + "learning_rate": 0.00027202040816326527, + "loss": 3.3451, + "step": 37600 + }, + { + "epoch": 10.960922485586163, + "grad_norm": 0.36924847960472107, + "learning_rate": 0.00027158309037900874, + "loss": 3.3482, + "step": 37650 + }, + { + "epoch": 10.975481917185952, + "grad_norm": 0.3659966289997101, + "learning_rate": 0.00027114577259475215, + "loss": 3.3429, + "step": 37700 + }, + { + "epoch": 10.990041348785743, + "grad_norm": 0.3750581741333008, + "learning_rate": 0.00027070845481049556, + "loss": 3.3467, + "step": 37750 + }, + { + "epoch": 11.004367829479937, + "grad_norm": 0.3540584444999695, + "learning_rate": 0.00027027113702623903, + "loss": 3.3111, + "step": 37800 + }, + { + "epoch": 11.018927261079728, + "grad_norm": 0.36422842741012573, + "learning_rate": 0.0002698338192419825, + "loss": 3.2475, + "step": 37850 + }, + { + "epoch": 11.033486692679517, + "grad_norm": 0.36595383286476135, + "learning_rate": 0.0002693965014577259, + "loss": 3.2508, + "step": 37900 + }, + { + "epoch": 11.048046124279308, + "grad_norm": 0.3714156448841095, + "learning_rate": 0.0002689591836734694, + "loss": 3.2548, + "step": 37950 + }, + { + "epoch": 11.062605555879099, + "grad_norm": 0.38618841767311096, + "learning_rate": 0.00026852186588921284, + "loss": 3.2621, + "step": 38000 + }, + { + "epoch": 11.062605555879099, + "eval_accuracy": 0.370781343166788, + "eval_loss": 3.5563323497772217, + "eval_runtime": 179.9746, + "eval_samples_per_second": 92.474, + "eval_steps_per_second": 5.784, + "step": 38000 + }, + { + "epoch": 11.07716498747889, + "grad_norm": 0.3659396469593048, + "learning_rate": 0.00026808454810495625, + "loss": 3.2429, + "step": 38050 + }, + { + "epoch": 11.091724419078679, + "grad_norm": 0.3699627220630646, + "learning_rate": 0.0002676472303206997, + "loss": 3.2574, + "step": 38100 + }, + { + "epoch": 11.10628385067847, + "grad_norm": 0.371509313583374, + "learning_rate": 0.00026720991253644313, + "loss": 3.2581, + "step": 38150 + }, + { + "epoch": 11.12084328227826, + "grad_norm": 0.3545081317424774, + "learning_rate": 0.00026677259475218655, + "loss": 3.2631, + "step": 38200 + }, + { + "epoch": 11.135402713878051, + "grad_norm": 0.36414968967437744, + "learning_rate": 0.00026633527696793, + "loss": 3.2715, + "step": 38250 + }, + { + "epoch": 11.14996214547784, + "grad_norm": 0.36221858859062195, + "learning_rate": 0.0002658979591836734, + "loss": 3.2702, + "step": 38300 + }, + { + "epoch": 11.16452157707763, + "grad_norm": 0.35454094409942627, + "learning_rate": 0.0002654606413994169, + "loss": 3.2636, + "step": 38350 + }, + { + "epoch": 11.179081008677421, + "grad_norm": 0.38314637541770935, + "learning_rate": 0.0002650233236151603, + "loss": 3.2647, + "step": 38400 + }, + { + "epoch": 11.193640440277212, + "grad_norm": 0.36567234992980957, + "learning_rate": 0.00026458600583090377, + "loss": 3.2764, + "step": 38450 + }, + { + "epoch": 11.208199871877001, + "grad_norm": 0.36688846349716187, + "learning_rate": 0.0002641486880466472, + "loss": 3.2722, + "step": 38500 + }, + { + "epoch": 11.222759303476792, + "grad_norm": 0.38438311219215393, + "learning_rate": 0.00026371137026239065, + "loss": 3.2675, + "step": 38550 + }, + { + "epoch": 11.237318735076583, + "grad_norm": 0.3896602392196655, + "learning_rate": 0.0002632740524781341, + "loss": 3.2684, + "step": 38600 + }, + { + "epoch": 11.251878166676374, + "grad_norm": 0.3787361681461334, + "learning_rate": 0.00026283673469387753, + "loss": 3.2774, + "step": 38650 + }, + { + "epoch": 11.266437598276163, + "grad_norm": 0.36523544788360596, + "learning_rate": 0.000262399416909621, + "loss": 3.2731, + "step": 38700 + }, + { + "epoch": 11.280997029875953, + "grad_norm": 0.38595035672187805, + "learning_rate": 0.0002619620991253644, + "loss": 3.2802, + "step": 38750 + }, + { + "epoch": 11.295556461475744, + "grad_norm": 0.3597980737686157, + "learning_rate": 0.0002615247813411079, + "loss": 3.2836, + "step": 38800 + }, + { + "epoch": 11.310115893075535, + "grad_norm": 0.375411719083786, + "learning_rate": 0.0002610874635568513, + "loss": 3.2809, + "step": 38850 + }, + { + "epoch": 11.324675324675324, + "grad_norm": 0.3683791756629944, + "learning_rate": 0.00026065014577259475, + "loss": 3.2832, + "step": 38900 + }, + { + "epoch": 11.339234756275115, + "grad_norm": 0.36232179403305054, + "learning_rate": 0.00026021282798833817, + "loss": 3.279, + "step": 38950 + }, + { + "epoch": 11.353794187874906, + "grad_norm": 0.3584194779396057, + "learning_rate": 0.0002597755102040816, + "loss": 3.2832, + "step": 39000 + }, + { + "epoch": 11.353794187874906, + "eval_accuracy": 0.37091233151858416, + "eval_loss": 3.553581953048706, + "eval_runtime": 179.822, + "eval_samples_per_second": 92.553, + "eval_steps_per_second": 5.789, + "step": 39000 + }, + { + "epoch": 11.368353619474696, + "grad_norm": 0.36148691177368164, + "learning_rate": 0.00025933819241982504, + "loss": 3.2926, + "step": 39050 + }, + { + "epoch": 11.382913051074485, + "grad_norm": 0.36825209856033325, + "learning_rate": 0.00025890087463556846, + "loss": 3.2974, + "step": 39100 + }, + { + "epoch": 11.397472482674276, + "grad_norm": 0.3690287470817566, + "learning_rate": 0.0002584635568513119, + "loss": 3.285, + "step": 39150 + }, + { + "epoch": 11.412031914274067, + "grad_norm": 0.37193694710731506, + "learning_rate": 0.00025802623906705534, + "loss": 3.3057, + "step": 39200 + }, + { + "epoch": 11.426591345873858, + "grad_norm": 0.3798997700214386, + "learning_rate": 0.0002575889212827988, + "loss": 3.2901, + "step": 39250 + }, + { + "epoch": 11.441150777473647, + "grad_norm": 0.3867810368537903, + "learning_rate": 0.00025715160349854227, + "loss": 3.2994, + "step": 39300 + }, + { + "epoch": 11.455710209073438, + "grad_norm": 0.3750901520252228, + "learning_rate": 0.0002567142857142857, + "loss": 3.2932, + "step": 39350 + }, + { + "epoch": 11.470269640673228, + "grad_norm": 0.35880762338638306, + "learning_rate": 0.00025627696793002915, + "loss": 3.2873, + "step": 39400 + }, + { + "epoch": 11.484829072273019, + "grad_norm": 0.3917964994907379, + "learning_rate": 0.00025583965014577256, + "loss": 3.2952, + "step": 39450 + }, + { + "epoch": 11.499388503872808, + "grad_norm": 0.3772904574871063, + "learning_rate": 0.000255402332361516, + "loss": 3.3044, + "step": 39500 + }, + { + "epoch": 11.513947935472599, + "grad_norm": 0.3691461980342865, + "learning_rate": 0.00025496501457725944, + "loss": 3.3156, + "step": 39550 + }, + { + "epoch": 11.52850736707239, + "grad_norm": 0.36424410343170166, + "learning_rate": 0.0002545276967930029, + "loss": 3.3019, + "step": 39600 + }, + { + "epoch": 11.54306679867218, + "grad_norm": 0.3689974844455719, + "learning_rate": 0.0002540903790087463, + "loss": 3.3016, + "step": 39650 + }, + { + "epoch": 11.55762623027197, + "grad_norm": 0.38458317518234253, + "learning_rate": 0.0002536530612244898, + "loss": 3.3063, + "step": 39700 + }, + { + "epoch": 11.57218566187176, + "grad_norm": 0.3871372640132904, + "learning_rate": 0.0002532157434402332, + "loss": 3.3034, + "step": 39750 + }, + { + "epoch": 11.586745093471551, + "grad_norm": 0.3936833143234253, + "learning_rate": 0.00025277842565597666, + "loss": 3.3101, + "step": 39800 + }, + { + "epoch": 11.601304525071342, + "grad_norm": 0.3917473256587982, + "learning_rate": 0.0002523411078717201, + "loss": 3.3225, + "step": 39850 + }, + { + "epoch": 11.61586395667113, + "grad_norm": 0.3640928864479065, + "learning_rate": 0.00025190379008746354, + "loss": 3.304, + "step": 39900 + }, + { + "epoch": 11.630423388270922, + "grad_norm": 0.4092429578304291, + "learning_rate": 0.00025146647230320696, + "loss": 3.3123, + "step": 39950 + }, + { + "epoch": 11.644982819870712, + "grad_norm": 0.3751949071884155, + "learning_rate": 0.0002510291545189504, + "loss": 3.3111, + "step": 40000 + }, + { + "epoch": 11.644982819870712, + "eval_accuracy": 0.371191240289195, + "eval_loss": 3.5466434955596924, + "eval_runtime": 179.6345, + "eval_samples_per_second": 92.649, + "eval_steps_per_second": 5.795, + "step": 40000 + } + ], + "logging_steps": 50, + "max_steps": 68700, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.35916062261248e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}