diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12796 @@ +{ + "best_global_step": 1800, + "best_metric": 0.74, + "best_model_checkpoint": "/mnt/parscratch/users/acr24wz/etu/topcon/qwen3_4B/cpt_model/balanced/finetuned/all/checkpoint-1800", + "epoch": 2.0642662458757712, + "eval_steps": 100, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011476115334959117, + "grad_norm": 201.0, + "learning_rate": 0.0, + "loss": 18.5701, + "step": 1 + }, + { + "epoch": 0.0022952230669918234, + "grad_norm": 1392.0, + "learning_rate": 4.587155963302753e-08, + "loss": 12.0441, + "step": 2 + }, + { + "epoch": 0.0034428346004877347, + "grad_norm": 161.0, + "learning_rate": 9.174311926605506e-08, + "loss": 14.3223, + "step": 3 + }, + { + "epoch": 0.004590446133983647, + "grad_norm": 150.0, + "learning_rate": 1.376146788990826e-07, + "loss": 10.3759, + "step": 4 + }, + { + "epoch": 0.005738057667479558, + "grad_norm": 141.0, + "learning_rate": 1.8348623853211012e-07, + "loss": 11.4624, + "step": 5 + }, + { + "epoch": 0.006885669200975469, + "grad_norm": 159.0, + "learning_rate": 2.2935779816513764e-07, + "loss": 10.5127, + "step": 6 + }, + { + "epoch": 0.008033280734471382, + "grad_norm": 166.0, + "learning_rate": 2.752293577981652e-07, + "loss": 15.7339, + "step": 7 + }, + { + "epoch": 0.009180892267967294, + "grad_norm": 310.0, + "learning_rate": 3.211009174311927e-07, + "loss": 21.2237, + "step": 8 + }, + { + "epoch": 0.010328503801463204, + "grad_norm": 153.0, + "learning_rate": 3.6697247706422023e-07, + "loss": 11.4438, + "step": 9 + }, + { + "epoch": 0.011476115334959116, + "grad_norm": 146.0, + "learning_rate": 4.128440366972478e-07, + "loss": 14.27, + "step": 10 + }, + { + "epoch": 0.012623726868455028, + "grad_norm": 185.0, + "learning_rate": 4.587155963302753e-07, + "loss": 17.4331, + "step": 11 + }, + { + "epoch": 0.013771338401950939, + "grad_norm": 157.0, + "learning_rate": 5.045871559633028e-07, + "loss": 16.0972, + "step": 12 + }, + { + "epoch": 0.014918949935446851, + "grad_norm": 146.0, + "learning_rate": 5.504587155963304e-07, + "loss": 10.9198, + "step": 13 + }, + { + "epoch": 0.016066561468942763, + "grad_norm": 194.0, + "learning_rate": 5.963302752293579e-07, + "loss": 14.1635, + "step": 14 + }, + { + "epoch": 0.017214173002438674, + "grad_norm": 162.0, + "learning_rate": 6.422018348623854e-07, + "loss": 11.853, + "step": 15 + }, + { + "epoch": 0.018361784535934587, + "grad_norm": 160.0, + "learning_rate": 6.880733944954129e-07, + "loss": 12.7435, + "step": 16 + }, + { + "epoch": 0.019509396069430498, + "grad_norm": 158.0, + "learning_rate": 7.339449541284405e-07, + "loss": 11.8396, + "step": 17 + }, + { + "epoch": 0.020657007602926408, + "grad_norm": 165.0, + "learning_rate": 7.79816513761468e-07, + "loss": 11.5206, + "step": 18 + }, + { + "epoch": 0.021804619136422322, + "grad_norm": 177.0, + "learning_rate": 8.256880733944956e-07, + "loss": 11.5111, + "step": 19 + }, + { + "epoch": 0.022952230669918233, + "grad_norm": 157.0, + "learning_rate": 8.71559633027523e-07, + "loss": 14.7131, + "step": 20 + }, + { + "epoch": 0.024099842203414143, + "grad_norm": 172.0, + "learning_rate": 9.174311926605506e-07, + "loss": 11.1313, + "step": 21 + }, + { + "epoch": 0.025247453736910057, + "grad_norm": 194.0, + "learning_rate": 9.633027522935782e-07, + "loss": 16.6221, + "step": 22 + }, + { + "epoch": 0.026395065270405967, + "grad_norm": 268.0, + "learning_rate": 1.0091743119266057e-06, + "loss": 15.6201, + "step": 23 + }, + { + "epoch": 0.027542676803901878, + "grad_norm": 218.0, + "learning_rate": 1.055045871559633e-06, + "loss": 11.6275, + "step": 24 + }, + { + "epoch": 0.02869028833739779, + "grad_norm": 158.0, + "learning_rate": 1.1009174311926608e-06, + "loss": 15.5678, + "step": 25 + }, + { + "epoch": 0.029837899870893702, + "grad_norm": 236.0, + "learning_rate": 1.1467889908256882e-06, + "loss": 15.4133, + "step": 26 + }, + { + "epoch": 0.030985511404389616, + "grad_norm": 131.0, + "learning_rate": 1.1926605504587159e-06, + "loss": 12.6406, + "step": 27 + }, + { + "epoch": 0.032133122937885526, + "grad_norm": 139.0, + "learning_rate": 1.2385321100917433e-06, + "loss": 15.0131, + "step": 28 + }, + { + "epoch": 0.03328073447138144, + "grad_norm": 150.0, + "learning_rate": 1.2844036697247707e-06, + "loss": 13.4583, + "step": 29 + }, + { + "epoch": 0.03442834600487735, + "grad_norm": 166.0, + "learning_rate": 1.3302752293577984e-06, + "loss": 15.6894, + "step": 30 + }, + { + "epoch": 0.03557595753837326, + "grad_norm": 168.0, + "learning_rate": 1.3761467889908258e-06, + "loss": 13.8435, + "step": 31 + }, + { + "epoch": 0.036723569071869175, + "grad_norm": 191.0, + "learning_rate": 1.4220183486238535e-06, + "loss": 8.6607, + "step": 32 + }, + { + "epoch": 0.03787118060536508, + "grad_norm": 140.0, + "learning_rate": 1.467889908256881e-06, + "loss": 12.4132, + "step": 33 + }, + { + "epoch": 0.039018792138860996, + "grad_norm": 201.0, + "learning_rate": 1.5137614678899084e-06, + "loss": 13.8843, + "step": 34 + }, + { + "epoch": 0.04016640367235691, + "grad_norm": 164.0, + "learning_rate": 1.559633027522936e-06, + "loss": 15.1008, + "step": 35 + }, + { + "epoch": 0.041314015205852817, + "grad_norm": 284.0, + "learning_rate": 1.6055045871559635e-06, + "loss": 21.6636, + "step": 36 + }, + { + "epoch": 0.04246162673934873, + "grad_norm": 394.0, + "learning_rate": 1.6513761467889911e-06, + "loss": 14.0359, + "step": 37 + }, + { + "epoch": 0.043609238272844644, + "grad_norm": 161.0, + "learning_rate": 1.6972477064220186e-06, + "loss": 13.4677, + "step": 38 + }, + { + "epoch": 0.04475684980634055, + "grad_norm": 164.0, + "learning_rate": 1.743119266055046e-06, + "loss": 8.3447, + "step": 39 + }, + { + "epoch": 0.045904461339836465, + "grad_norm": 148.0, + "learning_rate": 1.7889908256880737e-06, + "loss": 6.5679, + "step": 40 + }, + { + "epoch": 0.04705207287333238, + "grad_norm": 168.0, + "learning_rate": 1.8348623853211011e-06, + "loss": 15.6762, + "step": 41 + }, + { + "epoch": 0.048199684406828286, + "grad_norm": 164.0, + "learning_rate": 1.8807339449541288e-06, + "loss": 6.2052, + "step": 42 + }, + { + "epoch": 0.0493472959403242, + "grad_norm": 103.5, + "learning_rate": 1.9266055045871564e-06, + "loss": 8.8464, + "step": 43 + }, + { + "epoch": 0.050494907473820114, + "grad_norm": 372.0, + "learning_rate": 1.9724770642201837e-06, + "loss": 15.5934, + "step": 44 + }, + { + "epoch": 0.05164251900731602, + "grad_norm": 210.0, + "learning_rate": 2.0183486238532113e-06, + "loss": 11.6431, + "step": 45 + }, + { + "epoch": 0.052790130540811935, + "grad_norm": 125.5, + "learning_rate": 2.064220183486239e-06, + "loss": 8.5935, + "step": 46 + }, + { + "epoch": 0.05393774207430785, + "grad_norm": 102.5, + "learning_rate": 2.110091743119266e-06, + "loss": 9.1192, + "step": 47 + }, + { + "epoch": 0.055085353607803755, + "grad_norm": 143.0, + "learning_rate": 2.155963302752294e-06, + "loss": 8.8696, + "step": 48 + }, + { + "epoch": 0.05623296514129967, + "grad_norm": 137.0, + "learning_rate": 2.2018348623853215e-06, + "loss": 11.2497, + "step": 49 + }, + { + "epoch": 0.05738057667479558, + "grad_norm": 129.0, + "learning_rate": 2.2477064220183487e-06, + "loss": 11.6115, + "step": 50 + }, + { + "epoch": 0.05852818820829149, + "grad_norm": 149.0, + "learning_rate": 2.2935779816513764e-06, + "loss": 13.9466, + "step": 51 + }, + { + "epoch": 0.059675799741787404, + "grad_norm": 120.5, + "learning_rate": 2.339449541284404e-06, + "loss": 9.2116, + "step": 52 + }, + { + "epoch": 0.06082341127528332, + "grad_norm": 240.0, + "learning_rate": 2.3853211009174317e-06, + "loss": 21.5138, + "step": 53 + }, + { + "epoch": 0.06197102280877923, + "grad_norm": 112.0, + "learning_rate": 2.431192660550459e-06, + "loss": 9.7849, + "step": 54 + }, + { + "epoch": 0.06311863434227515, + "grad_norm": 135.0, + "learning_rate": 2.4770642201834866e-06, + "loss": 12.795, + "step": 55 + }, + { + "epoch": 0.06426624587577105, + "grad_norm": 222.0, + "learning_rate": 2.522935779816514e-06, + "loss": 15.822, + "step": 56 + }, + { + "epoch": 0.06541385740926696, + "grad_norm": 137.0, + "learning_rate": 2.5688073394495415e-06, + "loss": 14.4134, + "step": 57 + }, + { + "epoch": 0.06656146894276288, + "grad_norm": 100.0, + "learning_rate": 2.6146788990825687e-06, + "loss": 10.1907, + "step": 58 + }, + { + "epoch": 0.06770908047625879, + "grad_norm": 154.0, + "learning_rate": 2.6605504587155968e-06, + "loss": 14.2824, + "step": 59 + }, + { + "epoch": 0.0688566920097547, + "grad_norm": 109.0, + "learning_rate": 2.706422018348624e-06, + "loss": 8.6855, + "step": 60 + }, + { + "epoch": 0.07000430354325061, + "grad_norm": 198.0, + "learning_rate": 2.7522935779816517e-06, + "loss": 14.2233, + "step": 61 + }, + { + "epoch": 0.07115191507674652, + "grad_norm": 134.0, + "learning_rate": 2.798165137614679e-06, + "loss": 11.8776, + "step": 62 + }, + { + "epoch": 0.07229952661024243, + "grad_norm": 112.5, + "learning_rate": 2.844036697247707e-06, + "loss": 9.321, + "step": 63 + }, + { + "epoch": 0.07344713814373835, + "grad_norm": 118.0, + "learning_rate": 2.8899082568807342e-06, + "loss": 10.727, + "step": 64 + }, + { + "epoch": 0.07459474967723426, + "grad_norm": 167.0, + "learning_rate": 2.935779816513762e-06, + "loss": 13.9879, + "step": 65 + }, + { + "epoch": 0.07574236121073016, + "grad_norm": 100.0, + "learning_rate": 2.981651376146789e-06, + "loss": 7.9334, + "step": 66 + }, + { + "epoch": 0.07688997274422608, + "grad_norm": 153.0, + "learning_rate": 3.0275229357798168e-06, + "loss": 14.9343, + "step": 67 + }, + { + "epoch": 0.07803758427772199, + "grad_norm": 119.0, + "learning_rate": 3.073394495412844e-06, + "loss": 8.6676, + "step": 68 + }, + { + "epoch": 0.0791851958112179, + "grad_norm": 117.5, + "learning_rate": 3.119266055045872e-06, + "loss": 10.101, + "step": 69 + }, + { + "epoch": 0.08033280734471382, + "grad_norm": 85.5, + "learning_rate": 3.1651376146788993e-06, + "loss": 7.5899, + "step": 70 + }, + { + "epoch": 0.08148041887820973, + "grad_norm": 174.0, + "learning_rate": 3.211009174311927e-06, + "loss": 15.9673, + "step": 71 + }, + { + "epoch": 0.08262803041170563, + "grad_norm": 221.0, + "learning_rate": 3.256880733944954e-06, + "loss": 14.1455, + "step": 72 + }, + { + "epoch": 0.08377564194520155, + "grad_norm": 172.0, + "learning_rate": 3.3027522935779823e-06, + "loss": 15.9228, + "step": 73 + }, + { + "epoch": 0.08492325347869746, + "grad_norm": 144.0, + "learning_rate": 3.3486238532110095e-06, + "loss": 12.6043, + "step": 74 + }, + { + "epoch": 0.08607086501219337, + "grad_norm": 118.5, + "learning_rate": 3.394495412844037e-06, + "loss": 9.2068, + "step": 75 + }, + { + "epoch": 0.08721847654568929, + "grad_norm": 147.0, + "learning_rate": 3.4403669724770644e-06, + "loss": 11.8722, + "step": 76 + }, + { + "epoch": 0.0883660880791852, + "grad_norm": 119.5, + "learning_rate": 3.486238532110092e-06, + "loss": 10.4207, + "step": 77 + }, + { + "epoch": 0.0895136996126811, + "grad_norm": 170.0, + "learning_rate": 3.5321100917431193e-06, + "loss": 14.4936, + "step": 78 + }, + { + "epoch": 0.09066131114617702, + "grad_norm": 183.0, + "learning_rate": 3.5779816513761473e-06, + "loss": 14.2192, + "step": 79 + }, + { + "epoch": 0.09180892267967293, + "grad_norm": 128.0, + "learning_rate": 3.6238532110091746e-06, + "loss": 12.4628, + "step": 80 + }, + { + "epoch": 0.09295653421316884, + "grad_norm": 100.0, + "learning_rate": 3.6697247706422022e-06, + "loss": 5.9004, + "step": 81 + }, + { + "epoch": 0.09410414574666476, + "grad_norm": 163.0, + "learning_rate": 3.7155963302752295e-06, + "loss": 12.766, + "step": 82 + }, + { + "epoch": 0.09525175728016066, + "grad_norm": 202.0, + "learning_rate": 3.7614678899082575e-06, + "loss": 14.3118, + "step": 83 + }, + { + "epoch": 0.09639936881365657, + "grad_norm": 314.0, + "learning_rate": 3.8073394495412848e-06, + "loss": 12.7559, + "step": 84 + }, + { + "epoch": 0.09754698034715249, + "grad_norm": 100.0, + "learning_rate": 3.853211009174313e-06, + "loss": 7.6448, + "step": 85 + }, + { + "epoch": 0.0986945918806484, + "grad_norm": 135.0, + "learning_rate": 3.89908256880734e-06, + "loss": 11.1222, + "step": 86 + }, + { + "epoch": 0.0998422034141443, + "grad_norm": 176.0, + "learning_rate": 3.944954128440367e-06, + "loss": 11.0153, + "step": 87 + }, + { + "epoch": 0.10098981494764023, + "grad_norm": 130.0, + "learning_rate": 3.9908256880733945e-06, + "loss": 11.3109, + "step": 88 + }, + { + "epoch": 0.10213742648113613, + "grad_norm": 176.0, + "learning_rate": 4.036697247706423e-06, + "loss": 11.3729, + "step": 89 + }, + { + "epoch": 0.10328503801463204, + "grad_norm": 132.0, + "learning_rate": 4.08256880733945e-06, + "loss": 10.5579, + "step": 90 + }, + { + "epoch": 0.10443264954812796, + "grad_norm": 126.0, + "learning_rate": 4.128440366972478e-06, + "loss": 9.2442, + "step": 91 + }, + { + "epoch": 0.10558026108162387, + "grad_norm": 151.0, + "learning_rate": 4.174311926605505e-06, + "loss": 13.6998, + "step": 92 + }, + { + "epoch": 0.10672787261511978, + "grad_norm": 99.5, + "learning_rate": 4.220183486238532e-06, + "loss": 8.6871, + "step": 93 + }, + { + "epoch": 0.1078754841486157, + "grad_norm": 128.0, + "learning_rate": 4.26605504587156e-06, + "loss": 8.4919, + "step": 94 + }, + { + "epoch": 0.1090230956821116, + "grad_norm": 132.0, + "learning_rate": 4.311926605504588e-06, + "loss": 8.9568, + "step": 95 + }, + { + "epoch": 0.11017070721560751, + "grad_norm": 135.0, + "learning_rate": 4.357798165137615e-06, + "loss": 11.2536, + "step": 96 + }, + { + "epoch": 0.11131831874910343, + "grad_norm": 141.0, + "learning_rate": 4.403669724770643e-06, + "loss": 10.4686, + "step": 97 + }, + { + "epoch": 0.11246593028259934, + "grad_norm": 78.5, + "learning_rate": 4.44954128440367e-06, + "loss": 4.7855, + "step": 98 + }, + { + "epoch": 0.11361354181609525, + "grad_norm": 126.5, + "learning_rate": 4.4954128440366975e-06, + "loss": 8.6237, + "step": 99 + }, + { + "epoch": 0.11476115334959117, + "grad_norm": 104.5, + "learning_rate": 4.541284403669725e-06, + "loss": 6.5662, + "step": 100 + }, + { + "epoch": 0.11476115334959117, + "eval_accuracy": 0.46, + "eval_loss": 10.765486717224121, + "eval_runtime": 49.6485, + "eval_samples_per_second": 2.014, + "eval_steps_per_second": 2.014, + "step": 100 + }, + { + "epoch": 0.11590876488308707, + "grad_norm": 103.0, + "learning_rate": 4.587155963302753e-06, + "loss": 6.5649, + "step": 101 + }, + { + "epoch": 0.11705637641658298, + "grad_norm": 144.0, + "learning_rate": 4.63302752293578e-06, + "loss": 8.2535, + "step": 102 + }, + { + "epoch": 0.1182039879500789, + "grad_norm": 135.0, + "learning_rate": 4.678899082568808e-06, + "loss": 11.0001, + "step": 103 + }, + { + "epoch": 0.11935159948357481, + "grad_norm": 109.0, + "learning_rate": 4.724770642201835e-06, + "loss": 8.2321, + "step": 104 + }, + { + "epoch": 0.12049921101707071, + "grad_norm": 134.0, + "learning_rate": 4.770642201834863e-06, + "loss": 10.8236, + "step": 105 + }, + { + "epoch": 0.12164682255056664, + "grad_norm": 133.0, + "learning_rate": 4.816513761467891e-06, + "loss": 10.03, + "step": 106 + }, + { + "epoch": 0.12279443408406254, + "grad_norm": 148.0, + "learning_rate": 4.862385321100918e-06, + "loss": 13.1908, + "step": 107 + }, + { + "epoch": 0.12394204561755846, + "grad_norm": 64.0, + "learning_rate": 4.908256880733945e-06, + "loss": 3.6086, + "step": 108 + }, + { + "epoch": 0.12508965715105436, + "grad_norm": 139.0, + "learning_rate": 4.954128440366973e-06, + "loss": 10.9146, + "step": 109 + }, + { + "epoch": 0.1262372686845503, + "grad_norm": 100.5, + "learning_rate": 5e-06, + "loss": 7.6266, + "step": 110 + }, + { + "epoch": 0.1273848802180462, + "grad_norm": 102.0, + "learning_rate": 5.045871559633028e-06, + "loss": 8.3553, + "step": 111 + }, + { + "epoch": 0.1285324917515421, + "grad_norm": 145.0, + "learning_rate": 5.091743119266055e-06, + "loss": 8.7646, + "step": 112 + }, + { + "epoch": 0.129680103285038, + "grad_norm": 178.0, + "learning_rate": 5.137614678899083e-06, + "loss": 12.6374, + "step": 113 + }, + { + "epoch": 0.13082771481853392, + "grad_norm": 91.5, + "learning_rate": 5.18348623853211e-06, + "loss": 5.8455, + "step": 114 + }, + { + "epoch": 0.13197532635202983, + "grad_norm": 122.0, + "learning_rate": 5.229357798165137e-06, + "loss": 9.7438, + "step": 115 + }, + { + "epoch": 0.13312293788552576, + "grad_norm": 192.0, + "learning_rate": 5.275229357798165e-06, + "loss": 9.9915, + "step": 116 + }, + { + "epoch": 0.13427054941902167, + "grad_norm": 108.0, + "learning_rate": 5.3211009174311936e-06, + "loss": 7.6686, + "step": 117 + }, + { + "epoch": 0.13541816095251757, + "grad_norm": 153.0, + "learning_rate": 5.366972477064221e-06, + "loss": 10.4111, + "step": 118 + }, + { + "epoch": 0.13656577248601348, + "grad_norm": 139.0, + "learning_rate": 5.412844036697248e-06, + "loss": 6.46, + "step": 119 + }, + { + "epoch": 0.1377133840195094, + "grad_norm": 79.0, + "learning_rate": 5.458715596330275e-06, + "loss": 5.2337, + "step": 120 + }, + { + "epoch": 0.1388609955530053, + "grad_norm": 114.5, + "learning_rate": 5.504587155963303e-06, + "loss": 5.2836, + "step": 121 + }, + { + "epoch": 0.14000860708650123, + "grad_norm": 99.5, + "learning_rate": 5.5504587155963306e-06, + "loss": 7.6412, + "step": 122 + }, + { + "epoch": 0.14115621861999714, + "grad_norm": 147.0, + "learning_rate": 5.596330275229358e-06, + "loss": 9.4328, + "step": 123 + }, + { + "epoch": 0.14230383015349304, + "grad_norm": 114.5, + "learning_rate": 5.642201834862385e-06, + "loss": 7.6121, + "step": 124 + }, + { + "epoch": 0.14345144168698895, + "grad_norm": 131.0, + "learning_rate": 5.688073394495414e-06, + "loss": 8.1481, + "step": 125 + }, + { + "epoch": 0.14459905322048486, + "grad_norm": 124.5, + "learning_rate": 5.733944954128441e-06, + "loss": 6.9154, + "step": 126 + }, + { + "epoch": 0.14574666475398076, + "grad_norm": 125.0, + "learning_rate": 5.7798165137614684e-06, + "loss": 7.5579, + "step": 127 + }, + { + "epoch": 0.1468942762874767, + "grad_norm": 69.0, + "learning_rate": 5.825688073394496e-06, + "loss": 4.5767, + "step": 128 + }, + { + "epoch": 0.1480418878209726, + "grad_norm": 136.0, + "learning_rate": 5.871559633027524e-06, + "loss": 8.226, + "step": 129 + }, + { + "epoch": 0.1491894993544685, + "grad_norm": 592.0, + "learning_rate": 5.917431192660551e-06, + "loss": 4.7686, + "step": 130 + }, + { + "epoch": 0.15033711088796442, + "grad_norm": 199.0, + "learning_rate": 5.963302752293578e-06, + "loss": 8.1914, + "step": 131 + }, + { + "epoch": 0.15148472242146033, + "grad_norm": 99.0, + "learning_rate": 6.0091743119266054e-06, + "loss": 4.6827, + "step": 132 + }, + { + "epoch": 0.15263233395495623, + "grad_norm": 79.0, + "learning_rate": 6.0550458715596335e-06, + "loss": 4.0036, + "step": 133 + }, + { + "epoch": 0.15377994548845217, + "grad_norm": 104.0, + "learning_rate": 6.100917431192661e-06, + "loss": 5.5383, + "step": 134 + }, + { + "epoch": 0.15492755702194808, + "grad_norm": 89.5, + "learning_rate": 6.146788990825688e-06, + "loss": 5.6737, + "step": 135 + }, + { + "epoch": 0.15607516855544398, + "grad_norm": 126.0, + "learning_rate": 6.192660550458715e-06, + "loss": 6.3379, + "step": 136 + }, + { + "epoch": 0.1572227800889399, + "grad_norm": 106.0, + "learning_rate": 6.238532110091744e-06, + "loss": 5.8609, + "step": 137 + }, + { + "epoch": 0.1583703916224358, + "grad_norm": 74.0, + "learning_rate": 6.284403669724771e-06, + "loss": 2.5903, + "step": 138 + }, + { + "epoch": 0.1595180031559317, + "grad_norm": 166.0, + "learning_rate": 6.330275229357799e-06, + "loss": 6.5836, + "step": 139 + }, + { + "epoch": 0.16066561468942764, + "grad_norm": 132.0, + "learning_rate": 6.376146788990826e-06, + "loss": 4.7941, + "step": 140 + }, + { + "epoch": 0.16181322622292355, + "grad_norm": 79.5, + "learning_rate": 6.422018348623854e-06, + "loss": 3.4315, + "step": 141 + }, + { + "epoch": 0.16296083775641945, + "grad_norm": 90.0, + "learning_rate": 6.467889908256881e-06, + "loss": 2.8439, + "step": 142 + }, + { + "epoch": 0.16410844928991536, + "grad_norm": 147.0, + "learning_rate": 6.513761467889908e-06, + "loss": 5.9459, + "step": 143 + }, + { + "epoch": 0.16525606082341127, + "grad_norm": 127.5, + "learning_rate": 6.559633027522936e-06, + "loss": 5.9421, + "step": 144 + }, + { + "epoch": 0.1664036723569072, + "grad_norm": 108.5, + "learning_rate": 6.6055045871559645e-06, + "loss": 4.3347, + "step": 145 + }, + { + "epoch": 0.1675512838904031, + "grad_norm": 110.0, + "learning_rate": 6.651376146788992e-06, + "loss": 2.862, + "step": 146 + }, + { + "epoch": 0.16869889542389901, + "grad_norm": 91.5, + "learning_rate": 6.697247706422019e-06, + "loss": 3.0382, + "step": 147 + }, + { + "epoch": 0.16984650695739492, + "grad_norm": 90.0, + "learning_rate": 6.743119266055046e-06, + "loss": 2.4137, + "step": 148 + }, + { + "epoch": 0.17099411849089083, + "grad_norm": 205.0, + "learning_rate": 6.788990825688074e-06, + "loss": 3.6585, + "step": 149 + }, + { + "epoch": 0.17214173002438674, + "grad_norm": 132.0, + "learning_rate": 6.8348623853211015e-06, + "loss": 3.3452, + "step": 150 + }, + { + "epoch": 0.17328934155788267, + "grad_norm": 102.5, + "learning_rate": 6.880733944954129e-06, + "loss": 2.6872, + "step": 151 + }, + { + "epoch": 0.17443695309137858, + "grad_norm": 92.5, + "learning_rate": 6.926605504587156e-06, + "loss": 2.7081, + "step": 152 + }, + { + "epoch": 0.17558456462487448, + "grad_norm": 97.0, + "learning_rate": 6.972477064220184e-06, + "loss": 1.789, + "step": 153 + }, + { + "epoch": 0.1767321761583704, + "grad_norm": 96.5, + "learning_rate": 7.018348623853211e-06, + "loss": 2.1933, + "step": 154 + }, + { + "epoch": 0.1778797876918663, + "grad_norm": 117.0, + "learning_rate": 7.0642201834862385e-06, + "loss": 1.5972, + "step": 155 + }, + { + "epoch": 0.1790273992253622, + "grad_norm": 70.0, + "learning_rate": 7.110091743119267e-06, + "loss": 1.6302, + "step": 156 + }, + { + "epoch": 0.18017501075885814, + "grad_norm": 50.0, + "learning_rate": 7.155963302752295e-06, + "loss": 1.1936, + "step": 157 + }, + { + "epoch": 0.18132262229235405, + "grad_norm": 71.5, + "learning_rate": 7.201834862385322e-06, + "loss": 1.2134, + "step": 158 + }, + { + "epoch": 0.18247023382584995, + "grad_norm": 37.75, + "learning_rate": 7.247706422018349e-06, + "loss": 0.8042, + "step": 159 + }, + { + "epoch": 0.18361784535934586, + "grad_norm": 54.75, + "learning_rate": 7.293577981651376e-06, + "loss": 0.7016, + "step": 160 + }, + { + "epoch": 0.18476545689284177, + "grad_norm": 141.0, + "learning_rate": 7.3394495412844045e-06, + "loss": 1.6214, + "step": 161 + }, + { + "epoch": 0.18591306842633767, + "grad_norm": 118.0, + "learning_rate": 7.385321100917432e-06, + "loss": 1.4091, + "step": 162 + }, + { + "epoch": 0.1870606799598336, + "grad_norm": 42.0, + "learning_rate": 7.431192660550459e-06, + "loss": 0.636, + "step": 163 + }, + { + "epoch": 0.18820829149332952, + "grad_norm": 165.0, + "learning_rate": 7.477064220183486e-06, + "loss": 1.8344, + "step": 164 + }, + { + "epoch": 0.18935590302682542, + "grad_norm": 62.0, + "learning_rate": 7.522935779816515e-06, + "loss": 0.5202, + "step": 165 + }, + { + "epoch": 0.19050351456032133, + "grad_norm": 87.5, + "learning_rate": 7.568807339449542e-06, + "loss": 1.1639, + "step": 166 + }, + { + "epoch": 0.19165112609381724, + "grad_norm": 53.0, + "learning_rate": 7.6146788990825695e-06, + "loss": 0.907, + "step": 167 + }, + { + "epoch": 0.19279873762731314, + "grad_norm": 59.5, + "learning_rate": 7.660550458715596e-06, + "loss": 1.0624, + "step": 168 + }, + { + "epoch": 0.19394634916080908, + "grad_norm": 37.0, + "learning_rate": 7.706422018348626e-06, + "loss": 0.6051, + "step": 169 + }, + { + "epoch": 0.19509396069430499, + "grad_norm": 50.75, + "learning_rate": 7.752293577981652e-06, + "loss": 0.9568, + "step": 170 + }, + { + "epoch": 0.1962415722278009, + "grad_norm": 99.5, + "learning_rate": 7.79816513761468e-06, + "loss": 1.0009, + "step": 171 + }, + { + "epoch": 0.1973891837612968, + "grad_norm": 58.75, + "learning_rate": 7.844036697247707e-06, + "loss": 1.2179, + "step": 172 + }, + { + "epoch": 0.1985367952947927, + "grad_norm": 30.375, + "learning_rate": 7.889908256880735e-06, + "loss": 0.2789, + "step": 173 + }, + { + "epoch": 0.1996844068282886, + "grad_norm": 48.5, + "learning_rate": 7.935779816513763e-06, + "loss": 0.7911, + "step": 174 + }, + { + "epoch": 0.20083201836178455, + "grad_norm": 42.25, + "learning_rate": 7.981651376146789e-06, + "loss": 0.8686, + "step": 175 + }, + { + "epoch": 0.20197962989528045, + "grad_norm": 141.0, + "learning_rate": 8.027522935779817e-06, + "loss": 1.1276, + "step": 176 + }, + { + "epoch": 0.20312724142877636, + "grad_norm": 156.0, + "learning_rate": 8.073394495412845e-06, + "loss": 0.8758, + "step": 177 + }, + { + "epoch": 0.20427485296227227, + "grad_norm": 32.75, + "learning_rate": 8.119266055045872e-06, + "loss": 0.6642, + "step": 178 + }, + { + "epoch": 0.20542246449576818, + "grad_norm": 56.25, + "learning_rate": 8.1651376146789e-06, + "loss": 1.0594, + "step": 179 + }, + { + "epoch": 0.20657007602926408, + "grad_norm": 34.5, + "learning_rate": 8.211009174311926e-06, + "loss": 0.6556, + "step": 180 + }, + { + "epoch": 0.20771768756276002, + "grad_norm": 80.5, + "learning_rate": 8.256880733944956e-06, + "loss": 0.8868, + "step": 181 + }, + { + "epoch": 0.20886529909625592, + "grad_norm": 47.5, + "learning_rate": 8.302752293577982e-06, + "loss": 0.7725, + "step": 182 + }, + { + "epoch": 0.21001291062975183, + "grad_norm": 23.0, + "learning_rate": 8.34862385321101e-06, + "loss": 0.6719, + "step": 183 + }, + { + "epoch": 0.21116052216324774, + "grad_norm": 72.0, + "learning_rate": 8.394495412844037e-06, + "loss": 0.8492, + "step": 184 + }, + { + "epoch": 0.21230813369674364, + "grad_norm": 73.0, + "learning_rate": 8.440366972477065e-06, + "loss": 0.7163, + "step": 185 + }, + { + "epoch": 0.21345574523023955, + "grad_norm": 82.0, + "learning_rate": 8.486238532110093e-06, + "loss": 0.7227, + "step": 186 + }, + { + "epoch": 0.2146033567637355, + "grad_norm": 48.75, + "learning_rate": 8.53211009174312e-06, + "loss": 0.8237, + "step": 187 + }, + { + "epoch": 0.2157509682972314, + "grad_norm": 30.75, + "learning_rate": 8.577981651376147e-06, + "loss": 0.7007, + "step": 188 + }, + { + "epoch": 0.2168985798307273, + "grad_norm": 76.0, + "learning_rate": 8.623853211009175e-06, + "loss": 0.7568, + "step": 189 + }, + { + "epoch": 0.2180461913642232, + "grad_norm": 126.0, + "learning_rate": 8.669724770642203e-06, + "loss": 0.8139, + "step": 190 + }, + { + "epoch": 0.21919380289771911, + "grad_norm": 67.5, + "learning_rate": 8.71559633027523e-06, + "loss": 0.7062, + "step": 191 + }, + { + "epoch": 0.22034141443121502, + "grad_norm": 26.375, + "learning_rate": 8.761467889908258e-06, + "loss": 0.5425, + "step": 192 + }, + { + "epoch": 0.22148902596471096, + "grad_norm": 105.5, + "learning_rate": 8.807339449541286e-06, + "loss": 0.8822, + "step": 193 + }, + { + "epoch": 0.22263663749820686, + "grad_norm": 131.0, + "learning_rate": 8.853211009174312e-06, + "loss": 0.9047, + "step": 194 + }, + { + "epoch": 0.22378424903170277, + "grad_norm": 56.5, + "learning_rate": 8.89908256880734e-06, + "loss": 0.5039, + "step": 195 + }, + { + "epoch": 0.22493186056519868, + "grad_norm": 73.5, + "learning_rate": 8.944954128440367e-06, + "loss": 0.7597, + "step": 196 + }, + { + "epoch": 0.22607947209869458, + "grad_norm": 56.25, + "learning_rate": 8.990825688073395e-06, + "loss": 0.742, + "step": 197 + }, + { + "epoch": 0.2272270836321905, + "grad_norm": 71.5, + "learning_rate": 9.036697247706423e-06, + "loss": 0.892, + "step": 198 + }, + { + "epoch": 0.22837469516568643, + "grad_norm": 33.0, + "learning_rate": 9.08256880733945e-06, + "loss": 0.6746, + "step": 199 + }, + { + "epoch": 0.22952230669918233, + "grad_norm": 95.0, + "learning_rate": 9.128440366972477e-06, + "loss": 0.8428, + "step": 200 + }, + { + "epoch": 0.22952230669918233, + "eval_accuracy": 0.23, + "eval_loss": 0.7526699900627136, + "eval_runtime": 49.2923, + "eval_samples_per_second": 2.029, + "eval_steps_per_second": 2.029, + "step": 200 + }, + { + "epoch": 0.23066991823267824, + "grad_norm": 43.0, + "learning_rate": 9.174311926605506e-06, + "loss": 0.6504, + "step": 201 + }, + { + "epoch": 0.23181752976617415, + "grad_norm": 46.75, + "learning_rate": 9.220183486238534e-06, + "loss": 0.7568, + "step": 202 + }, + { + "epoch": 0.23296514129967005, + "grad_norm": 76.5, + "learning_rate": 9.26605504587156e-06, + "loss": 0.5601, + "step": 203 + }, + { + "epoch": 0.23411275283316596, + "grad_norm": 82.0, + "learning_rate": 9.311926605504588e-06, + "loss": 0.6661, + "step": 204 + }, + { + "epoch": 0.2352603643666619, + "grad_norm": 63.75, + "learning_rate": 9.357798165137616e-06, + "loss": 0.7619, + "step": 205 + }, + { + "epoch": 0.2364079759001578, + "grad_norm": 28.5, + "learning_rate": 9.403669724770643e-06, + "loss": 0.6332, + "step": 206 + }, + { + "epoch": 0.2375555874336537, + "grad_norm": 48.75, + "learning_rate": 9.44954128440367e-06, + "loss": 0.8103, + "step": 207 + }, + { + "epoch": 0.23870319896714962, + "grad_norm": 32.25, + "learning_rate": 9.495412844036697e-06, + "loss": 0.8623, + "step": 208 + }, + { + "epoch": 0.23985081050064552, + "grad_norm": 51.25, + "learning_rate": 9.541284403669727e-06, + "loss": 0.6734, + "step": 209 + }, + { + "epoch": 0.24099842203414143, + "grad_norm": 106.0, + "learning_rate": 9.587155963302753e-06, + "loss": 0.7637, + "step": 210 + }, + { + "epoch": 0.24214603356763736, + "grad_norm": 43.5, + "learning_rate": 9.633027522935781e-06, + "loss": 0.6827, + "step": 211 + }, + { + "epoch": 0.24329364510113327, + "grad_norm": 56.25, + "learning_rate": 9.678899082568808e-06, + "loss": 0.9193, + "step": 212 + }, + { + "epoch": 0.24444125663462918, + "grad_norm": 67.5, + "learning_rate": 9.724770642201836e-06, + "loss": 0.8784, + "step": 213 + }, + { + "epoch": 0.24558886816812509, + "grad_norm": 61.0, + "learning_rate": 9.770642201834864e-06, + "loss": 0.6853, + "step": 214 + }, + { + "epoch": 0.246736479701621, + "grad_norm": 33.5, + "learning_rate": 9.81651376146789e-06, + "loss": 0.6893, + "step": 215 + }, + { + "epoch": 0.24788409123511693, + "grad_norm": 20.5, + "learning_rate": 9.862385321100918e-06, + "loss": 0.6858, + "step": 216 + }, + { + "epoch": 0.24903170276861283, + "grad_norm": 51.0, + "learning_rate": 9.908256880733946e-06, + "loss": 0.5894, + "step": 217 + }, + { + "epoch": 0.2501793143021087, + "grad_norm": 61.0, + "learning_rate": 9.954128440366973e-06, + "loss": 0.9096, + "step": 218 + }, + { + "epoch": 0.2513269258356047, + "grad_norm": 28.625, + "learning_rate": 1e-05, + "loss": 0.801, + "step": 219 + }, + { + "epoch": 0.2524745373691006, + "grad_norm": 41.0, + "learning_rate": 1.0045871559633029e-05, + "loss": 0.6585, + "step": 220 + }, + { + "epoch": 0.2536221489025965, + "grad_norm": 39.75, + "learning_rate": 1.0091743119266055e-05, + "loss": 0.7587, + "step": 221 + }, + { + "epoch": 0.2547697604360924, + "grad_norm": 40.0, + "learning_rate": 1.0137614678899083e-05, + "loss": 0.7094, + "step": 222 + }, + { + "epoch": 0.2559173719695883, + "grad_norm": 684.0, + "learning_rate": 1.018348623853211e-05, + "loss": 0.7388, + "step": 223 + }, + { + "epoch": 0.2570649835030842, + "grad_norm": 54.5, + "learning_rate": 1.0229357798165138e-05, + "loss": 0.7495, + "step": 224 + }, + { + "epoch": 0.2582125950365801, + "grad_norm": 65.5, + "learning_rate": 1.0275229357798166e-05, + "loss": 0.834, + "step": 225 + }, + { + "epoch": 0.259360206570076, + "grad_norm": 68.5, + "learning_rate": 1.0321100917431192e-05, + "loss": 0.9911, + "step": 226 + }, + { + "epoch": 0.26050781810357193, + "grad_norm": 59.75, + "learning_rate": 1.036697247706422e-05, + "loss": 0.7996, + "step": 227 + }, + { + "epoch": 0.26165542963706784, + "grad_norm": 39.0, + "learning_rate": 1.041284403669725e-05, + "loss": 0.7586, + "step": 228 + }, + { + "epoch": 0.26280304117056374, + "grad_norm": 41.0, + "learning_rate": 1.0458715596330275e-05, + "loss": 0.6575, + "step": 229 + }, + { + "epoch": 0.26395065270405965, + "grad_norm": 25.75, + "learning_rate": 1.0504587155963305e-05, + "loss": 0.5676, + "step": 230 + }, + { + "epoch": 0.2650982642375556, + "grad_norm": 38.5, + "learning_rate": 1.055045871559633e-05, + "loss": 0.7107, + "step": 231 + }, + { + "epoch": 0.2662458757710515, + "grad_norm": 29.0, + "learning_rate": 1.0596330275229359e-05, + "loss": 0.5768, + "step": 232 + }, + { + "epoch": 0.26739348730454743, + "grad_norm": 67.0, + "learning_rate": 1.0642201834862387e-05, + "loss": 0.8002, + "step": 233 + }, + { + "epoch": 0.26854109883804334, + "grad_norm": 92.0, + "learning_rate": 1.0688073394495414e-05, + "loss": 0.9373, + "step": 234 + }, + { + "epoch": 0.26968871037153924, + "grad_norm": 95.0, + "learning_rate": 1.0733944954128442e-05, + "loss": 0.9883, + "step": 235 + }, + { + "epoch": 0.27083632190503515, + "grad_norm": 32.25, + "learning_rate": 1.077981651376147e-05, + "loss": 0.3327, + "step": 236 + }, + { + "epoch": 0.27198393343853106, + "grad_norm": 38.75, + "learning_rate": 1.0825688073394496e-05, + "loss": 0.9128, + "step": 237 + }, + { + "epoch": 0.27313154497202696, + "grad_norm": 113.5, + "learning_rate": 1.0871559633027524e-05, + "loss": 0.7185, + "step": 238 + }, + { + "epoch": 0.27427915650552287, + "grad_norm": 78.5, + "learning_rate": 1.091743119266055e-05, + "loss": 0.7406, + "step": 239 + }, + { + "epoch": 0.2754267680390188, + "grad_norm": 54.25, + "learning_rate": 1.0963302752293579e-05, + "loss": 0.5355, + "step": 240 + }, + { + "epoch": 0.2765743795725147, + "grad_norm": 88.0, + "learning_rate": 1.1009174311926607e-05, + "loss": 0.7876, + "step": 241 + }, + { + "epoch": 0.2777219911060106, + "grad_norm": 25.125, + "learning_rate": 1.1055045871559633e-05, + "loss": 0.7005, + "step": 242 + }, + { + "epoch": 0.27886960263950655, + "grad_norm": 62.0, + "learning_rate": 1.1100917431192661e-05, + "loss": 0.6772, + "step": 243 + }, + { + "epoch": 0.28001721417300246, + "grad_norm": 88.5, + "learning_rate": 1.114678899082569e-05, + "loss": 0.7296, + "step": 244 + }, + { + "epoch": 0.28116482570649837, + "grad_norm": 29.375, + "learning_rate": 1.1192660550458716e-05, + "loss": 0.7339, + "step": 245 + }, + { + "epoch": 0.2823124372399943, + "grad_norm": 21.75, + "learning_rate": 1.1238532110091744e-05, + "loss": 0.5743, + "step": 246 + }, + { + "epoch": 0.2834600487734902, + "grad_norm": 127.5, + "learning_rate": 1.128440366972477e-05, + "loss": 0.9532, + "step": 247 + }, + { + "epoch": 0.2846076603069861, + "grad_norm": 97.0, + "learning_rate": 1.1330275229357798e-05, + "loss": 0.9855, + "step": 248 + }, + { + "epoch": 0.285755271840482, + "grad_norm": 54.25, + "learning_rate": 1.1376146788990828e-05, + "loss": 0.6011, + "step": 249 + }, + { + "epoch": 0.2869028833739779, + "grad_norm": 27.125, + "learning_rate": 1.1422018348623853e-05, + "loss": 0.4934, + "step": 250 + }, + { + "epoch": 0.2880504949074738, + "grad_norm": 156.0, + "learning_rate": 1.1467889908256882e-05, + "loss": 1.0312, + "step": 251 + }, + { + "epoch": 0.2891981064409697, + "grad_norm": 31.5, + "learning_rate": 1.151376146788991e-05, + "loss": 0.6735, + "step": 252 + }, + { + "epoch": 0.2903457179744656, + "grad_norm": 26.0, + "learning_rate": 1.1559633027522937e-05, + "loss": 0.5176, + "step": 253 + }, + { + "epoch": 0.29149332950796153, + "grad_norm": 28.0, + "learning_rate": 1.1605504587155965e-05, + "loss": 0.7067, + "step": 254 + }, + { + "epoch": 0.2926409410414575, + "grad_norm": 50.75, + "learning_rate": 1.1651376146788991e-05, + "loss": 0.5816, + "step": 255 + }, + { + "epoch": 0.2937885525749534, + "grad_norm": 33.0, + "learning_rate": 1.169724770642202e-05, + "loss": 0.5099, + "step": 256 + }, + { + "epoch": 0.2949361641084493, + "grad_norm": 63.25, + "learning_rate": 1.1743119266055047e-05, + "loss": 0.6038, + "step": 257 + }, + { + "epoch": 0.2960837756419452, + "grad_norm": 152.0, + "learning_rate": 1.1788990825688074e-05, + "loss": 1.2612, + "step": 258 + }, + { + "epoch": 0.2972313871754411, + "grad_norm": 55.5, + "learning_rate": 1.1834862385321102e-05, + "loss": 0.8309, + "step": 259 + }, + { + "epoch": 0.298378998708937, + "grad_norm": 49.75, + "learning_rate": 1.188073394495413e-05, + "loss": 0.7434, + "step": 260 + }, + { + "epoch": 0.29952661024243293, + "grad_norm": 38.25, + "learning_rate": 1.1926605504587156e-05, + "loss": 0.6988, + "step": 261 + }, + { + "epoch": 0.30067422177592884, + "grad_norm": 31.25, + "learning_rate": 1.1972477064220184e-05, + "loss": 0.674, + "step": 262 + }, + { + "epoch": 0.30182183330942475, + "grad_norm": 61.25, + "learning_rate": 1.2018348623853211e-05, + "loss": 0.8105, + "step": 263 + }, + { + "epoch": 0.30296944484292065, + "grad_norm": 67.0, + "learning_rate": 1.2064220183486239e-05, + "loss": 0.7834, + "step": 264 + }, + { + "epoch": 0.30411705637641656, + "grad_norm": 34.0, + "learning_rate": 1.2110091743119267e-05, + "loss": 0.6694, + "step": 265 + }, + { + "epoch": 0.30526466790991247, + "grad_norm": 48.75, + "learning_rate": 1.2155963302752293e-05, + "loss": 0.4389, + "step": 266 + }, + { + "epoch": 0.30641227944340843, + "grad_norm": 45.0, + "learning_rate": 1.2201834862385321e-05, + "loss": 0.9619, + "step": 267 + }, + { + "epoch": 0.30755989097690434, + "grad_norm": 27.25, + "learning_rate": 1.2247706422018351e-05, + "loss": 0.8181, + "step": 268 + }, + { + "epoch": 0.30870750251040024, + "grad_norm": 78.5, + "learning_rate": 1.2293577981651376e-05, + "loss": 0.8289, + "step": 269 + }, + { + "epoch": 0.30985511404389615, + "grad_norm": 29.625, + "learning_rate": 1.2339449541284406e-05, + "loss": 0.66, + "step": 270 + }, + { + "epoch": 0.31100272557739206, + "grad_norm": 51.25, + "learning_rate": 1.238532110091743e-05, + "loss": 0.6833, + "step": 271 + }, + { + "epoch": 0.31215033711088797, + "grad_norm": 45.0, + "learning_rate": 1.243119266055046e-05, + "loss": 0.6545, + "step": 272 + }, + { + "epoch": 0.3132979486443839, + "grad_norm": 35.5, + "learning_rate": 1.2477064220183488e-05, + "loss": 0.6642, + "step": 273 + }, + { + "epoch": 0.3144455601778798, + "grad_norm": 27.75, + "learning_rate": 1.2522935779816515e-05, + "loss": 0.7786, + "step": 274 + }, + { + "epoch": 0.3155931717113757, + "grad_norm": 103.0, + "learning_rate": 1.2568807339449543e-05, + "loss": 0.9578, + "step": 275 + }, + { + "epoch": 0.3167407832448716, + "grad_norm": 61.75, + "learning_rate": 1.261467889908257e-05, + "loss": 0.5513, + "step": 276 + }, + { + "epoch": 0.3178883947783675, + "grad_norm": 86.5, + "learning_rate": 1.2660550458715597e-05, + "loss": 0.855, + "step": 277 + }, + { + "epoch": 0.3190360063118634, + "grad_norm": 47.0, + "learning_rate": 1.2706422018348625e-05, + "loss": 0.7903, + "step": 278 + }, + { + "epoch": 0.32018361784535937, + "grad_norm": 21.125, + "learning_rate": 1.2752293577981652e-05, + "loss": 0.6084, + "step": 279 + }, + { + "epoch": 0.3213312293788553, + "grad_norm": 53.0, + "learning_rate": 1.279816513761468e-05, + "loss": 0.7655, + "step": 280 + }, + { + "epoch": 0.3224788409123512, + "grad_norm": 69.0, + "learning_rate": 1.2844036697247708e-05, + "loss": 0.7763, + "step": 281 + }, + { + "epoch": 0.3236264524458471, + "grad_norm": 98.0, + "learning_rate": 1.2889908256880734e-05, + "loss": 0.8355, + "step": 282 + }, + { + "epoch": 0.324774063979343, + "grad_norm": 65.0, + "learning_rate": 1.2935779816513762e-05, + "loss": 0.7071, + "step": 283 + }, + { + "epoch": 0.3259216755128389, + "grad_norm": 25.75, + "learning_rate": 1.298165137614679e-05, + "loss": 0.8358, + "step": 284 + }, + { + "epoch": 0.3270692870463348, + "grad_norm": 48.25, + "learning_rate": 1.3027522935779817e-05, + "loss": 0.7069, + "step": 285 + }, + { + "epoch": 0.3282168985798307, + "grad_norm": 27.75, + "learning_rate": 1.3073394495412845e-05, + "loss": 0.601, + "step": 286 + }, + { + "epoch": 0.3293645101133266, + "grad_norm": 44.25, + "learning_rate": 1.3119266055045871e-05, + "loss": 0.6844, + "step": 287 + }, + { + "epoch": 0.33051212164682253, + "grad_norm": 73.0, + "learning_rate": 1.31651376146789e-05, + "loss": 1.5458, + "step": 288 + }, + { + "epoch": 0.33165973318031844, + "grad_norm": 36.0, + "learning_rate": 1.3211009174311929e-05, + "loss": 0.8631, + "step": 289 + }, + { + "epoch": 0.3328073447138144, + "grad_norm": 60.25, + "learning_rate": 1.3256880733944954e-05, + "loss": 0.7894, + "step": 290 + }, + { + "epoch": 0.3339549562473103, + "grad_norm": 46.75, + "learning_rate": 1.3302752293577984e-05, + "loss": 0.7715, + "step": 291 + }, + { + "epoch": 0.3351025677808062, + "grad_norm": 17.5, + "learning_rate": 1.3348623853211012e-05, + "loss": 0.5756, + "step": 292 + }, + { + "epoch": 0.3362501793143021, + "grad_norm": 32.0, + "learning_rate": 1.3394495412844038e-05, + "loss": 0.7155, + "step": 293 + }, + { + "epoch": 0.33739779084779803, + "grad_norm": 96.5, + "learning_rate": 1.3440366972477066e-05, + "loss": 0.6518, + "step": 294 + }, + { + "epoch": 0.33854540238129394, + "grad_norm": 37.75, + "learning_rate": 1.3486238532110092e-05, + "loss": 0.6962, + "step": 295 + }, + { + "epoch": 0.33969301391478984, + "grad_norm": 57.5, + "learning_rate": 1.353211009174312e-05, + "loss": 0.5687, + "step": 296 + }, + { + "epoch": 0.34084062544828575, + "grad_norm": 20.125, + "learning_rate": 1.3577981651376149e-05, + "loss": 0.7647, + "step": 297 + }, + { + "epoch": 0.34198823698178166, + "grad_norm": 22.75, + "learning_rate": 1.3623853211009175e-05, + "loss": 0.7313, + "step": 298 + }, + { + "epoch": 0.34313584851527756, + "grad_norm": 71.0, + "learning_rate": 1.3669724770642203e-05, + "loss": 0.8702, + "step": 299 + }, + { + "epoch": 0.34428346004877347, + "grad_norm": 70.0, + "learning_rate": 1.3715596330275231e-05, + "loss": 0.7895, + "step": 300 + }, + { + "epoch": 0.34428346004877347, + "eval_accuracy": 0.22, + "eval_loss": 0.6987403631210327, + "eval_runtime": 49.3136, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 2.028, + "step": 300 + }, + { + "epoch": 0.3454310715822694, + "grad_norm": 32.0, + "learning_rate": 1.3761467889908258e-05, + "loss": 0.6857, + "step": 301 + }, + { + "epoch": 0.34657868311576534, + "grad_norm": 58.25, + "learning_rate": 1.3807339449541286e-05, + "loss": 0.6662, + "step": 302 + }, + { + "epoch": 0.34772629464926125, + "grad_norm": 26.875, + "learning_rate": 1.3853211009174312e-05, + "loss": 0.5594, + "step": 303 + }, + { + "epoch": 0.34887390618275715, + "grad_norm": 36.5, + "learning_rate": 1.389908256880734e-05, + "loss": 0.6889, + "step": 304 + }, + { + "epoch": 0.35002151771625306, + "grad_norm": 49.0, + "learning_rate": 1.3944954128440368e-05, + "loss": 0.6969, + "step": 305 + }, + { + "epoch": 0.35116912924974897, + "grad_norm": 173.0, + "learning_rate": 1.3990825688073395e-05, + "loss": 0.6462, + "step": 306 + }, + { + "epoch": 0.3523167407832449, + "grad_norm": 60.0, + "learning_rate": 1.4036697247706423e-05, + "loss": 0.539, + "step": 307 + }, + { + "epoch": 0.3534643523167408, + "grad_norm": 27.125, + "learning_rate": 1.4082568807339452e-05, + "loss": 0.855, + "step": 308 + }, + { + "epoch": 0.3546119638502367, + "grad_norm": 61.5, + "learning_rate": 1.4128440366972477e-05, + "loss": 0.7295, + "step": 309 + }, + { + "epoch": 0.3557595753837326, + "grad_norm": 56.25, + "learning_rate": 1.4174311926605507e-05, + "loss": 0.8013, + "step": 310 + }, + { + "epoch": 0.3569071869172285, + "grad_norm": 19.375, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.6061, + "step": 311 + }, + { + "epoch": 0.3580547984507244, + "grad_norm": 22.75, + "learning_rate": 1.4266055045871561e-05, + "loss": 0.2982, + "step": 312 + }, + { + "epoch": 0.3592024099842203, + "grad_norm": 102.0, + "learning_rate": 1.431192660550459e-05, + "loss": 1.1087, + "step": 313 + }, + { + "epoch": 0.3603500215177163, + "grad_norm": 42.25, + "learning_rate": 1.4357798165137616e-05, + "loss": 0.8645, + "step": 314 + }, + { + "epoch": 0.3614976330512122, + "grad_norm": 112.0, + "learning_rate": 1.4403669724770644e-05, + "loss": 1.2745, + "step": 315 + }, + { + "epoch": 0.3626452445847081, + "grad_norm": 21.125, + "learning_rate": 1.4449541284403672e-05, + "loss": 0.3714, + "step": 316 + }, + { + "epoch": 0.363792856118204, + "grad_norm": 87.0, + "learning_rate": 1.4495412844036698e-05, + "loss": 0.9653, + "step": 317 + }, + { + "epoch": 0.3649404676516999, + "grad_norm": 82.0, + "learning_rate": 1.4541284403669726e-05, + "loss": 0.982, + "step": 318 + }, + { + "epoch": 0.3660880791851958, + "grad_norm": 27.375, + "learning_rate": 1.4587155963302753e-05, + "loss": 0.4173, + "step": 319 + }, + { + "epoch": 0.3672356907186917, + "grad_norm": 74.0, + "learning_rate": 1.463302752293578e-05, + "loss": 0.9199, + "step": 320 + }, + { + "epoch": 0.3683833022521876, + "grad_norm": 57.0, + "learning_rate": 1.4678899082568809e-05, + "loss": 0.6555, + "step": 321 + }, + { + "epoch": 0.36953091378568353, + "grad_norm": 40.25, + "learning_rate": 1.4724770642201835e-05, + "loss": 0.5512, + "step": 322 + }, + { + "epoch": 0.37067852531917944, + "grad_norm": 51.0, + "learning_rate": 1.4770642201834863e-05, + "loss": 0.8541, + "step": 323 + }, + { + "epoch": 0.37182613685267535, + "grad_norm": 112.0, + "learning_rate": 1.4816513761467891e-05, + "loss": 1.031, + "step": 324 + }, + { + "epoch": 0.37297374838617126, + "grad_norm": 17.625, + "learning_rate": 1.4862385321100918e-05, + "loss": 0.448, + "step": 325 + }, + { + "epoch": 0.3741213599196672, + "grad_norm": 109.0, + "learning_rate": 1.4908256880733946e-05, + "loss": 1.0731, + "step": 326 + }, + { + "epoch": 0.3752689714531631, + "grad_norm": 76.0, + "learning_rate": 1.4954128440366972e-05, + "loss": 0.9293, + "step": 327 + }, + { + "epoch": 0.37641658298665903, + "grad_norm": 142.0, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.2647, + "step": 328 + }, + { + "epoch": 0.37756419452015494, + "grad_norm": 17.0, + "learning_rate": 1.504587155963303e-05, + "loss": 0.5858, + "step": 329 + }, + { + "epoch": 0.37871180605365085, + "grad_norm": 63.5, + "learning_rate": 1.5091743119266057e-05, + "loss": 1.1073, + "step": 330 + }, + { + "epoch": 0.37985941758714675, + "grad_norm": 23.125, + "learning_rate": 1.5137614678899085e-05, + "loss": 0.6616, + "step": 331 + }, + { + "epoch": 0.38100702912064266, + "grad_norm": 23.5, + "learning_rate": 1.5183486238532111e-05, + "loss": 0.81, + "step": 332 + }, + { + "epoch": 0.38215464065413857, + "grad_norm": 44.5, + "learning_rate": 1.5229357798165139e-05, + "loss": 0.7774, + "step": 333 + }, + { + "epoch": 0.3833022521876345, + "grad_norm": 53.0, + "learning_rate": 1.5275229357798167e-05, + "loss": 0.7527, + "step": 334 + }, + { + "epoch": 0.3844498637211304, + "grad_norm": 26.0, + "learning_rate": 1.5321100917431192e-05, + "loss": 0.5953, + "step": 335 + }, + { + "epoch": 0.3855974752546263, + "grad_norm": 81.5, + "learning_rate": 1.536697247706422e-05, + "loss": 1.1549, + "step": 336 + }, + { + "epoch": 0.3867450867881222, + "grad_norm": 40.25, + "learning_rate": 1.541284403669725e-05, + "loss": 0.6953, + "step": 337 + }, + { + "epoch": 0.38789269832161816, + "grad_norm": 59.5, + "learning_rate": 1.5458715596330276e-05, + "loss": 0.9157, + "step": 338 + }, + { + "epoch": 0.38904030985511406, + "grad_norm": 37.25, + "learning_rate": 1.5504587155963304e-05, + "loss": 0.6101, + "step": 339 + }, + { + "epoch": 0.39018792138860997, + "grad_norm": 47.75, + "learning_rate": 1.555045871559633e-05, + "loss": 0.6971, + "step": 340 + }, + { + "epoch": 0.3913355329221059, + "grad_norm": 35.5, + "learning_rate": 1.559633027522936e-05, + "loss": 0.6038, + "step": 341 + }, + { + "epoch": 0.3924831444556018, + "grad_norm": 258.0, + "learning_rate": 1.564220183486239e-05, + "loss": 0.7838, + "step": 342 + }, + { + "epoch": 0.3936307559890977, + "grad_norm": 19.625, + "learning_rate": 1.5688073394495413e-05, + "loss": 0.6458, + "step": 343 + }, + { + "epoch": 0.3947783675225936, + "grad_norm": 78.5, + "learning_rate": 1.573394495412844e-05, + "loss": 0.8405, + "step": 344 + }, + { + "epoch": 0.3959259790560895, + "grad_norm": 118.5, + "learning_rate": 1.577981651376147e-05, + "loss": 1.0364, + "step": 345 + }, + { + "epoch": 0.3970735905895854, + "grad_norm": 30.0, + "learning_rate": 1.5825688073394497e-05, + "loss": 0.5703, + "step": 346 + }, + { + "epoch": 0.3982212021230813, + "grad_norm": 60.75, + "learning_rate": 1.5871559633027525e-05, + "loss": 0.8595, + "step": 347 + }, + { + "epoch": 0.3993688136565772, + "grad_norm": 78.5, + "learning_rate": 1.591743119266055e-05, + "loss": 0.8161, + "step": 348 + }, + { + "epoch": 0.40051642519007313, + "grad_norm": 33.75, + "learning_rate": 1.5963302752293578e-05, + "loss": 0.7062, + "step": 349 + }, + { + "epoch": 0.4016640367235691, + "grad_norm": 20.75, + "learning_rate": 1.6009174311926606e-05, + "loss": 0.825, + "step": 350 + }, + { + "epoch": 0.402811648257065, + "grad_norm": 20.375, + "learning_rate": 1.6055045871559634e-05, + "loss": 0.5635, + "step": 351 + }, + { + "epoch": 0.4039592597905609, + "grad_norm": 26.0, + "learning_rate": 1.6100917431192662e-05, + "loss": 0.7392, + "step": 352 + }, + { + "epoch": 0.4051068713240568, + "grad_norm": 39.75, + "learning_rate": 1.614678899082569e-05, + "loss": 0.6261, + "step": 353 + }, + { + "epoch": 0.4062544828575527, + "grad_norm": 40.0, + "learning_rate": 1.6192660550458715e-05, + "loss": 0.6046, + "step": 354 + }, + { + "epoch": 0.40740209439104863, + "grad_norm": 106.0, + "learning_rate": 1.6238532110091743e-05, + "loss": 0.9682, + "step": 355 + }, + { + "epoch": 0.40854970592454454, + "grad_norm": 51.25, + "learning_rate": 1.628440366972477e-05, + "loss": 0.7811, + "step": 356 + }, + { + "epoch": 0.40969731745804044, + "grad_norm": 38.0, + "learning_rate": 1.63302752293578e-05, + "loss": 0.8129, + "step": 357 + }, + { + "epoch": 0.41084492899153635, + "grad_norm": 25.375, + "learning_rate": 1.6376146788990827e-05, + "loss": 0.5273, + "step": 358 + }, + { + "epoch": 0.41199254052503226, + "grad_norm": 34.0, + "learning_rate": 1.6422018348623852e-05, + "loss": 0.5413, + "step": 359 + }, + { + "epoch": 0.41314015205852817, + "grad_norm": 19.25, + "learning_rate": 1.6467889908256884e-05, + "loss": 0.5278, + "step": 360 + }, + { + "epoch": 0.4142877635920241, + "grad_norm": 24.0, + "learning_rate": 1.6513761467889912e-05, + "loss": 0.6991, + "step": 361 + }, + { + "epoch": 0.41543537512552003, + "grad_norm": 180.0, + "learning_rate": 1.6559633027522936e-05, + "loss": 0.7462, + "step": 362 + }, + { + "epoch": 0.41658298665901594, + "grad_norm": 27.125, + "learning_rate": 1.6605504587155964e-05, + "loss": 0.7057, + "step": 363 + }, + { + "epoch": 0.41773059819251185, + "grad_norm": 75.0, + "learning_rate": 1.6651376146788993e-05, + "loss": 0.8487, + "step": 364 + }, + { + "epoch": 0.41887820972600776, + "grad_norm": 52.5, + "learning_rate": 1.669724770642202e-05, + "loss": 0.83, + "step": 365 + }, + { + "epoch": 0.42002582125950366, + "grad_norm": 18.5, + "learning_rate": 1.674311926605505e-05, + "loss": 0.4915, + "step": 366 + }, + { + "epoch": 0.42117343279299957, + "grad_norm": 76.0, + "learning_rate": 1.6788990825688073e-05, + "loss": 0.6684, + "step": 367 + }, + { + "epoch": 0.4223210443264955, + "grad_norm": 21.625, + "learning_rate": 1.68348623853211e-05, + "loss": 0.5262, + "step": 368 + }, + { + "epoch": 0.4234686558599914, + "grad_norm": 13.375, + "learning_rate": 1.688073394495413e-05, + "loss": 0.5778, + "step": 369 + }, + { + "epoch": 0.4246162673934873, + "grad_norm": 20.375, + "learning_rate": 1.6926605504587158e-05, + "loss": 0.5191, + "step": 370 + }, + { + "epoch": 0.4257638789269832, + "grad_norm": 91.5, + "learning_rate": 1.6972477064220186e-05, + "loss": 1.0079, + "step": 371 + }, + { + "epoch": 0.4269114904604791, + "grad_norm": 55.25, + "learning_rate": 1.701834862385321e-05, + "loss": 0.6378, + "step": 372 + }, + { + "epoch": 0.42805910199397507, + "grad_norm": 39.5, + "learning_rate": 1.706422018348624e-05, + "loss": 0.836, + "step": 373 + }, + { + "epoch": 0.429206713527471, + "grad_norm": 42.75, + "learning_rate": 1.7110091743119267e-05, + "loss": 0.5683, + "step": 374 + }, + { + "epoch": 0.4303543250609669, + "grad_norm": 60.25, + "learning_rate": 1.7155963302752295e-05, + "loss": 0.4543, + "step": 375 + }, + { + "epoch": 0.4315019365944628, + "grad_norm": 21.375, + "learning_rate": 1.7201834862385323e-05, + "loss": 0.5242, + "step": 376 + }, + { + "epoch": 0.4326495481279587, + "grad_norm": 17.625, + "learning_rate": 1.724770642201835e-05, + "loss": 0.6393, + "step": 377 + }, + { + "epoch": 0.4337971596614546, + "grad_norm": 21.875, + "learning_rate": 1.7293577981651376e-05, + "loss": 0.5476, + "step": 378 + }, + { + "epoch": 0.4349447711949505, + "grad_norm": 56.25, + "learning_rate": 1.7339449541284407e-05, + "loss": 0.7973, + "step": 379 + }, + { + "epoch": 0.4360923827284464, + "grad_norm": 80.0, + "learning_rate": 1.738532110091743e-05, + "loss": 0.8487, + "step": 380 + }, + { + "epoch": 0.4372399942619423, + "grad_norm": 46.5, + "learning_rate": 1.743119266055046e-05, + "loss": 0.8605, + "step": 381 + }, + { + "epoch": 0.43838760579543823, + "grad_norm": 65.0, + "learning_rate": 1.7477064220183488e-05, + "loss": 0.8858, + "step": 382 + }, + { + "epoch": 0.43953521732893414, + "grad_norm": 87.0, + "learning_rate": 1.7522935779816516e-05, + "loss": 0.7342, + "step": 383 + }, + { + "epoch": 0.44068282886243004, + "grad_norm": 108.5, + "learning_rate": 1.7568807339449544e-05, + "loss": 0.8372, + "step": 384 + }, + { + "epoch": 0.441830440395926, + "grad_norm": 38.5, + "learning_rate": 1.7614678899082572e-05, + "loss": 1.0963, + "step": 385 + }, + { + "epoch": 0.4429780519294219, + "grad_norm": 21.875, + "learning_rate": 1.7660550458715597e-05, + "loss": 0.737, + "step": 386 + }, + { + "epoch": 0.4441256634629178, + "grad_norm": 34.25, + "learning_rate": 1.7706422018348625e-05, + "loss": 0.7902, + "step": 387 + }, + { + "epoch": 0.4452732749964137, + "grad_norm": 116.0, + "learning_rate": 1.7752293577981653e-05, + "loss": 0.875, + "step": 388 + }, + { + "epoch": 0.44642088652990963, + "grad_norm": 66.5, + "learning_rate": 1.779816513761468e-05, + "loss": 0.9535, + "step": 389 + }, + { + "epoch": 0.44756849806340554, + "grad_norm": 46.25, + "learning_rate": 1.784403669724771e-05, + "loss": 0.7879, + "step": 390 + }, + { + "epoch": 0.44871610959690145, + "grad_norm": 48.75, + "learning_rate": 1.7889908256880734e-05, + "loss": 0.6081, + "step": 391 + }, + { + "epoch": 0.44986372113039735, + "grad_norm": 32.5, + "learning_rate": 1.7935779816513762e-05, + "loss": 0.6908, + "step": 392 + }, + { + "epoch": 0.45101133266389326, + "grad_norm": 34.0, + "learning_rate": 1.798165137614679e-05, + "loss": 0.6664, + "step": 393 + }, + { + "epoch": 0.45215894419738917, + "grad_norm": 74.5, + "learning_rate": 1.8027522935779818e-05, + "loss": 0.6012, + "step": 394 + }, + { + "epoch": 0.4533065557308851, + "grad_norm": 33.25, + "learning_rate": 1.8073394495412846e-05, + "loss": 0.6278, + "step": 395 + }, + { + "epoch": 0.454454167264381, + "grad_norm": 19.25, + "learning_rate": 1.811926605504587e-05, + "loss": 0.6279, + "step": 396 + }, + { + "epoch": 0.45560177879787694, + "grad_norm": 33.5, + "learning_rate": 1.81651376146789e-05, + "loss": 0.689, + "step": 397 + }, + { + "epoch": 0.45674939033137285, + "grad_norm": 34.25, + "learning_rate": 1.821100917431193e-05, + "loss": 0.4764, + "step": 398 + }, + { + "epoch": 0.45789700186486876, + "grad_norm": 144.0, + "learning_rate": 1.8256880733944955e-05, + "loss": 1.3598, + "step": 399 + }, + { + "epoch": 0.45904461339836466, + "grad_norm": 105.5, + "learning_rate": 1.8302752293577983e-05, + "loss": 0.9441, + "step": 400 + }, + { + "epoch": 0.45904461339836466, + "eval_accuracy": 0.37, + "eval_loss": 0.7958357334136963, + "eval_runtime": 49.9294, + "eval_samples_per_second": 2.003, + "eval_steps_per_second": 2.003, + "step": 400 + }, + { + "epoch": 0.46019222493186057, + "grad_norm": 125.5, + "learning_rate": 1.834862385321101e-05, + "loss": 1.1444, + "step": 401 + }, + { + "epoch": 0.4613398364653565, + "grad_norm": 67.0, + "learning_rate": 1.839449541284404e-05, + "loss": 0.694, + "step": 402 + }, + { + "epoch": 0.4624874479988524, + "grad_norm": 48.75, + "learning_rate": 1.8440366972477067e-05, + "loss": 0.7125, + "step": 403 + }, + { + "epoch": 0.4636350595323483, + "grad_norm": 60.5, + "learning_rate": 1.8486238532110092e-05, + "loss": 0.6703, + "step": 404 + }, + { + "epoch": 0.4647826710658442, + "grad_norm": 51.0, + "learning_rate": 1.853211009174312e-05, + "loss": 0.6631, + "step": 405 + }, + { + "epoch": 0.4659302825993401, + "grad_norm": 51.75, + "learning_rate": 1.8577981651376148e-05, + "loss": 0.7814, + "step": 406 + }, + { + "epoch": 0.467077894132836, + "grad_norm": 22.875, + "learning_rate": 1.8623853211009176e-05, + "loss": 0.5642, + "step": 407 + }, + { + "epoch": 0.4682255056663319, + "grad_norm": 105.0, + "learning_rate": 1.8669724770642204e-05, + "loss": 0.884, + "step": 408 + }, + { + "epoch": 0.4693731171998279, + "grad_norm": 48.5, + "learning_rate": 1.8715596330275232e-05, + "loss": 0.5543, + "step": 409 + }, + { + "epoch": 0.4705207287333238, + "grad_norm": 26.875, + "learning_rate": 1.8761467889908257e-05, + "loss": 0.6461, + "step": 410 + }, + { + "epoch": 0.4716683402668197, + "grad_norm": 23.125, + "learning_rate": 1.8807339449541285e-05, + "loss": 0.5786, + "step": 411 + }, + { + "epoch": 0.4728159518003156, + "grad_norm": 37.75, + "learning_rate": 1.8853211009174313e-05, + "loss": 0.6921, + "step": 412 + }, + { + "epoch": 0.4739635633338115, + "grad_norm": 23.625, + "learning_rate": 1.889908256880734e-05, + "loss": 0.4189, + "step": 413 + }, + { + "epoch": 0.4751111748673074, + "grad_norm": 108.5, + "learning_rate": 1.894495412844037e-05, + "loss": 1.0126, + "step": 414 + }, + { + "epoch": 0.4762587864008033, + "grad_norm": 138.0, + "learning_rate": 1.8990825688073394e-05, + "loss": 1.2399, + "step": 415 + }, + { + "epoch": 0.47740639793429923, + "grad_norm": 31.375, + "learning_rate": 1.9036697247706422e-05, + "loss": 0.4347, + "step": 416 + }, + { + "epoch": 0.47855400946779514, + "grad_norm": 120.5, + "learning_rate": 1.9082568807339454e-05, + "loss": 1.1874, + "step": 417 + }, + { + "epoch": 0.47970162100129105, + "grad_norm": 25.5, + "learning_rate": 1.912844036697248e-05, + "loss": 0.6172, + "step": 418 + }, + { + "epoch": 0.48084923253478695, + "grad_norm": 29.0, + "learning_rate": 1.9174311926605506e-05, + "loss": 0.7072, + "step": 419 + }, + { + "epoch": 0.48199684406828286, + "grad_norm": 40.75, + "learning_rate": 1.9220183486238534e-05, + "loss": 0.8408, + "step": 420 + }, + { + "epoch": 0.4831444556017788, + "grad_norm": 17.5, + "learning_rate": 1.9266055045871563e-05, + "loss": 0.6384, + "step": 421 + }, + { + "epoch": 0.48429206713527473, + "grad_norm": 26.375, + "learning_rate": 1.931192660550459e-05, + "loss": 0.7132, + "step": 422 + }, + { + "epoch": 0.48543967866877064, + "grad_norm": 41.0, + "learning_rate": 1.9357798165137615e-05, + "loss": 0.6823, + "step": 423 + }, + { + "epoch": 0.48658729020226654, + "grad_norm": 36.0, + "learning_rate": 1.9403669724770643e-05, + "loss": 0.5629, + "step": 424 + }, + { + "epoch": 0.48773490173576245, + "grad_norm": 93.5, + "learning_rate": 1.944954128440367e-05, + "loss": 1.028, + "step": 425 + }, + { + "epoch": 0.48888251326925836, + "grad_norm": 70.5, + "learning_rate": 1.94954128440367e-05, + "loss": 0.7085, + "step": 426 + }, + { + "epoch": 0.49003012480275426, + "grad_norm": 17.875, + "learning_rate": 1.9541284403669728e-05, + "loss": 0.5299, + "step": 427 + }, + { + "epoch": 0.49117773633625017, + "grad_norm": 58.75, + "learning_rate": 1.9587155963302752e-05, + "loss": 0.9028, + "step": 428 + }, + { + "epoch": 0.4923253478697461, + "grad_norm": 45.25, + "learning_rate": 1.963302752293578e-05, + "loss": 0.8021, + "step": 429 + }, + { + "epoch": 0.493472959403242, + "grad_norm": 69.0, + "learning_rate": 1.967889908256881e-05, + "loss": 0.696, + "step": 430 + }, + { + "epoch": 0.4946205709367379, + "grad_norm": 44.0, + "learning_rate": 1.9724770642201837e-05, + "loss": 0.5913, + "step": 431 + }, + { + "epoch": 0.49576818247023385, + "grad_norm": 71.5, + "learning_rate": 1.9770642201834865e-05, + "loss": 0.8661, + "step": 432 + }, + { + "epoch": 0.49691579400372976, + "grad_norm": 80.5, + "learning_rate": 1.9816513761467893e-05, + "loss": 1.109, + "step": 433 + }, + { + "epoch": 0.49806340553722567, + "grad_norm": 86.0, + "learning_rate": 1.9862385321100917e-05, + "loss": 1.0316, + "step": 434 + }, + { + "epoch": 0.4992110170707216, + "grad_norm": 30.375, + "learning_rate": 1.9908256880733945e-05, + "loss": 0.7336, + "step": 435 + }, + { + "epoch": 0.5003586286042174, + "grad_norm": 42.75, + "learning_rate": 1.9954128440366974e-05, + "loss": 0.7081, + "step": 436 + }, + { + "epoch": 0.5015062401377134, + "grad_norm": 20.75, + "learning_rate": 2e-05, + "loss": 0.5407, + "step": 437 + }, + { + "epoch": 0.5026538516712094, + "grad_norm": 117.0, + "learning_rate": 1.999490316004078e-05, + "loss": 1.1049, + "step": 438 + }, + { + "epoch": 0.5038014632047052, + "grad_norm": 126.5, + "learning_rate": 1.998980632008155e-05, + "loss": 1.1828, + "step": 439 + }, + { + "epoch": 0.5049490747382012, + "grad_norm": 120.5, + "learning_rate": 1.9984709480122327e-05, + "loss": 1.3274, + "step": 440 + }, + { + "epoch": 0.506096686271697, + "grad_norm": 80.0, + "learning_rate": 1.99796126401631e-05, + "loss": 0.9327, + "step": 441 + }, + { + "epoch": 0.507244297805193, + "grad_norm": 105.0, + "learning_rate": 1.9974515800203875e-05, + "loss": 1.1003, + "step": 442 + }, + { + "epoch": 0.5083919093386888, + "grad_norm": 106.5, + "learning_rate": 1.9969418960244652e-05, + "loss": 1.0261, + "step": 443 + }, + { + "epoch": 0.5095395208721848, + "grad_norm": 36.5, + "learning_rate": 1.9964322120285426e-05, + "loss": 0.695, + "step": 444 + }, + { + "epoch": 0.5106871324056806, + "grad_norm": 15.8125, + "learning_rate": 1.99592252803262e-05, + "loss": 0.6983, + "step": 445 + }, + { + "epoch": 0.5118347439391766, + "grad_norm": 53.5, + "learning_rate": 1.9954128440366974e-05, + "loss": 0.7186, + "step": 446 + }, + { + "epoch": 0.5129823554726725, + "grad_norm": 12.25, + "learning_rate": 1.9949031600407747e-05, + "loss": 0.6272, + "step": 447 + }, + { + "epoch": 0.5141299670061684, + "grad_norm": 16.125, + "learning_rate": 1.9943934760448525e-05, + "loss": 0.5849, + "step": 448 + }, + { + "epoch": 0.5152775785396643, + "grad_norm": 36.25, + "learning_rate": 1.99388379204893e-05, + "loss": 0.5905, + "step": 449 + }, + { + "epoch": 0.5164251900731602, + "grad_norm": 24.25, + "learning_rate": 1.9933741080530073e-05, + "loss": 0.4751, + "step": 450 + }, + { + "epoch": 0.5175728016066562, + "grad_norm": 15.9375, + "learning_rate": 1.9928644240570846e-05, + "loss": 0.4372, + "step": 451 + }, + { + "epoch": 0.518720413140152, + "grad_norm": 18.375, + "learning_rate": 1.9923547400611624e-05, + "loss": 0.6552, + "step": 452 + }, + { + "epoch": 0.519868024673648, + "grad_norm": 13.6875, + "learning_rate": 1.9918450560652398e-05, + "loss": 0.6515, + "step": 453 + }, + { + "epoch": 0.5210156362071439, + "grad_norm": 13.6875, + "learning_rate": 1.991335372069317e-05, + "loss": 0.5219, + "step": 454 + }, + { + "epoch": 0.5221632477406398, + "grad_norm": 70.0, + "learning_rate": 1.9908256880733945e-05, + "loss": 0.694, + "step": 455 + }, + { + "epoch": 0.5233108592741357, + "grad_norm": 47.75, + "learning_rate": 1.990316004077472e-05, + "loss": 1.0051, + "step": 456 + }, + { + "epoch": 0.5244584708076316, + "grad_norm": 98.5, + "learning_rate": 1.9898063200815497e-05, + "loss": 0.8809, + "step": 457 + }, + { + "epoch": 0.5256060823411275, + "grad_norm": 22.875, + "learning_rate": 1.989296636085627e-05, + "loss": 0.6882, + "step": 458 + }, + { + "epoch": 0.5267536938746235, + "grad_norm": 103.0, + "learning_rate": 1.9887869520897044e-05, + "loss": 0.8227, + "step": 459 + }, + { + "epoch": 0.5279013054081193, + "grad_norm": 41.5, + "learning_rate": 1.9882772680937822e-05, + "loss": 0.5851, + "step": 460 + }, + { + "epoch": 0.5290489169416153, + "grad_norm": 16.125, + "learning_rate": 1.9877675840978596e-05, + "loss": 0.6286, + "step": 461 + }, + { + "epoch": 0.5301965284751112, + "grad_norm": 40.0, + "learning_rate": 1.987257900101937e-05, + "loss": 0.3909, + "step": 462 + }, + { + "epoch": 0.5313441400086071, + "grad_norm": 180.0, + "learning_rate": 1.9867482161060147e-05, + "loss": 1.4089, + "step": 463 + }, + { + "epoch": 0.532491751542103, + "grad_norm": 67.5, + "learning_rate": 1.9862385321100917e-05, + "loss": 0.7342, + "step": 464 + }, + { + "epoch": 0.5336393630755989, + "grad_norm": 146.0, + "learning_rate": 1.9857288481141695e-05, + "loss": 1.1024, + "step": 465 + }, + { + "epoch": 0.5347869746090949, + "grad_norm": 112.0, + "learning_rate": 1.985219164118247e-05, + "loss": 0.7631, + "step": 466 + }, + { + "epoch": 0.5359345861425907, + "grad_norm": 51.25, + "learning_rate": 1.9847094801223243e-05, + "loss": 0.8592, + "step": 467 + }, + { + "epoch": 0.5370821976760867, + "grad_norm": 53.5, + "learning_rate": 1.984199796126402e-05, + "loss": 0.772, + "step": 468 + }, + { + "epoch": 0.5382298092095825, + "grad_norm": 168.0, + "learning_rate": 1.9836901121304794e-05, + "loss": 1.3238, + "step": 469 + }, + { + "epoch": 0.5393774207430785, + "grad_norm": 56.25, + "learning_rate": 1.9831804281345568e-05, + "loss": 0.686, + "step": 470 + }, + { + "epoch": 0.5405250322765743, + "grad_norm": 140.0, + "learning_rate": 1.982670744138634e-05, + "loss": 1.3487, + "step": 471 + }, + { + "epoch": 0.5416726438100703, + "grad_norm": 26.25, + "learning_rate": 1.9821610601427115e-05, + "loss": 0.654, + "step": 472 + }, + { + "epoch": 0.5428202553435661, + "grad_norm": 96.0, + "learning_rate": 1.9816513761467893e-05, + "loss": 0.8932, + "step": 473 + }, + { + "epoch": 0.5439678668770621, + "grad_norm": 118.5, + "learning_rate": 1.9811416921508667e-05, + "loss": 0.9886, + "step": 474 + }, + { + "epoch": 0.5451154784105581, + "grad_norm": 34.75, + "learning_rate": 1.980632008154944e-05, + "loss": 0.5023, + "step": 475 + }, + { + "epoch": 0.5462630899440539, + "grad_norm": 28.125, + "learning_rate": 1.9801223241590214e-05, + "loss": 0.4678, + "step": 476 + }, + { + "epoch": 0.5474107014775499, + "grad_norm": 27.875, + "learning_rate": 1.9796126401630992e-05, + "loss": 0.5802, + "step": 477 + }, + { + "epoch": 0.5485583130110457, + "grad_norm": 9.5625, + "learning_rate": 1.9791029561671766e-05, + "loss": 0.3969, + "step": 478 + }, + { + "epoch": 0.5497059245445417, + "grad_norm": 30.875, + "learning_rate": 1.978593272171254e-05, + "loss": 0.4119, + "step": 479 + }, + { + "epoch": 0.5508535360780376, + "grad_norm": 17.25, + "learning_rate": 1.9780835881753317e-05, + "loss": 0.5645, + "step": 480 + }, + { + "epoch": 0.5520011476115335, + "grad_norm": 47.75, + "learning_rate": 1.9775739041794087e-05, + "loss": 0.6168, + "step": 481 + }, + { + "epoch": 0.5531487591450294, + "grad_norm": 14.1875, + "learning_rate": 1.9770642201834865e-05, + "loss": 0.5325, + "step": 482 + }, + { + "epoch": 0.5542963706785253, + "grad_norm": 41.25, + "learning_rate": 1.976554536187564e-05, + "loss": 0.5013, + "step": 483 + }, + { + "epoch": 0.5554439822120212, + "grad_norm": 18.75, + "learning_rate": 1.9760448521916412e-05, + "loss": 0.4441, + "step": 484 + }, + { + "epoch": 0.5565915937455171, + "grad_norm": 147.0, + "learning_rate": 1.975535168195719e-05, + "loss": 1.585, + "step": 485 + }, + { + "epoch": 0.5577392052790131, + "grad_norm": 64.0, + "learning_rate": 1.9750254841997964e-05, + "loss": 0.958, + "step": 486 + }, + { + "epoch": 0.558886816812509, + "grad_norm": 135.0, + "learning_rate": 1.9745158002038738e-05, + "loss": 1.4838, + "step": 487 + }, + { + "epoch": 0.5600344283460049, + "grad_norm": 141.0, + "learning_rate": 1.974006116207951e-05, + "loss": 1.6651, + "step": 488 + }, + { + "epoch": 0.5611820398795008, + "grad_norm": 108.0, + "learning_rate": 1.9734964322120285e-05, + "loss": 0.9729, + "step": 489 + }, + { + "epoch": 0.5623296514129967, + "grad_norm": 35.25, + "learning_rate": 1.9729867482161063e-05, + "loss": 0.5966, + "step": 490 + }, + { + "epoch": 0.5634772629464926, + "grad_norm": 34.5, + "learning_rate": 1.9724770642201837e-05, + "loss": 0.7337, + "step": 491 + }, + { + "epoch": 0.5646248744799885, + "grad_norm": 19.875, + "learning_rate": 1.971967380224261e-05, + "loss": 0.4022, + "step": 492 + }, + { + "epoch": 0.5657724860134844, + "grad_norm": 29.375, + "learning_rate": 1.9714576962283384e-05, + "loss": 0.618, + "step": 493 + }, + { + "epoch": 0.5669200975469804, + "grad_norm": 75.5, + "learning_rate": 1.970948012232416e-05, + "loss": 0.6627, + "step": 494 + }, + { + "epoch": 0.5680677090804762, + "grad_norm": 104.5, + "learning_rate": 1.9704383282364936e-05, + "loss": 0.9524, + "step": 495 + }, + { + "epoch": 0.5692153206139722, + "grad_norm": 91.0, + "learning_rate": 1.969928644240571e-05, + "loss": 0.7282, + "step": 496 + }, + { + "epoch": 0.570362932147468, + "grad_norm": 95.5, + "learning_rate": 1.9694189602446487e-05, + "loss": 0.9184, + "step": 497 + }, + { + "epoch": 0.571510543680964, + "grad_norm": 23.625, + "learning_rate": 1.9689092762487257e-05, + "loss": 0.6252, + "step": 498 + }, + { + "epoch": 0.57265815521446, + "grad_norm": 55.25, + "learning_rate": 1.9683995922528035e-05, + "loss": 0.77, + "step": 499 + }, + { + "epoch": 0.5738057667479558, + "grad_norm": 49.75, + "learning_rate": 1.967889908256881e-05, + "loss": 0.5024, + "step": 500 + }, + { + "epoch": 0.5738057667479558, + "eval_accuracy": 0.56, + "eval_loss": 0.5818310379981995, + "eval_runtime": 49.317, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 2.028, + "step": 500 + }, + { + "epoch": 0.5749533782814518, + "grad_norm": 72.0, + "learning_rate": 1.9673802242609582e-05, + "loss": 0.5565, + "step": 501 + }, + { + "epoch": 0.5761009898149476, + "grad_norm": 17.0, + "learning_rate": 1.966870540265036e-05, + "loss": 0.5465, + "step": 502 + }, + { + "epoch": 0.5772486013484436, + "grad_norm": 16.5, + "learning_rate": 1.9663608562691134e-05, + "loss": 0.7208, + "step": 503 + }, + { + "epoch": 0.5783962128819394, + "grad_norm": 13.75, + "learning_rate": 1.9658511722731907e-05, + "loss": 0.5784, + "step": 504 + }, + { + "epoch": 0.5795438244154354, + "grad_norm": 17.875, + "learning_rate": 1.9653414882772685e-05, + "loss": 0.7433, + "step": 505 + }, + { + "epoch": 0.5806914359489312, + "grad_norm": 55.5, + "learning_rate": 1.9648318042813455e-05, + "loss": 0.5593, + "step": 506 + }, + { + "epoch": 0.5818390474824272, + "grad_norm": 26.0, + "learning_rate": 1.9643221202854233e-05, + "loss": 0.4981, + "step": 507 + }, + { + "epoch": 0.5829866590159231, + "grad_norm": 45.5, + "learning_rate": 1.9638124362895006e-05, + "loss": 0.6998, + "step": 508 + }, + { + "epoch": 0.584134270549419, + "grad_norm": 64.5, + "learning_rate": 1.963302752293578e-05, + "loss": 0.331, + "step": 509 + }, + { + "epoch": 0.585281882082915, + "grad_norm": 15.75, + "learning_rate": 1.9627930682976558e-05, + "loss": 0.5757, + "step": 510 + }, + { + "epoch": 0.5864294936164108, + "grad_norm": 78.0, + "learning_rate": 1.962283384301733e-05, + "loss": 0.5458, + "step": 511 + }, + { + "epoch": 0.5875771051499068, + "grad_norm": 12.6875, + "learning_rate": 1.9617737003058106e-05, + "loss": 0.4577, + "step": 512 + }, + { + "epoch": 0.5887247166834026, + "grad_norm": 94.5, + "learning_rate": 1.961264016309888e-05, + "loss": 1.0295, + "step": 513 + }, + { + "epoch": 0.5898723282168986, + "grad_norm": 62.75, + "learning_rate": 1.9607543323139657e-05, + "loss": 0.6586, + "step": 514 + }, + { + "epoch": 0.5910199397503945, + "grad_norm": 12.8125, + "learning_rate": 1.960244648318043e-05, + "loss": 0.4499, + "step": 515 + }, + { + "epoch": 0.5921675512838904, + "grad_norm": 41.25, + "learning_rate": 1.9597349643221205e-05, + "loss": 0.6115, + "step": 516 + }, + { + "epoch": 0.5933151628173863, + "grad_norm": 33.5, + "learning_rate": 1.959225280326198e-05, + "loss": 0.6823, + "step": 517 + }, + { + "epoch": 0.5944627743508822, + "grad_norm": 67.5, + "learning_rate": 1.9587155963302752e-05, + "loss": 0.7254, + "step": 518 + }, + { + "epoch": 0.5956103858843781, + "grad_norm": 21.75, + "learning_rate": 1.958205912334353e-05, + "loss": 0.6258, + "step": 519 + }, + { + "epoch": 0.596757997417874, + "grad_norm": 20.125, + "learning_rate": 1.9576962283384304e-05, + "loss": 0.782, + "step": 520 + }, + { + "epoch": 0.59790560895137, + "grad_norm": 55.0, + "learning_rate": 1.9571865443425077e-05, + "loss": 0.6427, + "step": 521 + }, + { + "epoch": 0.5990532204848659, + "grad_norm": 21.375, + "learning_rate": 1.9566768603465855e-05, + "loss": 0.5042, + "step": 522 + }, + { + "epoch": 0.6002008320183618, + "grad_norm": 84.0, + "learning_rate": 1.9561671763506625e-05, + "loss": 0.7413, + "step": 523 + }, + { + "epoch": 0.6013484435518577, + "grad_norm": 32.25, + "learning_rate": 1.9556574923547403e-05, + "loss": 0.4809, + "step": 524 + }, + { + "epoch": 0.6024960550853536, + "grad_norm": 29.25, + "learning_rate": 1.9551478083588176e-05, + "loss": 0.5245, + "step": 525 + }, + { + "epoch": 0.6036436666188495, + "grad_norm": 115.0, + "learning_rate": 1.954638124362895e-05, + "loss": 1.0439, + "step": 526 + }, + { + "epoch": 0.6047912781523455, + "grad_norm": 155.0, + "learning_rate": 1.9541284403669728e-05, + "loss": 1.6477, + "step": 527 + }, + { + "epoch": 0.6059388896858413, + "grad_norm": 39.25, + "learning_rate": 1.95361875637105e-05, + "loss": 0.555, + "step": 528 + }, + { + "epoch": 0.6070865012193373, + "grad_norm": 23.0, + "learning_rate": 1.9531090723751275e-05, + "loss": 0.478, + "step": 529 + }, + { + "epoch": 0.6082341127528331, + "grad_norm": 43.25, + "learning_rate": 1.9525993883792053e-05, + "loss": 0.6068, + "step": 530 + }, + { + "epoch": 0.6093817242863291, + "grad_norm": 44.5, + "learning_rate": 1.9520897043832823e-05, + "loss": 0.248, + "step": 531 + }, + { + "epoch": 0.6105293358198249, + "grad_norm": 20.75, + "learning_rate": 1.95158002038736e-05, + "loss": 0.4505, + "step": 532 + }, + { + "epoch": 0.6116769473533209, + "grad_norm": 41.5, + "learning_rate": 1.9510703363914374e-05, + "loss": 0.3902, + "step": 533 + }, + { + "epoch": 0.6128245588868169, + "grad_norm": 38.75, + "learning_rate": 1.950560652395515e-05, + "loss": 0.5029, + "step": 534 + }, + { + "epoch": 0.6139721704203127, + "grad_norm": 131.0, + "learning_rate": 1.9500509683995926e-05, + "loss": 1.2225, + "step": 535 + }, + { + "epoch": 0.6151197819538087, + "grad_norm": 85.5, + "learning_rate": 1.94954128440367e-05, + "loss": 0.8337, + "step": 536 + }, + { + "epoch": 0.6162673934873045, + "grad_norm": 43.25, + "learning_rate": 1.9490316004077473e-05, + "loss": 0.5878, + "step": 537 + }, + { + "epoch": 0.6174150050208005, + "grad_norm": 12.875, + "learning_rate": 1.9485219164118247e-05, + "loss": 0.4961, + "step": 538 + }, + { + "epoch": 0.6185626165542963, + "grad_norm": 77.0, + "learning_rate": 1.9480122324159025e-05, + "loss": 0.9027, + "step": 539 + }, + { + "epoch": 0.6197102280877923, + "grad_norm": 46.5, + "learning_rate": 1.94750254841998e-05, + "loss": 0.7113, + "step": 540 + }, + { + "epoch": 0.6208578396212882, + "grad_norm": 77.5, + "learning_rate": 1.9469928644240572e-05, + "loss": 0.7001, + "step": 541 + }, + { + "epoch": 0.6220054511547841, + "grad_norm": 68.5, + "learning_rate": 1.9464831804281346e-05, + "loss": 0.6916, + "step": 542 + }, + { + "epoch": 0.62315306268828, + "grad_norm": 77.0, + "learning_rate": 1.945973496432212e-05, + "loss": 0.7548, + "step": 543 + }, + { + "epoch": 0.6243006742217759, + "grad_norm": 85.0, + "learning_rate": 1.9454638124362898e-05, + "loss": 0.8164, + "step": 544 + }, + { + "epoch": 0.6254482857552719, + "grad_norm": 15.0625, + "learning_rate": 1.944954128440367e-05, + "loss": 0.603, + "step": 545 + }, + { + "epoch": 0.6265958972887677, + "grad_norm": 23.25, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.4311, + "step": 546 + }, + { + "epoch": 0.6277435088222637, + "grad_norm": 10.1875, + "learning_rate": 1.9439347604485223e-05, + "loss": 0.3436, + "step": 547 + }, + { + "epoch": 0.6288911203557596, + "grad_norm": 75.0, + "learning_rate": 1.9434250764525993e-05, + "loss": 0.8814, + "step": 548 + }, + { + "epoch": 0.6300387318892555, + "grad_norm": 62.0, + "learning_rate": 1.942915392456677e-05, + "loss": 0.939, + "step": 549 + }, + { + "epoch": 0.6311863434227514, + "grad_norm": 52.0, + "learning_rate": 1.9424057084607544e-05, + "loss": 0.411, + "step": 550 + }, + { + "epoch": 0.6323339549562473, + "grad_norm": 127.5, + "learning_rate": 1.9418960244648318e-05, + "loss": 1.655, + "step": 551 + }, + { + "epoch": 0.6334815664897432, + "grad_norm": 96.0, + "learning_rate": 1.9413863404689096e-05, + "loss": 1.4065, + "step": 552 + }, + { + "epoch": 0.6346291780232391, + "grad_norm": 52.5, + "learning_rate": 1.940876656472987e-05, + "loss": 0.7391, + "step": 553 + }, + { + "epoch": 0.635776789556735, + "grad_norm": 78.0, + "learning_rate": 1.9403669724770643e-05, + "loss": 0.9576, + "step": 554 + }, + { + "epoch": 0.636924401090231, + "grad_norm": 91.0, + "learning_rate": 1.9398572884811417e-05, + "loss": 1.0132, + "step": 555 + }, + { + "epoch": 0.6380720126237268, + "grad_norm": 14.8125, + "learning_rate": 1.9393476044852195e-05, + "loss": 0.734, + "step": 556 + }, + { + "epoch": 0.6392196241572228, + "grad_norm": 63.75, + "learning_rate": 1.938837920489297e-05, + "loss": 0.6127, + "step": 557 + }, + { + "epoch": 0.6403672356907187, + "grad_norm": 19.625, + "learning_rate": 1.9383282364933742e-05, + "loss": 0.5999, + "step": 558 + }, + { + "epoch": 0.6415148472242146, + "grad_norm": 21.625, + "learning_rate": 1.9378185524974516e-05, + "loss": 0.7446, + "step": 559 + }, + { + "epoch": 0.6426624587577106, + "grad_norm": 26.375, + "learning_rate": 1.937308868501529e-05, + "loss": 0.6067, + "step": 560 + }, + { + "epoch": 0.6438100702912064, + "grad_norm": 130.0, + "learning_rate": 1.9367991845056068e-05, + "loss": 1.0849, + "step": 561 + }, + { + "epoch": 0.6449576818247024, + "grad_norm": 26.875, + "learning_rate": 1.936289500509684e-05, + "loss": 0.4882, + "step": 562 + }, + { + "epoch": 0.6461052933581982, + "grad_norm": 13.375, + "learning_rate": 1.9357798165137615e-05, + "loss": 0.6071, + "step": 563 + }, + { + "epoch": 0.6472529048916942, + "grad_norm": 23.625, + "learning_rate": 1.9352701325178393e-05, + "loss": 0.7541, + "step": 564 + }, + { + "epoch": 0.64840051642519, + "grad_norm": 22.5, + "learning_rate": 1.9347604485219163e-05, + "loss": 0.6343, + "step": 565 + }, + { + "epoch": 0.649548127958686, + "grad_norm": 37.75, + "learning_rate": 1.934250764525994e-05, + "loss": 0.629, + "step": 566 + }, + { + "epoch": 0.6506957394921818, + "grad_norm": 33.25, + "learning_rate": 1.9337410805300714e-05, + "loss": 0.6112, + "step": 567 + }, + { + "epoch": 0.6518433510256778, + "grad_norm": 23.625, + "learning_rate": 1.9332313965341488e-05, + "loss": 0.6854, + "step": 568 + }, + { + "epoch": 0.6529909625591738, + "grad_norm": 25.125, + "learning_rate": 1.9327217125382266e-05, + "loss": 0.5574, + "step": 569 + }, + { + "epoch": 0.6541385740926696, + "grad_norm": 22.125, + "learning_rate": 1.932212028542304e-05, + "loss": 0.4604, + "step": 570 + }, + { + "epoch": 0.6552861856261656, + "grad_norm": 10.625, + "learning_rate": 1.9317023445463813e-05, + "loss": 0.409, + "step": 571 + }, + { + "epoch": 0.6564337971596614, + "grad_norm": 11.375, + "learning_rate": 1.931192660550459e-05, + "loss": 0.4102, + "step": 572 + }, + { + "epoch": 0.6575814086931574, + "grad_norm": 74.5, + "learning_rate": 1.9306829765545365e-05, + "loss": 0.5481, + "step": 573 + }, + { + "epoch": 0.6587290202266533, + "grad_norm": 109.5, + "learning_rate": 1.930173292558614e-05, + "loss": 0.886, + "step": 574 + }, + { + "epoch": 0.6598766317601492, + "grad_norm": 48.75, + "learning_rate": 1.9296636085626912e-05, + "loss": 0.6536, + "step": 575 + }, + { + "epoch": 0.6610242432936451, + "grad_norm": 57.75, + "learning_rate": 1.9291539245667686e-05, + "loss": 0.902, + "step": 576 + }, + { + "epoch": 0.662171854827141, + "grad_norm": 61.5, + "learning_rate": 1.9286442405708464e-05, + "loss": 0.7151, + "step": 577 + }, + { + "epoch": 0.6633194663606369, + "grad_norm": 36.25, + "learning_rate": 1.9281345565749237e-05, + "loss": 0.6232, + "step": 578 + }, + { + "epoch": 0.6644670778941328, + "grad_norm": 11.6875, + "learning_rate": 1.927624872579001e-05, + "loss": 0.3918, + "step": 579 + }, + { + "epoch": 0.6656146894276288, + "grad_norm": 13.9375, + "learning_rate": 1.9271151885830785e-05, + "loss": 0.7175, + "step": 580 + }, + { + "epoch": 0.6667623009611247, + "grad_norm": 17.875, + "learning_rate": 1.9266055045871563e-05, + "loss": 0.7939, + "step": 581 + }, + { + "epoch": 0.6679099124946206, + "grad_norm": 13.9375, + "learning_rate": 1.9260958205912336e-05, + "loss": 0.6663, + "step": 582 + }, + { + "epoch": 0.6690575240281165, + "grad_norm": 17.75, + "learning_rate": 1.925586136595311e-05, + "loss": 0.512, + "step": 583 + }, + { + "epoch": 0.6702051355616124, + "grad_norm": 24.625, + "learning_rate": 1.9250764525993884e-05, + "loss": 0.8056, + "step": 584 + }, + { + "epoch": 0.6713527470951083, + "grad_norm": 46.75, + "learning_rate": 1.9245667686034658e-05, + "loss": 0.6661, + "step": 585 + }, + { + "epoch": 0.6725003586286042, + "grad_norm": 23.5, + "learning_rate": 1.9240570846075435e-05, + "loss": 0.6705, + "step": 586 + }, + { + "epoch": 0.6736479701621001, + "grad_norm": 55.5, + "learning_rate": 1.923547400611621e-05, + "loss": 0.7411, + "step": 587 + }, + { + "epoch": 0.6747955816955961, + "grad_norm": 34.25, + "learning_rate": 1.9230377166156983e-05, + "loss": 0.6056, + "step": 588 + }, + { + "epoch": 0.6759431932290919, + "grad_norm": 107.5, + "learning_rate": 1.922528032619776e-05, + "loss": 0.9975, + "step": 589 + }, + { + "epoch": 0.6770908047625879, + "grad_norm": 14.1875, + "learning_rate": 1.9220183486238534e-05, + "loss": 0.5988, + "step": 590 + }, + { + "epoch": 0.6782384162960837, + "grad_norm": 132.0, + "learning_rate": 1.921508664627931e-05, + "loss": 1.1598, + "step": 591 + }, + { + "epoch": 0.6793860278295797, + "grad_norm": 15.6875, + "learning_rate": 1.9209989806320086e-05, + "loss": 0.4172, + "step": 592 + }, + { + "epoch": 0.6805336393630756, + "grad_norm": 38.25, + "learning_rate": 1.9204892966360856e-05, + "loss": 0.5714, + "step": 593 + }, + { + "epoch": 0.6816812508965715, + "grad_norm": 14.0, + "learning_rate": 1.9199796126401633e-05, + "loss": 0.4887, + "step": 594 + }, + { + "epoch": 0.6828288624300675, + "grad_norm": 41.5, + "learning_rate": 1.9194699286442407e-05, + "loss": 0.6648, + "step": 595 + }, + { + "epoch": 0.6839764739635633, + "grad_norm": 13.875, + "learning_rate": 1.918960244648318e-05, + "loss": 0.6589, + "step": 596 + }, + { + "epoch": 0.6851240854970593, + "grad_norm": 35.0, + "learning_rate": 1.918450560652396e-05, + "loss": 1.0174, + "step": 597 + }, + { + "epoch": 0.6862716970305551, + "grad_norm": 27.875, + "learning_rate": 1.9179408766564732e-05, + "loss": 0.7711, + "step": 598 + }, + { + "epoch": 0.6874193085640511, + "grad_norm": 48.25, + "learning_rate": 1.9174311926605506e-05, + "loss": 0.4402, + "step": 599 + }, + { + "epoch": 0.6885669200975469, + "grad_norm": 49.0, + "learning_rate": 1.916921508664628e-05, + "loss": 0.4483, + "step": 600 + }, + { + "epoch": 0.6885669200975469, + "eval_accuracy": 0.63, + "eval_loss": 0.6301568150520325, + "eval_runtime": 49.333, + "eval_samples_per_second": 2.027, + "eval_steps_per_second": 2.027, + "step": 600 + }, + { + "epoch": 0.6897145316310429, + "grad_norm": 30.25, + "learning_rate": 1.9164118246687054e-05, + "loss": 0.4694, + "step": 601 + }, + { + "epoch": 0.6908621431645388, + "grad_norm": 29.875, + "learning_rate": 1.915902140672783e-05, + "loss": 0.374, + "step": 602 + }, + { + "epoch": 0.6920097546980347, + "grad_norm": 70.5, + "learning_rate": 1.9153924566768605e-05, + "loss": 0.941, + "step": 603 + }, + { + "epoch": 0.6931573662315307, + "grad_norm": 87.0, + "learning_rate": 1.914882772680938e-05, + "loss": 0.906, + "step": 604 + }, + { + "epoch": 0.6943049777650265, + "grad_norm": 14.0625, + "learning_rate": 1.9143730886850153e-05, + "loss": 0.406, + "step": 605 + }, + { + "epoch": 0.6954525892985225, + "grad_norm": 55.25, + "learning_rate": 1.913863404689093e-05, + "loss": 0.779, + "step": 606 + }, + { + "epoch": 0.6966002008320183, + "grad_norm": 36.5, + "learning_rate": 1.9133537206931704e-05, + "loss": 0.5805, + "step": 607 + }, + { + "epoch": 0.6977478123655143, + "grad_norm": 23.75, + "learning_rate": 1.912844036697248e-05, + "loss": 0.6406, + "step": 608 + }, + { + "epoch": 0.6988954238990102, + "grad_norm": 39.25, + "learning_rate": 1.9123343527013256e-05, + "loss": 0.4615, + "step": 609 + }, + { + "epoch": 0.7000430354325061, + "grad_norm": 67.5, + "learning_rate": 1.9118246687054026e-05, + "loss": 0.8637, + "step": 610 + }, + { + "epoch": 0.701190646966002, + "grad_norm": 22.5, + "learning_rate": 1.9113149847094803e-05, + "loss": 0.4727, + "step": 611 + }, + { + "epoch": 0.7023382584994979, + "grad_norm": 27.125, + "learning_rate": 1.9108053007135577e-05, + "loss": 0.6226, + "step": 612 + }, + { + "epoch": 0.7034858700329938, + "grad_norm": 62.5, + "learning_rate": 1.910295616717635e-05, + "loss": 0.6596, + "step": 613 + }, + { + "epoch": 0.7046334815664897, + "grad_norm": 40.0, + "learning_rate": 1.909785932721713e-05, + "loss": 0.5424, + "step": 614 + }, + { + "epoch": 0.7057810930999856, + "grad_norm": 62.5, + "learning_rate": 1.9092762487257902e-05, + "loss": 0.6348, + "step": 615 + }, + { + "epoch": 0.7069287046334816, + "grad_norm": 80.5, + "learning_rate": 1.9087665647298676e-05, + "loss": 0.9329, + "step": 616 + }, + { + "epoch": 0.7080763161669775, + "grad_norm": 101.5, + "learning_rate": 1.9082568807339454e-05, + "loss": 1.0578, + "step": 617 + }, + { + "epoch": 0.7092239277004734, + "grad_norm": 23.375, + "learning_rate": 1.9077471967380224e-05, + "loss": 0.6725, + "step": 618 + }, + { + "epoch": 0.7103715392339693, + "grad_norm": 42.0, + "learning_rate": 1.9072375127421e-05, + "loss": 0.6087, + "step": 619 + }, + { + "epoch": 0.7115191507674652, + "grad_norm": 37.0, + "learning_rate": 1.9067278287461775e-05, + "loss": 0.6237, + "step": 620 + }, + { + "epoch": 0.7126667623009612, + "grad_norm": 18.5, + "learning_rate": 1.906218144750255e-05, + "loss": 0.6327, + "step": 621 + }, + { + "epoch": 0.713814373834457, + "grad_norm": 27.375, + "learning_rate": 1.9057084607543327e-05, + "loss": 0.7476, + "step": 622 + }, + { + "epoch": 0.714961985367953, + "grad_norm": 28.25, + "learning_rate": 1.90519877675841e-05, + "loss": 0.7312, + "step": 623 + }, + { + "epoch": 0.7161095969014488, + "grad_norm": 57.75, + "learning_rate": 1.9046890927624874e-05, + "loss": 0.6385, + "step": 624 + }, + { + "epoch": 0.7172572084349448, + "grad_norm": 61.75, + "learning_rate": 1.9041794087665648e-05, + "loss": 0.3454, + "step": 625 + }, + { + "epoch": 0.7184048199684406, + "grad_norm": 39.0, + "learning_rate": 1.9036697247706422e-05, + "loss": 0.6641, + "step": 626 + }, + { + "epoch": 0.7195524315019366, + "grad_norm": 62.5, + "learning_rate": 1.9031600407747196e-05, + "loss": 0.8142, + "step": 627 + }, + { + "epoch": 0.7207000430354326, + "grad_norm": 16.375, + "learning_rate": 1.9026503567787973e-05, + "loss": 0.6127, + "step": 628 + }, + { + "epoch": 0.7218476545689284, + "grad_norm": 57.75, + "learning_rate": 1.9021406727828747e-05, + "loss": 0.7968, + "step": 629 + }, + { + "epoch": 0.7229952661024244, + "grad_norm": 25.375, + "learning_rate": 1.901630988786952e-05, + "loss": 0.4927, + "step": 630 + }, + { + "epoch": 0.7241428776359202, + "grad_norm": 48.25, + "learning_rate": 1.90112130479103e-05, + "loss": 0.6833, + "step": 631 + }, + { + "epoch": 0.7252904891694162, + "grad_norm": 54.75, + "learning_rate": 1.9006116207951072e-05, + "loss": 0.6494, + "step": 632 + }, + { + "epoch": 0.726438100702912, + "grad_norm": 54.5, + "learning_rate": 1.9001019367991846e-05, + "loss": 0.2914, + "step": 633 + }, + { + "epoch": 0.727585712236408, + "grad_norm": 10.125, + "learning_rate": 1.8995922528032624e-05, + "loss": 0.745, + "step": 634 + }, + { + "epoch": 0.7287333237699039, + "grad_norm": 45.75, + "learning_rate": 1.8990825688073394e-05, + "loss": 0.8078, + "step": 635 + }, + { + "epoch": 0.7298809353033998, + "grad_norm": 95.0, + "learning_rate": 1.898572884811417e-05, + "loss": 0.9361, + "step": 636 + }, + { + "epoch": 0.7310285468368957, + "grad_norm": 64.5, + "learning_rate": 1.8980632008154945e-05, + "loss": 0.5982, + "step": 637 + }, + { + "epoch": 0.7321761583703916, + "grad_norm": 22.25, + "learning_rate": 1.897553516819572e-05, + "loss": 0.5722, + "step": 638 + }, + { + "epoch": 0.7333237699038875, + "grad_norm": 51.5, + "learning_rate": 1.8970438328236496e-05, + "loss": 0.7216, + "step": 639 + }, + { + "epoch": 0.7344713814373834, + "grad_norm": 18.5, + "learning_rate": 1.896534148827727e-05, + "loss": 0.5961, + "step": 640 + }, + { + "epoch": 0.7356189929708794, + "grad_norm": 47.75, + "learning_rate": 1.8960244648318044e-05, + "loss": 0.6, + "step": 641 + }, + { + "epoch": 0.7367666045043753, + "grad_norm": 67.0, + "learning_rate": 1.8955147808358818e-05, + "loss": 0.7799, + "step": 642 + }, + { + "epoch": 0.7379142160378712, + "grad_norm": 90.5, + "learning_rate": 1.8950050968399592e-05, + "loss": 0.979, + "step": 643 + }, + { + "epoch": 0.7390618275713671, + "grad_norm": 47.0, + "learning_rate": 1.894495412844037e-05, + "loss": 0.6637, + "step": 644 + }, + { + "epoch": 0.740209439104863, + "grad_norm": 28.875, + "learning_rate": 1.8939857288481143e-05, + "loss": 0.7095, + "step": 645 + }, + { + "epoch": 0.7413570506383589, + "grad_norm": 70.5, + "learning_rate": 1.8934760448521917e-05, + "loss": 0.7767, + "step": 646 + }, + { + "epoch": 0.7425046621718548, + "grad_norm": 67.0, + "learning_rate": 1.892966360856269e-05, + "loss": 0.8117, + "step": 647 + }, + { + "epoch": 0.7436522737053507, + "grad_norm": 47.0, + "learning_rate": 1.892456676860347e-05, + "loss": 0.7253, + "step": 648 + }, + { + "epoch": 0.7447998852388467, + "grad_norm": 42.25, + "learning_rate": 1.8919469928644242e-05, + "loss": 0.5712, + "step": 649 + }, + { + "epoch": 0.7459474967723425, + "grad_norm": 10.3125, + "learning_rate": 1.8914373088685016e-05, + "loss": 0.4933, + "step": 650 + }, + { + "epoch": 0.7470951083058385, + "grad_norm": 17.875, + "learning_rate": 1.8909276248725793e-05, + "loss": 0.468, + "step": 651 + }, + { + "epoch": 0.7482427198393344, + "grad_norm": 61.75, + "learning_rate": 1.8904179408766564e-05, + "loss": 0.7518, + "step": 652 + }, + { + "epoch": 0.7493903313728303, + "grad_norm": 71.0, + "learning_rate": 1.889908256880734e-05, + "loss": 0.8373, + "step": 653 + }, + { + "epoch": 0.7505379429063262, + "grad_norm": 26.75, + "learning_rate": 1.8893985728848115e-05, + "loss": 0.7643, + "step": 654 + }, + { + "epoch": 0.7516855544398221, + "grad_norm": 28.25, + "learning_rate": 1.888888888888889e-05, + "loss": 0.5331, + "step": 655 + }, + { + "epoch": 0.7528331659733181, + "grad_norm": 8.375, + "learning_rate": 1.8883792048929666e-05, + "loss": 0.4439, + "step": 656 + }, + { + "epoch": 0.7539807775068139, + "grad_norm": 17.875, + "learning_rate": 1.887869520897044e-05, + "loss": 0.5831, + "step": 657 + }, + { + "epoch": 0.7551283890403099, + "grad_norm": 34.25, + "learning_rate": 1.8873598369011214e-05, + "loss": 0.5412, + "step": 658 + }, + { + "epoch": 0.7562760005738057, + "grad_norm": 50.75, + "learning_rate": 1.886850152905199e-05, + "loss": 0.5549, + "step": 659 + }, + { + "epoch": 0.7574236121073017, + "grad_norm": 56.0, + "learning_rate": 1.8863404689092762e-05, + "loss": 0.4515, + "step": 660 + }, + { + "epoch": 0.7585712236407975, + "grad_norm": 76.5, + "learning_rate": 1.885830784913354e-05, + "loss": 0.8915, + "step": 661 + }, + { + "epoch": 0.7597188351742935, + "grad_norm": 88.0, + "learning_rate": 1.8853211009174313e-05, + "loss": 0.7725, + "step": 662 + }, + { + "epoch": 0.7608664467077895, + "grad_norm": 32.0, + "learning_rate": 1.8848114169215087e-05, + "loss": 0.674, + "step": 663 + }, + { + "epoch": 0.7620140582412853, + "grad_norm": 37.0, + "learning_rate": 1.8843017329255864e-05, + "loss": 0.4771, + "step": 664 + }, + { + "epoch": 0.7631616697747813, + "grad_norm": 47.5, + "learning_rate": 1.883792048929664e-05, + "loss": 0.665, + "step": 665 + }, + { + "epoch": 0.7643092813082771, + "grad_norm": 49.0, + "learning_rate": 1.8832823649337412e-05, + "loss": 0.5971, + "step": 666 + }, + { + "epoch": 0.7654568928417731, + "grad_norm": 17.875, + "learning_rate": 1.8827726809378186e-05, + "loss": 0.507, + "step": 667 + }, + { + "epoch": 0.766604504375269, + "grad_norm": 17.5, + "learning_rate": 1.8822629969418963e-05, + "loss": 0.3678, + "step": 668 + }, + { + "epoch": 0.7677521159087649, + "grad_norm": 39.5, + "learning_rate": 1.8817533129459737e-05, + "loss": 0.5468, + "step": 669 + }, + { + "epoch": 0.7688997274422608, + "grad_norm": 26.375, + "learning_rate": 1.881243628950051e-05, + "loss": 0.4158, + "step": 670 + }, + { + "epoch": 0.7700473389757567, + "grad_norm": 70.5, + "learning_rate": 1.8807339449541285e-05, + "loss": 0.8145, + "step": 671 + }, + { + "epoch": 0.7711949505092526, + "grad_norm": 57.0, + "learning_rate": 1.880224260958206e-05, + "loss": 0.5283, + "step": 672 + }, + { + "epoch": 0.7723425620427485, + "grad_norm": 15.875, + "learning_rate": 1.8797145769622836e-05, + "loss": 0.6116, + "step": 673 + }, + { + "epoch": 0.7734901735762444, + "grad_norm": 10.6875, + "learning_rate": 1.879204892966361e-05, + "loss": 0.6081, + "step": 674 + }, + { + "epoch": 0.7746377851097404, + "grad_norm": 41.0, + "learning_rate": 1.8786952089704384e-05, + "loss": 0.6481, + "step": 675 + }, + { + "epoch": 0.7757853966432363, + "grad_norm": 13.375, + "learning_rate": 1.878185524974516e-05, + "loss": 0.4866, + "step": 676 + }, + { + "epoch": 0.7769330081767322, + "grad_norm": 32.75, + "learning_rate": 1.8776758409785932e-05, + "loss": 0.7627, + "step": 677 + }, + { + "epoch": 0.7780806197102281, + "grad_norm": 83.0, + "learning_rate": 1.877166156982671e-05, + "loss": 0.8497, + "step": 678 + }, + { + "epoch": 0.779228231243724, + "grad_norm": 46.75, + "learning_rate": 1.8766564729867483e-05, + "loss": 0.3555, + "step": 679 + }, + { + "epoch": 0.7803758427772199, + "grad_norm": 67.0, + "learning_rate": 1.8761467889908257e-05, + "loss": 0.7386, + "step": 680 + }, + { + "epoch": 0.7815234543107158, + "grad_norm": 98.5, + "learning_rate": 1.8756371049949034e-05, + "loss": 0.6743, + "step": 681 + }, + { + "epoch": 0.7826710658442118, + "grad_norm": 52.25, + "learning_rate": 1.8751274209989808e-05, + "loss": 0.6193, + "step": 682 + }, + { + "epoch": 0.7838186773777076, + "grad_norm": 18.0, + "learning_rate": 1.8746177370030582e-05, + "loss": 0.4905, + "step": 683 + }, + { + "epoch": 0.7849662889112036, + "grad_norm": 37.75, + "learning_rate": 1.874108053007136e-05, + "loss": 0.4043, + "step": 684 + }, + { + "epoch": 0.7861139004446994, + "grad_norm": 22.25, + "learning_rate": 1.8735983690112133e-05, + "loss": 0.5243, + "step": 685 + }, + { + "epoch": 0.7872615119781954, + "grad_norm": 21.75, + "learning_rate": 1.8730886850152907e-05, + "loss": 0.4342, + "step": 686 + }, + { + "epoch": 0.7884091235116913, + "grad_norm": 31.375, + "learning_rate": 1.872579001019368e-05, + "loss": 0.5206, + "step": 687 + }, + { + "epoch": 0.7895567350451872, + "grad_norm": 21.25, + "learning_rate": 1.8720693170234455e-05, + "loss": 0.6048, + "step": 688 + }, + { + "epoch": 0.7907043465786832, + "grad_norm": 49.0, + "learning_rate": 1.8715596330275232e-05, + "loss": 0.5758, + "step": 689 + }, + { + "epoch": 0.791851958112179, + "grad_norm": 26.75, + "learning_rate": 1.8710499490316006e-05, + "loss": 0.5446, + "step": 690 + }, + { + "epoch": 0.792999569645675, + "grad_norm": 28.5, + "learning_rate": 1.870540265035678e-05, + "loss": 0.4504, + "step": 691 + }, + { + "epoch": 0.7941471811791708, + "grad_norm": 28.375, + "learning_rate": 1.8700305810397554e-05, + "loss": 0.7349, + "step": 692 + }, + { + "epoch": 0.7952947927126668, + "grad_norm": 102.0, + "learning_rate": 1.869520897043833e-05, + "loss": 0.6082, + "step": 693 + }, + { + "epoch": 0.7964424042461626, + "grad_norm": 87.0, + "learning_rate": 1.8690112130479105e-05, + "loss": 0.7548, + "step": 694 + }, + { + "epoch": 0.7975900157796586, + "grad_norm": 24.875, + "learning_rate": 1.868501529051988e-05, + "loss": 0.7732, + "step": 695 + }, + { + "epoch": 0.7987376273131545, + "grad_norm": 44.0, + "learning_rate": 1.8679918450560653e-05, + "loss": 0.5928, + "step": 696 + }, + { + "epoch": 0.7998852388466504, + "grad_norm": 24.375, + "learning_rate": 1.8674821610601427e-05, + "loss": 0.5727, + "step": 697 + }, + { + "epoch": 0.8010328503801463, + "grad_norm": 72.5, + "learning_rate": 1.8669724770642204e-05, + "loss": 0.811, + "step": 698 + }, + { + "epoch": 0.8021804619136422, + "grad_norm": 13.25, + "learning_rate": 1.8664627930682978e-05, + "loss": 0.2618, + "step": 699 + }, + { + "epoch": 0.8033280734471382, + "grad_norm": 34.5, + "learning_rate": 1.8659531090723752e-05, + "loss": 0.8214, + "step": 700 + }, + { + "epoch": 0.8033280734471382, + "eval_accuracy": 0.61, + "eval_loss": 0.5829592347145081, + "eval_runtime": 49.9174, + "eval_samples_per_second": 2.003, + "eval_steps_per_second": 2.003, + "step": 700 + }, + { + "epoch": 0.804475684980634, + "grad_norm": 25.125, + "learning_rate": 1.865443425076453e-05, + "loss": 0.5568, + "step": 701 + }, + { + "epoch": 0.80562329651413, + "grad_norm": 14.9375, + "learning_rate": 1.86493374108053e-05, + "loss": 0.3704, + "step": 702 + }, + { + "epoch": 0.8067709080476259, + "grad_norm": 15.8125, + "learning_rate": 1.8644240570846077e-05, + "loss": 0.4246, + "step": 703 + }, + { + "epoch": 0.8079185195811218, + "grad_norm": 16.125, + "learning_rate": 1.863914373088685e-05, + "loss": 0.3896, + "step": 704 + }, + { + "epoch": 0.8090661311146177, + "grad_norm": 50.5, + "learning_rate": 1.8634046890927625e-05, + "loss": 0.3966, + "step": 705 + }, + { + "epoch": 0.8102137426481136, + "grad_norm": 45.25, + "learning_rate": 1.8628950050968402e-05, + "loss": 0.3742, + "step": 706 + }, + { + "epoch": 0.8113613541816095, + "grad_norm": 39.75, + "learning_rate": 1.8623853211009176e-05, + "loss": 0.4672, + "step": 707 + }, + { + "epoch": 0.8125089657151054, + "grad_norm": 39.25, + "learning_rate": 1.861875637104995e-05, + "loss": 0.6046, + "step": 708 + }, + { + "epoch": 0.8136565772486013, + "grad_norm": 38.25, + "learning_rate": 1.8613659531090724e-05, + "loss": 1.0867, + "step": 709 + }, + { + "epoch": 0.8148041887820973, + "grad_norm": 26.875, + "learning_rate": 1.86085626911315e-05, + "loss": 0.3141, + "step": 710 + }, + { + "epoch": 0.8159518003155932, + "grad_norm": 53.0, + "learning_rate": 1.8603465851172275e-05, + "loss": 0.8153, + "step": 711 + }, + { + "epoch": 0.8170994118490891, + "grad_norm": 35.0, + "learning_rate": 1.859836901121305e-05, + "loss": 0.7676, + "step": 712 + }, + { + "epoch": 0.818247023382585, + "grad_norm": 55.75, + "learning_rate": 1.8593272171253823e-05, + "loss": 0.5664, + "step": 713 + }, + { + "epoch": 0.8193946349160809, + "grad_norm": 14.5, + "learning_rate": 1.8588175331294597e-05, + "loss": 0.6436, + "step": 714 + }, + { + "epoch": 0.8205422464495769, + "grad_norm": 29.0, + "learning_rate": 1.8583078491335374e-05, + "loss": 0.4355, + "step": 715 + }, + { + "epoch": 0.8216898579830727, + "grad_norm": 82.5, + "learning_rate": 1.8577981651376148e-05, + "loss": 1.2766, + "step": 716 + }, + { + "epoch": 0.8228374695165687, + "grad_norm": 37.5, + "learning_rate": 1.8572884811416922e-05, + "loss": 0.4578, + "step": 717 + }, + { + "epoch": 0.8239850810500645, + "grad_norm": 16.875, + "learning_rate": 1.85677879714577e-05, + "loss": 0.5334, + "step": 718 + }, + { + "epoch": 0.8251326925835605, + "grad_norm": 22.875, + "learning_rate": 1.856269113149847e-05, + "loss": 0.5546, + "step": 719 + }, + { + "epoch": 0.8262803041170563, + "grad_norm": 106.0, + "learning_rate": 1.8557594291539247e-05, + "loss": 0.9589, + "step": 720 + }, + { + "epoch": 0.8274279156505523, + "grad_norm": 21.75, + "learning_rate": 1.855249745158002e-05, + "loss": 0.5682, + "step": 721 + }, + { + "epoch": 0.8285755271840483, + "grad_norm": 52.75, + "learning_rate": 1.8547400611620795e-05, + "loss": 0.4809, + "step": 722 + }, + { + "epoch": 0.8297231387175441, + "grad_norm": 19.0, + "learning_rate": 1.8542303771661572e-05, + "loss": 0.3774, + "step": 723 + }, + { + "epoch": 0.8308707502510401, + "grad_norm": 25.125, + "learning_rate": 1.8537206931702346e-05, + "loss": 0.4828, + "step": 724 + }, + { + "epoch": 0.8320183617845359, + "grad_norm": 34.0, + "learning_rate": 1.853211009174312e-05, + "loss": 0.4859, + "step": 725 + }, + { + "epoch": 0.8331659733180319, + "grad_norm": 48.75, + "learning_rate": 1.8527013251783897e-05, + "loss": 0.5612, + "step": 726 + }, + { + "epoch": 0.8343135848515277, + "grad_norm": 45.5, + "learning_rate": 1.852191641182467e-05, + "loss": 0.5976, + "step": 727 + }, + { + "epoch": 0.8354611963850237, + "grad_norm": 52.0, + "learning_rate": 1.8516819571865445e-05, + "loss": 0.6551, + "step": 728 + }, + { + "epoch": 0.8366088079185195, + "grad_norm": 23.75, + "learning_rate": 1.851172273190622e-05, + "loss": 0.5022, + "step": 729 + }, + { + "epoch": 0.8377564194520155, + "grad_norm": 71.5, + "learning_rate": 1.8506625891946993e-05, + "loss": 0.8474, + "step": 730 + }, + { + "epoch": 0.8389040309855114, + "grad_norm": 67.5, + "learning_rate": 1.850152905198777e-05, + "loss": 0.6511, + "step": 731 + }, + { + "epoch": 0.8400516425190073, + "grad_norm": 36.5, + "learning_rate": 1.8496432212028544e-05, + "loss": 0.5485, + "step": 732 + }, + { + "epoch": 0.8411992540525032, + "grad_norm": 63.25, + "learning_rate": 1.8491335372069318e-05, + "loss": 0.7833, + "step": 733 + }, + { + "epoch": 0.8423468655859991, + "grad_norm": 11.625, + "learning_rate": 1.8486238532110092e-05, + "loss": 0.5295, + "step": 734 + }, + { + "epoch": 0.8434944771194951, + "grad_norm": 44.25, + "learning_rate": 1.848114169215087e-05, + "loss": 0.4733, + "step": 735 + }, + { + "epoch": 0.844642088652991, + "grad_norm": 6.90625, + "learning_rate": 1.8476044852191643e-05, + "loss": 0.2207, + "step": 736 + }, + { + "epoch": 0.8457897001864869, + "grad_norm": 63.0, + "learning_rate": 1.8470948012232417e-05, + "loss": 0.5543, + "step": 737 + }, + { + "epoch": 0.8469373117199828, + "grad_norm": 19.5, + "learning_rate": 1.846585117227319e-05, + "loss": 0.4689, + "step": 738 + }, + { + "epoch": 0.8480849232534787, + "grad_norm": 14.6875, + "learning_rate": 1.8460754332313965e-05, + "loss": 0.5446, + "step": 739 + }, + { + "epoch": 0.8492325347869746, + "grad_norm": 68.5, + "learning_rate": 1.8455657492354742e-05, + "loss": 0.6652, + "step": 740 + }, + { + "epoch": 0.8503801463204705, + "grad_norm": 25.0, + "learning_rate": 1.8450560652395516e-05, + "loss": 0.3413, + "step": 741 + }, + { + "epoch": 0.8515277578539664, + "grad_norm": 43.5, + "learning_rate": 1.844546381243629e-05, + "loss": 0.5552, + "step": 742 + }, + { + "epoch": 0.8526753693874624, + "grad_norm": 50.25, + "learning_rate": 1.8440366972477067e-05, + "loss": 0.5051, + "step": 743 + }, + { + "epoch": 0.8538229809209582, + "grad_norm": 30.625, + "learning_rate": 1.843527013251784e-05, + "loss": 0.5442, + "step": 744 + }, + { + "epoch": 0.8549705924544542, + "grad_norm": 21.75, + "learning_rate": 1.8430173292558615e-05, + "loss": 0.724, + "step": 745 + }, + { + "epoch": 0.8561182039879501, + "grad_norm": 23.0, + "learning_rate": 1.8425076452599392e-05, + "loss": 0.6982, + "step": 746 + }, + { + "epoch": 0.857265815521446, + "grad_norm": 34.5, + "learning_rate": 1.8419979612640163e-05, + "loss": 0.4268, + "step": 747 + }, + { + "epoch": 0.858413427054942, + "grad_norm": 12.0, + "learning_rate": 1.841488277268094e-05, + "loss": 0.5611, + "step": 748 + }, + { + "epoch": 0.8595610385884378, + "grad_norm": 20.5, + "learning_rate": 1.8409785932721714e-05, + "loss": 0.552, + "step": 749 + }, + { + "epoch": 0.8607086501219338, + "grad_norm": 35.0, + "learning_rate": 1.8404689092762488e-05, + "loss": 0.5549, + "step": 750 + }, + { + "epoch": 0.8618562616554296, + "grad_norm": 43.5, + "learning_rate": 1.8399592252803265e-05, + "loss": 0.5242, + "step": 751 + }, + { + "epoch": 0.8630038731889256, + "grad_norm": 13.875, + "learning_rate": 1.839449541284404e-05, + "loss": 0.3858, + "step": 752 + }, + { + "epoch": 0.8641514847224214, + "grad_norm": 45.25, + "learning_rate": 1.8389398572884813e-05, + "loss": 0.6547, + "step": 753 + }, + { + "epoch": 0.8652990962559174, + "grad_norm": 66.5, + "learning_rate": 1.8384301732925587e-05, + "loss": 0.5999, + "step": 754 + }, + { + "epoch": 0.8664467077894132, + "grad_norm": 31.25, + "learning_rate": 1.837920489296636e-05, + "loss": 0.6402, + "step": 755 + }, + { + "epoch": 0.8675943193229092, + "grad_norm": 16.125, + "learning_rate": 1.8374108053007138e-05, + "loss": 0.4183, + "step": 756 + }, + { + "epoch": 0.868741930856405, + "grad_norm": 34.75, + "learning_rate": 1.8369011213047912e-05, + "loss": 0.46, + "step": 757 + }, + { + "epoch": 0.869889542389901, + "grad_norm": 11.6875, + "learning_rate": 1.8363914373088686e-05, + "loss": 0.5179, + "step": 758 + }, + { + "epoch": 0.871037153923397, + "grad_norm": 60.0, + "learning_rate": 1.835881753312946e-05, + "loss": 0.5217, + "step": 759 + }, + { + "epoch": 0.8721847654568928, + "grad_norm": 9.75, + "learning_rate": 1.8353720693170237e-05, + "loss": 0.3772, + "step": 760 + }, + { + "epoch": 0.8733323769903888, + "grad_norm": 11.75, + "learning_rate": 1.834862385321101e-05, + "loss": 0.5367, + "step": 761 + }, + { + "epoch": 0.8744799885238846, + "grad_norm": 41.5, + "learning_rate": 1.8343527013251785e-05, + "loss": 0.573, + "step": 762 + }, + { + "epoch": 0.8756276000573806, + "grad_norm": 34.25, + "learning_rate": 1.8338430173292562e-05, + "loss": 0.1937, + "step": 763 + }, + { + "epoch": 0.8767752115908765, + "grad_norm": 23.125, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.4614, + "step": 764 + }, + { + "epoch": 0.8779228231243724, + "grad_norm": 46.5, + "learning_rate": 1.832823649337411e-05, + "loss": 0.442, + "step": 765 + }, + { + "epoch": 0.8790704346578683, + "grad_norm": 74.5, + "learning_rate": 1.8323139653414884e-05, + "loss": 0.9426, + "step": 766 + }, + { + "epoch": 0.8802180461913642, + "grad_norm": 65.5, + "learning_rate": 1.8318042813455658e-05, + "loss": 0.8281, + "step": 767 + }, + { + "epoch": 0.8813656577248601, + "grad_norm": 30.375, + "learning_rate": 1.8312945973496435e-05, + "loss": 0.6035, + "step": 768 + }, + { + "epoch": 0.882513269258356, + "grad_norm": 89.5, + "learning_rate": 1.830784913353721e-05, + "loss": 0.8813, + "step": 769 + }, + { + "epoch": 0.883660880791852, + "grad_norm": 71.5, + "learning_rate": 1.8302752293577983e-05, + "loss": 0.7809, + "step": 770 + }, + { + "epoch": 0.8848084923253479, + "grad_norm": 87.5, + "learning_rate": 1.829765545361876e-05, + "loss": 0.9051, + "step": 771 + }, + { + "epoch": 0.8859561038588438, + "grad_norm": 78.5, + "learning_rate": 1.829255861365953e-05, + "loss": 0.8777, + "step": 772 + }, + { + "epoch": 0.8871037153923397, + "grad_norm": 38.5, + "learning_rate": 1.8287461773700308e-05, + "loss": 0.3393, + "step": 773 + }, + { + "epoch": 0.8882513269258356, + "grad_norm": 34.0, + "learning_rate": 1.8282364933741082e-05, + "loss": 0.4772, + "step": 774 + }, + { + "epoch": 0.8893989384593315, + "grad_norm": 42.25, + "learning_rate": 1.8277268093781856e-05, + "loss": 0.5136, + "step": 775 + }, + { + "epoch": 0.8905465499928275, + "grad_norm": 51.25, + "learning_rate": 1.8272171253822633e-05, + "loss": 0.4965, + "step": 776 + }, + { + "epoch": 0.8916941615263233, + "grad_norm": 23.5, + "learning_rate": 1.8267074413863407e-05, + "loss": 0.6667, + "step": 777 + }, + { + "epoch": 0.8928417730598193, + "grad_norm": 69.0, + "learning_rate": 1.826197757390418e-05, + "loss": 0.7309, + "step": 778 + }, + { + "epoch": 0.8939893845933151, + "grad_norm": 13.5625, + "learning_rate": 1.8256880733944955e-05, + "loss": 0.5041, + "step": 779 + }, + { + "epoch": 0.8951369961268111, + "grad_norm": 115.0, + "learning_rate": 1.825178389398573e-05, + "loss": 0.9974, + "step": 780 + }, + { + "epoch": 0.8962846076603069, + "grad_norm": 19.125, + "learning_rate": 1.8246687054026503e-05, + "loss": 0.5556, + "step": 781 + }, + { + "epoch": 0.8974322191938029, + "grad_norm": 52.5, + "learning_rate": 1.824159021406728e-05, + "loss": 0.7631, + "step": 782 + }, + { + "epoch": 0.8985798307272989, + "grad_norm": 10.4375, + "learning_rate": 1.8236493374108054e-05, + "loss": 0.6217, + "step": 783 + }, + { + "epoch": 0.8997274422607947, + "grad_norm": 20.625, + "learning_rate": 1.8231396534148828e-05, + "loss": 0.4404, + "step": 784 + }, + { + "epoch": 0.9008750537942907, + "grad_norm": 81.0, + "learning_rate": 1.8226299694189605e-05, + "loss": 0.8382, + "step": 785 + }, + { + "epoch": 0.9020226653277865, + "grad_norm": 25.125, + "learning_rate": 1.822120285423038e-05, + "loss": 0.465, + "step": 786 + }, + { + "epoch": 0.9031702768612825, + "grad_norm": 4.90625, + "learning_rate": 1.8216106014271153e-05, + "loss": 0.2211, + "step": 787 + }, + { + "epoch": 0.9043178883947783, + "grad_norm": 18.25, + "learning_rate": 1.821100917431193e-05, + "loss": 0.5354, + "step": 788 + }, + { + "epoch": 0.9054654999282743, + "grad_norm": 22.625, + "learning_rate": 1.82059123343527e-05, + "loss": 0.4656, + "step": 789 + }, + { + "epoch": 0.9066131114617701, + "grad_norm": 22.125, + "learning_rate": 1.8200815494393478e-05, + "loss": 0.7412, + "step": 790 + }, + { + "epoch": 0.9077607229952661, + "grad_norm": 18.125, + "learning_rate": 1.8195718654434252e-05, + "loss": 0.5085, + "step": 791 + }, + { + "epoch": 0.908908334528762, + "grad_norm": 17.625, + "learning_rate": 1.8190621814475026e-05, + "loss": 0.4275, + "step": 792 + }, + { + "epoch": 0.9100559460622579, + "grad_norm": 74.5, + "learning_rate": 1.8185524974515803e-05, + "loss": 0.8506, + "step": 793 + }, + { + "epoch": 0.9112035575957539, + "grad_norm": 59.75, + "learning_rate": 1.8180428134556577e-05, + "loss": 0.5863, + "step": 794 + }, + { + "epoch": 0.9123511691292497, + "grad_norm": 17.875, + "learning_rate": 1.817533129459735e-05, + "loss": 0.5018, + "step": 795 + }, + { + "epoch": 0.9134987806627457, + "grad_norm": 73.0, + "learning_rate": 1.8170234454638125e-05, + "loss": 0.7539, + "step": 796 + }, + { + "epoch": 0.9146463921962416, + "grad_norm": 38.0, + "learning_rate": 1.81651376146789e-05, + "loss": 0.6021, + "step": 797 + }, + { + "epoch": 0.9157940037297375, + "grad_norm": 34.75, + "learning_rate": 1.8160040774719676e-05, + "loss": 0.4989, + "step": 798 + }, + { + "epoch": 0.9169416152632334, + "grad_norm": 29.25, + "learning_rate": 1.815494393476045e-05, + "loss": 0.5503, + "step": 799 + }, + { + "epoch": 0.9180892267967293, + "grad_norm": 113.0, + "learning_rate": 1.8149847094801224e-05, + "loss": 0.7238, + "step": 800 + }, + { + "epoch": 0.9180892267967293, + "eval_accuracy": 0.67, + "eval_loss": 0.5950115323066711, + "eval_runtime": 49.3005, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 2.028, + "step": 800 + }, + { + "epoch": 0.9192368383302252, + "grad_norm": 12.9375, + "learning_rate": 1.8144750254841998e-05, + "loss": 0.632, + "step": 801 + }, + { + "epoch": 0.9203844498637211, + "grad_norm": 49.75, + "learning_rate": 1.8139653414882775e-05, + "loss": 0.6413, + "step": 802 + }, + { + "epoch": 0.921532061397217, + "grad_norm": 13.125, + "learning_rate": 1.813455657492355e-05, + "loss": 0.5482, + "step": 803 + }, + { + "epoch": 0.922679672930713, + "grad_norm": 20.125, + "learning_rate": 1.8129459734964323e-05, + "loss": 0.5773, + "step": 804 + }, + { + "epoch": 0.9238272844642089, + "grad_norm": 100.0, + "learning_rate": 1.81243628950051e-05, + "loss": 1.35, + "step": 805 + }, + { + "epoch": 0.9249748959977048, + "grad_norm": 25.75, + "learning_rate": 1.811926605504587e-05, + "loss": 0.5234, + "step": 806 + }, + { + "epoch": 0.9261225075312007, + "grad_norm": 37.0, + "learning_rate": 1.8114169215086648e-05, + "loss": 0.473, + "step": 807 + }, + { + "epoch": 0.9272701190646966, + "grad_norm": 29.0, + "learning_rate": 1.8109072375127422e-05, + "loss": 0.4716, + "step": 808 + }, + { + "epoch": 0.9284177305981925, + "grad_norm": 22.0, + "learning_rate": 1.8103975535168196e-05, + "loss": 0.5146, + "step": 809 + }, + { + "epoch": 0.9295653421316884, + "grad_norm": 11.75, + "learning_rate": 1.8098878695208973e-05, + "loss": 0.6532, + "step": 810 + }, + { + "epoch": 0.9307129536651844, + "grad_norm": 14.375, + "learning_rate": 1.8093781855249747e-05, + "loss": 0.5441, + "step": 811 + }, + { + "epoch": 0.9318605651986802, + "grad_norm": 42.0, + "learning_rate": 1.808868501529052e-05, + "loss": 0.4905, + "step": 812 + }, + { + "epoch": 0.9330081767321762, + "grad_norm": 64.0, + "learning_rate": 1.8083588175331298e-05, + "loss": 0.8364, + "step": 813 + }, + { + "epoch": 0.934155788265672, + "grad_norm": 28.875, + "learning_rate": 1.807849133537207e-05, + "loss": 0.414, + "step": 814 + }, + { + "epoch": 0.935303399799168, + "grad_norm": 7.6875, + "learning_rate": 1.8073394495412846e-05, + "loss": 0.3923, + "step": 815 + }, + { + "epoch": 0.9364510113326638, + "grad_norm": 32.25, + "learning_rate": 1.806829765545362e-05, + "loss": 0.5358, + "step": 816 + }, + { + "epoch": 0.9375986228661598, + "grad_norm": 57.5, + "learning_rate": 1.8063200815494394e-05, + "loss": 0.4813, + "step": 817 + }, + { + "epoch": 0.9387462343996558, + "grad_norm": 33.5, + "learning_rate": 1.805810397553517e-05, + "loss": 0.4693, + "step": 818 + }, + { + "epoch": 0.9398938459331516, + "grad_norm": 35.0, + "learning_rate": 1.8053007135575945e-05, + "loss": 0.3321, + "step": 819 + }, + { + "epoch": 0.9410414574666476, + "grad_norm": 19.75, + "learning_rate": 1.804791029561672e-05, + "loss": 0.5709, + "step": 820 + }, + { + "epoch": 0.9421890690001434, + "grad_norm": 133.0, + "learning_rate": 1.8042813455657493e-05, + "loss": 1.0803, + "step": 821 + }, + { + "epoch": 0.9433366805336394, + "grad_norm": 7.90625, + "learning_rate": 1.803771661569827e-05, + "loss": 0.5182, + "step": 822 + }, + { + "epoch": 0.9444842920671352, + "grad_norm": 25.375, + "learning_rate": 1.8032619775739044e-05, + "loss": 0.6489, + "step": 823 + }, + { + "epoch": 0.9456319036006312, + "grad_norm": 24.125, + "learning_rate": 1.8027522935779818e-05, + "loss": 0.5298, + "step": 824 + }, + { + "epoch": 0.9467795151341271, + "grad_norm": 26.5, + "learning_rate": 1.8022426095820592e-05, + "loss": 0.4967, + "step": 825 + }, + { + "epoch": 0.947927126667623, + "grad_norm": 55.5, + "learning_rate": 1.8017329255861366e-05, + "loss": 0.7623, + "step": 826 + }, + { + "epoch": 0.9490747382011189, + "grad_norm": 38.75, + "learning_rate": 1.8012232415902143e-05, + "loss": 0.3873, + "step": 827 + }, + { + "epoch": 0.9502223497346148, + "grad_norm": 36.5, + "learning_rate": 1.8007135575942917e-05, + "loss": 0.4885, + "step": 828 + }, + { + "epoch": 0.9513699612681108, + "grad_norm": 36.75, + "learning_rate": 1.800203873598369e-05, + "loss": 0.4786, + "step": 829 + }, + { + "epoch": 0.9525175728016066, + "grad_norm": 15.9375, + "learning_rate": 1.7996941896024468e-05, + "loss": 0.7344, + "step": 830 + }, + { + "epoch": 0.9536651843351026, + "grad_norm": 35.75, + "learning_rate": 1.799184505606524e-05, + "loss": 0.4364, + "step": 831 + }, + { + "epoch": 0.9548127958685985, + "grad_norm": 60.5, + "learning_rate": 1.7986748216106016e-05, + "loss": 0.4018, + "step": 832 + }, + { + "epoch": 0.9559604074020944, + "grad_norm": 29.5, + "learning_rate": 1.798165137614679e-05, + "loss": 0.5492, + "step": 833 + }, + { + "epoch": 0.9571080189355903, + "grad_norm": 51.75, + "learning_rate": 1.7976554536187564e-05, + "loss": 0.5751, + "step": 834 + }, + { + "epoch": 0.9582556304690862, + "grad_norm": 27.0, + "learning_rate": 1.797145769622834e-05, + "loss": 0.4943, + "step": 835 + }, + { + "epoch": 0.9594032420025821, + "grad_norm": 40.75, + "learning_rate": 1.7966360856269115e-05, + "loss": 0.6622, + "step": 836 + }, + { + "epoch": 0.960550853536078, + "grad_norm": 82.0, + "learning_rate": 1.796126401630989e-05, + "loss": 0.6737, + "step": 837 + }, + { + "epoch": 0.9616984650695739, + "grad_norm": 11.125, + "learning_rate": 1.7956167176350666e-05, + "loss": 0.4544, + "step": 838 + }, + { + "epoch": 0.9628460766030699, + "grad_norm": 18.125, + "learning_rate": 1.795107033639144e-05, + "loss": 0.5389, + "step": 839 + }, + { + "epoch": 0.9639936881365657, + "grad_norm": 15.9375, + "learning_rate": 1.7945973496432214e-05, + "loss": 0.1783, + "step": 840 + }, + { + "epoch": 0.9651412996700617, + "grad_norm": 53.5, + "learning_rate": 1.7940876656472988e-05, + "loss": 0.3035, + "step": 841 + }, + { + "epoch": 0.9662889112035576, + "grad_norm": 52.75, + "learning_rate": 1.7935779816513762e-05, + "loss": 0.6946, + "step": 842 + }, + { + "epoch": 0.9674365227370535, + "grad_norm": 34.75, + "learning_rate": 1.793068297655454e-05, + "loss": 0.5466, + "step": 843 + }, + { + "epoch": 0.9685841342705495, + "grad_norm": 21.875, + "learning_rate": 1.7925586136595313e-05, + "loss": 0.4619, + "step": 844 + }, + { + "epoch": 0.9697317458040453, + "grad_norm": 50.5, + "learning_rate": 1.7920489296636087e-05, + "loss": 0.6513, + "step": 845 + }, + { + "epoch": 0.9708793573375413, + "grad_norm": 15.125, + "learning_rate": 1.791539245667686e-05, + "loss": 0.379, + "step": 846 + }, + { + "epoch": 0.9720269688710371, + "grad_norm": 10.625, + "learning_rate": 1.7910295616717638e-05, + "loss": 0.5085, + "step": 847 + }, + { + "epoch": 0.9731745804045331, + "grad_norm": 12.6875, + "learning_rate": 1.7905198776758412e-05, + "loss": 0.5272, + "step": 848 + }, + { + "epoch": 0.9743221919380289, + "grad_norm": 27.5, + "learning_rate": 1.7900101936799186e-05, + "loss": 1.0062, + "step": 849 + }, + { + "epoch": 0.9754698034715249, + "grad_norm": 127.5, + "learning_rate": 1.789500509683996e-05, + "loss": 1.0798, + "step": 850 + }, + { + "epoch": 0.9766174150050208, + "grad_norm": 103.5, + "learning_rate": 1.7889908256880734e-05, + "loss": 1.1638, + "step": 851 + }, + { + "epoch": 0.9777650265385167, + "grad_norm": 69.0, + "learning_rate": 1.788481141692151e-05, + "loss": 0.9011, + "step": 852 + }, + { + "epoch": 0.9789126380720127, + "grad_norm": 86.5, + "learning_rate": 1.7879714576962285e-05, + "loss": 0.8197, + "step": 853 + }, + { + "epoch": 0.9800602496055085, + "grad_norm": 15.5, + "learning_rate": 1.787461773700306e-05, + "loss": 0.6134, + "step": 854 + }, + { + "epoch": 0.9812078611390045, + "grad_norm": 110.5, + "learning_rate": 1.7869520897043836e-05, + "loss": 0.919, + "step": 855 + }, + { + "epoch": 0.9823554726725003, + "grad_norm": 30.125, + "learning_rate": 1.786442405708461e-05, + "loss": 0.5746, + "step": 856 + }, + { + "epoch": 0.9835030842059963, + "grad_norm": 100.0, + "learning_rate": 1.7859327217125384e-05, + "loss": 0.3361, + "step": 857 + }, + { + "epoch": 0.9846506957394922, + "grad_norm": 60.0, + "learning_rate": 1.7854230377166158e-05, + "loss": 0.6782, + "step": 858 + }, + { + "epoch": 0.9857983072729881, + "grad_norm": 62.5, + "learning_rate": 1.7849133537206932e-05, + "loss": 0.8552, + "step": 859 + }, + { + "epoch": 0.986945918806484, + "grad_norm": 38.75, + "learning_rate": 1.784403669724771e-05, + "loss": 0.7251, + "step": 860 + }, + { + "epoch": 0.9880935303399799, + "grad_norm": 84.5, + "learning_rate": 1.7838939857288483e-05, + "loss": 0.9825, + "step": 861 + }, + { + "epoch": 0.9892411418734758, + "grad_norm": 32.75, + "learning_rate": 1.7833843017329257e-05, + "loss": 0.2631, + "step": 862 + }, + { + "epoch": 0.9903887534069717, + "grad_norm": 101.5, + "learning_rate": 1.782874617737003e-05, + "loss": 1.0281, + "step": 863 + }, + { + "epoch": 0.9915363649404677, + "grad_norm": 36.75, + "learning_rate": 1.7823649337410808e-05, + "loss": 0.6591, + "step": 864 + }, + { + "epoch": 0.9926839764739636, + "grad_norm": 23.75, + "learning_rate": 1.7818552497451582e-05, + "loss": 0.2017, + "step": 865 + }, + { + "epoch": 0.9938315880074595, + "grad_norm": 11.0625, + "learning_rate": 1.7813455657492356e-05, + "loss": 0.6496, + "step": 866 + }, + { + "epoch": 0.9949791995409554, + "grad_norm": 52.0, + "learning_rate": 1.780835881753313e-05, + "loss": 0.7726, + "step": 867 + }, + { + "epoch": 0.9961268110744513, + "grad_norm": 8.9375, + "learning_rate": 1.7803261977573904e-05, + "loss": 0.2688, + "step": 868 + }, + { + "epoch": 0.9972744226079472, + "grad_norm": 29.625, + "learning_rate": 1.779816513761468e-05, + "loss": 0.4991, + "step": 869 + }, + { + "epoch": 0.9984220341414431, + "grad_norm": 6.59375, + "learning_rate": 1.7793068297655455e-05, + "loss": 0.3232, + "step": 870 + }, + { + "epoch": 0.999569645674939, + "grad_norm": 39.25, + "learning_rate": 1.778797145769623e-05, + "loss": 0.4866, + "step": 871 + }, + { + "epoch": 1.0, + "grad_norm": 31.25, + "learning_rate": 1.7782874617737006e-05, + "loss": 0.1617, + "step": 872 + }, + { + "epoch": 1.0011476115334959, + "grad_norm": 54.25, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.5081, + "step": 873 + }, + { + "epoch": 1.002295223066992, + "grad_norm": 59.5, + "learning_rate": 1.7772680937818554e-05, + "loss": 0.6284, + "step": 874 + }, + { + "epoch": 1.0034428346004878, + "grad_norm": 62.0, + "learning_rate": 1.7767584097859328e-05, + "loss": 0.6364, + "step": 875 + }, + { + "epoch": 1.0045904461339836, + "grad_norm": 99.5, + "learning_rate": 1.7762487257900102e-05, + "loss": 1.521, + "step": 876 + }, + { + "epoch": 1.0057380576674795, + "grad_norm": 105.5, + "learning_rate": 1.775739041794088e-05, + "loss": 1.0837, + "step": 877 + }, + { + "epoch": 1.0068856692009756, + "grad_norm": 117.0, + "learning_rate": 1.7752293577981653e-05, + "loss": 1.0871, + "step": 878 + }, + { + "epoch": 1.0080332807344714, + "grad_norm": 91.5, + "learning_rate": 1.7747196738022427e-05, + "loss": 0.7927, + "step": 879 + }, + { + "epoch": 1.0091808922679673, + "grad_norm": 68.5, + "learning_rate": 1.7742099898063204e-05, + "loss": 0.6309, + "step": 880 + }, + { + "epoch": 1.010328503801463, + "grad_norm": 11.3125, + "learning_rate": 1.7737003058103978e-05, + "loss": 0.3369, + "step": 881 + }, + { + "epoch": 1.0114761153349592, + "grad_norm": 11.125, + "learning_rate": 1.7731906218144752e-05, + "loss": 0.2181, + "step": 882 + }, + { + "epoch": 1.012623726868455, + "grad_norm": 23.75, + "learning_rate": 1.7726809378185526e-05, + "loss": 0.4936, + "step": 883 + }, + { + "epoch": 1.0137713384019509, + "grad_norm": 26.25, + "learning_rate": 1.77217125382263e-05, + "loss": 0.5372, + "step": 884 + }, + { + "epoch": 1.014918949935447, + "grad_norm": 23.375, + "learning_rate": 1.7716615698267077e-05, + "loss": 0.2898, + "step": 885 + }, + { + "epoch": 1.0160665614689428, + "grad_norm": 61.25, + "learning_rate": 1.771151885830785e-05, + "loss": 1.0463, + "step": 886 + }, + { + "epoch": 1.0172141730024387, + "grad_norm": 16.125, + "learning_rate": 1.7706422018348625e-05, + "loss": 0.3061, + "step": 887 + }, + { + "epoch": 1.0183617845359345, + "grad_norm": 83.5, + "learning_rate": 1.77013251783894e-05, + "loss": 0.7545, + "step": 888 + }, + { + "epoch": 1.0195093960694306, + "grad_norm": 57.75, + "learning_rate": 1.7696228338430176e-05, + "loss": 0.7643, + "step": 889 + }, + { + "epoch": 1.0206570076029264, + "grad_norm": 19.125, + "learning_rate": 1.769113149847095e-05, + "loss": 0.6013, + "step": 890 + }, + { + "epoch": 1.0218046191364223, + "grad_norm": 12.25, + "learning_rate": 1.7686034658511724e-05, + "loss": 0.4579, + "step": 891 + }, + { + "epoch": 1.0229522306699181, + "grad_norm": 38.25, + "learning_rate": 1.7680937818552498e-05, + "loss": 0.4669, + "step": 892 + }, + { + "epoch": 1.0240998422034142, + "grad_norm": 68.0, + "learning_rate": 1.767584097859327e-05, + "loss": 0.4824, + "step": 893 + }, + { + "epoch": 1.02524745373691, + "grad_norm": 10.5625, + "learning_rate": 1.767074413863405e-05, + "loss": 0.5689, + "step": 894 + }, + { + "epoch": 1.026395065270406, + "grad_norm": 8.875, + "learning_rate": 1.7665647298674823e-05, + "loss": 0.3161, + "step": 895 + }, + { + "epoch": 1.0275426768039018, + "grad_norm": 23.625, + "learning_rate": 1.7660550458715597e-05, + "loss": 0.4443, + "step": 896 + }, + { + "epoch": 1.0286902883373978, + "grad_norm": 15.75, + "learning_rate": 1.7655453618756374e-05, + "loss": 0.2331, + "step": 897 + }, + { + "epoch": 1.0298378998708937, + "grad_norm": 8.25, + "learning_rate": 1.7650356778797148e-05, + "loss": 0.3554, + "step": 898 + }, + { + "epoch": 1.0309855114043895, + "grad_norm": 14.5, + "learning_rate": 1.7645259938837922e-05, + "loss": 0.6107, + "step": 899 + }, + { + "epoch": 1.0321331229378856, + "grad_norm": 42.25, + "learning_rate": 1.76401630988787e-05, + "loss": 0.3624, + "step": 900 + }, + { + "epoch": 1.0321331229378856, + "eval_accuracy": 0.64, + "eval_loss": 0.6176496744155884, + "eval_runtime": 49.5336, + "eval_samples_per_second": 2.019, + "eval_steps_per_second": 2.019, + "step": 900 + }, + { + "epoch": 1.0332807344713815, + "grad_norm": 23.0, + "learning_rate": 1.763506625891947e-05, + "loss": 0.4606, + "step": 901 + }, + { + "epoch": 1.0344283460048773, + "grad_norm": 50.75, + "learning_rate": 1.7629969418960247e-05, + "loss": 0.5176, + "step": 902 + }, + { + "epoch": 1.0355759575383732, + "grad_norm": 58.0, + "learning_rate": 1.762487257900102e-05, + "loss": 0.3688, + "step": 903 + }, + { + "epoch": 1.0367235690718692, + "grad_norm": 36.0, + "learning_rate": 1.7619775739041795e-05, + "loss": 0.7414, + "step": 904 + }, + { + "epoch": 1.037871180605365, + "grad_norm": 28.5, + "learning_rate": 1.7614678899082572e-05, + "loss": 0.8468, + "step": 905 + }, + { + "epoch": 1.039018792138861, + "grad_norm": 26.25, + "learning_rate": 1.7609582059123346e-05, + "loss": 0.4338, + "step": 906 + }, + { + "epoch": 1.040166403672357, + "grad_norm": 122.5, + "learning_rate": 1.760448521916412e-05, + "loss": 0.9431, + "step": 907 + }, + { + "epoch": 1.0413140152058529, + "grad_norm": 15.375, + "learning_rate": 1.7599388379204894e-05, + "loss": 0.5602, + "step": 908 + }, + { + "epoch": 1.0424616267393487, + "grad_norm": 96.5, + "learning_rate": 1.7594291539245668e-05, + "loss": 0.6268, + "step": 909 + }, + { + "epoch": 1.0436092382728446, + "grad_norm": 59.25, + "learning_rate": 1.7589194699286445e-05, + "loss": 0.404, + "step": 910 + }, + { + "epoch": 1.0447568498063406, + "grad_norm": 30.5, + "learning_rate": 1.758409785932722e-05, + "loss": 0.5772, + "step": 911 + }, + { + "epoch": 1.0459044613398365, + "grad_norm": 15.875, + "learning_rate": 1.7579001019367993e-05, + "loss": 0.4666, + "step": 912 + }, + { + "epoch": 1.0470520728733324, + "grad_norm": 20.25, + "learning_rate": 1.7573904179408767e-05, + "loss": 0.4576, + "step": 913 + }, + { + "epoch": 1.0481996844068282, + "grad_norm": 33.5, + "learning_rate": 1.7568807339449544e-05, + "loss": 0.4427, + "step": 914 + }, + { + "epoch": 1.0493472959403243, + "grad_norm": 6.59375, + "learning_rate": 1.7563710499490318e-05, + "loss": 0.1466, + "step": 915 + }, + { + "epoch": 1.0504949074738201, + "grad_norm": 44.0, + "learning_rate": 1.7558613659531092e-05, + "loss": 0.3564, + "step": 916 + }, + { + "epoch": 1.051642519007316, + "grad_norm": 26.75, + "learning_rate": 1.755351681957187e-05, + "loss": 0.6648, + "step": 917 + }, + { + "epoch": 1.0527901305408118, + "grad_norm": 53.5, + "learning_rate": 1.754841997961264e-05, + "loss": 0.5389, + "step": 918 + }, + { + "epoch": 1.053937742074308, + "grad_norm": 23.875, + "learning_rate": 1.7543323139653417e-05, + "loss": 0.4424, + "step": 919 + }, + { + "epoch": 1.0550853536078038, + "grad_norm": 34.75, + "learning_rate": 1.753822629969419e-05, + "loss": 0.4035, + "step": 920 + }, + { + "epoch": 1.0562329651412996, + "grad_norm": 25.125, + "learning_rate": 1.7533129459734965e-05, + "loss": 0.5704, + "step": 921 + }, + { + "epoch": 1.0573805766747957, + "grad_norm": 25.625, + "learning_rate": 1.7528032619775742e-05, + "loss": 0.7077, + "step": 922 + }, + { + "epoch": 1.0585281882082915, + "grad_norm": 11.25, + "learning_rate": 1.7522935779816516e-05, + "loss": 0.228, + "step": 923 + }, + { + "epoch": 1.0596757997417874, + "grad_norm": 31.125, + "learning_rate": 1.751783893985729e-05, + "loss": 0.5716, + "step": 924 + }, + { + "epoch": 1.0608234112752832, + "grad_norm": 21.25, + "learning_rate": 1.7512742099898067e-05, + "loss": 0.2658, + "step": 925 + }, + { + "epoch": 1.0619710228087793, + "grad_norm": 12.875, + "learning_rate": 1.7507645259938838e-05, + "loss": 0.2415, + "step": 926 + }, + { + "epoch": 1.0631186343422752, + "grad_norm": 100.5, + "learning_rate": 1.7502548419979615e-05, + "loss": 1.0011, + "step": 927 + }, + { + "epoch": 1.064266245875771, + "grad_norm": 67.0, + "learning_rate": 1.749745158002039e-05, + "loss": 0.6753, + "step": 928 + }, + { + "epoch": 1.0654138574092669, + "grad_norm": 109.0, + "learning_rate": 1.7492354740061163e-05, + "loss": 0.8631, + "step": 929 + }, + { + "epoch": 1.066561468942763, + "grad_norm": 68.5, + "learning_rate": 1.7487257900101937e-05, + "loss": 1.0799, + "step": 930 + }, + { + "epoch": 1.0677090804762588, + "grad_norm": 74.0, + "learning_rate": 1.7482161060142714e-05, + "loss": 0.5419, + "step": 931 + }, + { + "epoch": 1.0688566920097546, + "grad_norm": 61.25, + "learning_rate": 1.7477064220183488e-05, + "loss": 0.6041, + "step": 932 + }, + { + "epoch": 1.0700043035432507, + "grad_norm": 32.25, + "learning_rate": 1.7471967380224262e-05, + "loss": 0.8215, + "step": 933 + }, + { + "epoch": 1.0711519150767466, + "grad_norm": 45.25, + "learning_rate": 1.746687054026504e-05, + "loss": 0.5843, + "step": 934 + }, + { + "epoch": 1.0722995266102424, + "grad_norm": 104.5, + "learning_rate": 1.746177370030581e-05, + "loss": 1.3222, + "step": 935 + }, + { + "epoch": 1.0734471381437383, + "grad_norm": 56.0, + "learning_rate": 1.7456676860346587e-05, + "loss": 0.5504, + "step": 936 + }, + { + "epoch": 1.0745947496772343, + "grad_norm": 54.0, + "learning_rate": 1.745158002038736e-05, + "loss": 0.8466, + "step": 937 + }, + { + "epoch": 1.0757423612107302, + "grad_norm": 27.375, + "learning_rate": 1.7446483180428135e-05, + "loss": 0.9508, + "step": 938 + }, + { + "epoch": 1.076889972744226, + "grad_norm": 14.625, + "learning_rate": 1.7441386340468912e-05, + "loss": 0.3969, + "step": 939 + }, + { + "epoch": 1.078037584277722, + "grad_norm": 75.0, + "learning_rate": 1.7436289500509686e-05, + "loss": 0.9936, + "step": 940 + }, + { + "epoch": 1.079185195811218, + "grad_norm": 51.75, + "learning_rate": 1.743119266055046e-05, + "loss": 0.5978, + "step": 941 + }, + { + "epoch": 1.0803328073447138, + "grad_norm": 57.5, + "learning_rate": 1.7426095820591237e-05, + "loss": 0.6549, + "step": 942 + }, + { + "epoch": 1.0814804188782097, + "grad_norm": 19.625, + "learning_rate": 1.7420998980632008e-05, + "loss": 0.4942, + "step": 943 + }, + { + "epoch": 1.0826280304117057, + "grad_norm": 83.0, + "learning_rate": 1.7415902140672785e-05, + "loss": 0.6702, + "step": 944 + }, + { + "epoch": 1.0837756419452016, + "grad_norm": 42.5, + "learning_rate": 1.741080530071356e-05, + "loss": 0.6299, + "step": 945 + }, + { + "epoch": 1.0849232534786974, + "grad_norm": 19.25, + "learning_rate": 1.7405708460754333e-05, + "loss": 0.5421, + "step": 946 + }, + { + "epoch": 1.0860708650121933, + "grad_norm": 34.0, + "learning_rate": 1.740061162079511e-05, + "loss": 0.7019, + "step": 947 + }, + { + "epoch": 1.0872184765456894, + "grad_norm": 34.0, + "learning_rate": 1.7395514780835884e-05, + "loss": 0.5919, + "step": 948 + }, + { + "epoch": 1.0883660880791852, + "grad_norm": 17.875, + "learning_rate": 1.7390417940876658e-05, + "loss": 0.2788, + "step": 949 + }, + { + "epoch": 1.089513699612681, + "grad_norm": 16.0, + "learning_rate": 1.738532110091743e-05, + "loss": 0.7744, + "step": 950 + }, + { + "epoch": 1.090661311146177, + "grad_norm": 61.5, + "learning_rate": 1.7380224260958206e-05, + "loss": 0.6198, + "step": 951 + }, + { + "epoch": 1.091808922679673, + "grad_norm": 17.5, + "learning_rate": 1.7375127420998983e-05, + "loss": 0.5995, + "step": 952 + }, + { + "epoch": 1.0929565342131689, + "grad_norm": 15.0, + "learning_rate": 1.7370030581039757e-05, + "loss": 0.4392, + "step": 953 + }, + { + "epoch": 1.0941041457466647, + "grad_norm": 54.75, + "learning_rate": 1.736493374108053e-05, + "loss": 0.4673, + "step": 954 + }, + { + "epoch": 1.0952517572801606, + "grad_norm": 31.5, + "learning_rate": 1.7359836901121305e-05, + "loss": 0.5318, + "step": 955 + }, + { + "epoch": 1.0963993688136566, + "grad_norm": 35.0, + "learning_rate": 1.7354740061162082e-05, + "loss": 0.5184, + "step": 956 + }, + { + "epoch": 1.0975469803471525, + "grad_norm": 23.75, + "learning_rate": 1.7349643221202856e-05, + "loss": 0.5015, + "step": 957 + }, + { + "epoch": 1.0986945918806483, + "grad_norm": 54.0, + "learning_rate": 1.734454638124363e-05, + "loss": 0.5254, + "step": 958 + }, + { + "epoch": 1.0998422034141444, + "grad_norm": 10.375, + "learning_rate": 1.7339449541284407e-05, + "loss": 0.4739, + "step": 959 + }, + { + "epoch": 1.1009898149476403, + "grad_norm": 23.5, + "learning_rate": 1.7334352701325177e-05, + "loss": 0.5565, + "step": 960 + }, + { + "epoch": 1.102137426481136, + "grad_norm": 11.875, + "learning_rate": 1.7329255861365955e-05, + "loss": 0.3887, + "step": 961 + }, + { + "epoch": 1.103285038014632, + "grad_norm": 10.875, + "learning_rate": 1.732415902140673e-05, + "loss": 0.6166, + "step": 962 + }, + { + "epoch": 1.104432649548128, + "grad_norm": 43.75, + "learning_rate": 1.7319062181447503e-05, + "loss": 0.9438, + "step": 963 + }, + { + "epoch": 1.1055802610816239, + "grad_norm": 17.375, + "learning_rate": 1.731396534148828e-05, + "loss": 0.6131, + "step": 964 + }, + { + "epoch": 1.1067278726151197, + "grad_norm": 36.5, + "learning_rate": 1.7308868501529054e-05, + "loss": 0.5897, + "step": 965 + }, + { + "epoch": 1.1078754841486158, + "grad_norm": 34.25, + "learning_rate": 1.7303771661569828e-05, + "loss": 0.473, + "step": 966 + }, + { + "epoch": 1.1090230956821117, + "grad_norm": 23.75, + "learning_rate": 1.7298674821610605e-05, + "loss": 0.6736, + "step": 967 + }, + { + "epoch": 1.1101707072156075, + "grad_norm": 47.25, + "learning_rate": 1.7293577981651376e-05, + "loss": 0.4113, + "step": 968 + }, + { + "epoch": 1.1113183187491034, + "grad_norm": 13.6875, + "learning_rate": 1.7288481141692153e-05, + "loss": 0.2634, + "step": 969 + }, + { + "epoch": 1.1124659302825994, + "grad_norm": 36.75, + "learning_rate": 1.7283384301732927e-05, + "loss": 0.5289, + "step": 970 + }, + { + "epoch": 1.1136135418160953, + "grad_norm": 28.625, + "learning_rate": 1.72782874617737e-05, + "loss": 0.5775, + "step": 971 + }, + { + "epoch": 1.1147611533495911, + "grad_norm": 42.25, + "learning_rate": 1.7273190621814478e-05, + "loss": 0.7163, + "step": 972 + }, + { + "epoch": 1.115908764883087, + "grad_norm": 57.0, + "learning_rate": 1.7268093781855252e-05, + "loss": 0.5009, + "step": 973 + }, + { + "epoch": 1.117056376416583, + "grad_norm": 22.375, + "learning_rate": 1.7262996941896026e-05, + "loss": 0.4101, + "step": 974 + }, + { + "epoch": 1.118203987950079, + "grad_norm": 41.25, + "learning_rate": 1.72579001019368e-05, + "loss": 0.4195, + "step": 975 + }, + { + "epoch": 1.1193515994835748, + "grad_norm": 17.625, + "learning_rate": 1.7252803261977577e-05, + "loss": 0.4409, + "step": 976 + }, + { + "epoch": 1.1204992110170706, + "grad_norm": 18.375, + "learning_rate": 1.724770642201835e-05, + "loss": 0.4041, + "step": 977 + }, + { + "epoch": 1.1216468225505667, + "grad_norm": 39.0, + "learning_rate": 1.7242609582059125e-05, + "loss": 0.6333, + "step": 978 + }, + { + "epoch": 1.1227944340840625, + "grad_norm": 62.25, + "learning_rate": 1.72375127420999e-05, + "loss": 0.648, + "step": 979 + }, + { + "epoch": 1.1239420456175584, + "grad_norm": 57.0, + "learning_rate": 1.7232415902140673e-05, + "loss": 0.5549, + "step": 980 + }, + { + "epoch": 1.1250896571510545, + "grad_norm": 17.875, + "learning_rate": 1.722731906218145e-05, + "loss": 0.3829, + "step": 981 + }, + { + "epoch": 1.1262372686845503, + "grad_norm": 23.5, + "learning_rate": 1.7222222222222224e-05, + "loss": 0.3594, + "step": 982 + }, + { + "epoch": 1.1273848802180462, + "grad_norm": 53.0, + "learning_rate": 1.7217125382262998e-05, + "loss": 0.6625, + "step": 983 + }, + { + "epoch": 1.128532491751542, + "grad_norm": 49.75, + "learning_rate": 1.7212028542303775e-05, + "loss": 0.4887, + "step": 984 + }, + { + "epoch": 1.129680103285038, + "grad_norm": 15.0, + "learning_rate": 1.7206931702344545e-05, + "loss": 0.5548, + "step": 985 + }, + { + "epoch": 1.130827714818534, + "grad_norm": 40.5, + "learning_rate": 1.7201834862385323e-05, + "loss": 0.7024, + "step": 986 + }, + { + "epoch": 1.1319753263520298, + "grad_norm": 58.25, + "learning_rate": 1.7196738022426097e-05, + "loss": 0.4027, + "step": 987 + }, + { + "epoch": 1.1331229378855259, + "grad_norm": 70.5, + "learning_rate": 1.719164118246687e-05, + "loss": 0.6295, + "step": 988 + }, + { + "epoch": 1.1342705494190217, + "grad_norm": 34.25, + "learning_rate": 1.7186544342507648e-05, + "loss": 0.3274, + "step": 989 + }, + { + "epoch": 1.1354181609525176, + "grad_norm": 20.0, + "learning_rate": 1.7181447502548422e-05, + "loss": 0.1818, + "step": 990 + }, + { + "epoch": 1.1365657724860134, + "grad_norm": 46.5, + "learning_rate": 1.7176350662589196e-05, + "loss": 0.4344, + "step": 991 + }, + { + "epoch": 1.1377133840195093, + "grad_norm": 60.0, + "learning_rate": 1.7171253822629973e-05, + "loss": 0.3682, + "step": 992 + }, + { + "epoch": 1.1388609955530054, + "grad_norm": 30.25, + "learning_rate": 1.7166156982670747e-05, + "loss": 0.4771, + "step": 993 + }, + { + "epoch": 1.1400086070865012, + "grad_norm": 23.375, + "learning_rate": 1.716106014271152e-05, + "loss": 0.4939, + "step": 994 + }, + { + "epoch": 1.141156218619997, + "grad_norm": 17.625, + "learning_rate": 1.7155963302752295e-05, + "loss": 0.6885, + "step": 995 + }, + { + "epoch": 1.1423038301534931, + "grad_norm": 64.5, + "learning_rate": 1.715086646279307e-05, + "loss": 0.8163, + "step": 996 + }, + { + "epoch": 1.143451441686989, + "grad_norm": 39.5, + "learning_rate": 1.7145769622833846e-05, + "loss": 0.3577, + "step": 997 + }, + { + "epoch": 1.1445990532204848, + "grad_norm": 8.6875, + "learning_rate": 1.714067278287462e-05, + "loss": 0.202, + "step": 998 + }, + { + "epoch": 1.1457466647539807, + "grad_norm": 52.0, + "learning_rate": 1.7135575942915394e-05, + "loss": 0.5541, + "step": 999 + }, + { + "epoch": 1.1468942762874768, + "grad_norm": 22.125, + "learning_rate": 1.7130479102956168e-05, + "loss": 0.2125, + "step": 1000 + }, + { + "epoch": 1.1468942762874768, + "eval_accuracy": 0.6, + "eval_loss": 0.5487725734710693, + "eval_runtime": 50.2711, + "eval_samples_per_second": 1.989, + "eval_steps_per_second": 1.989, + "step": 1000 + }, + { + "epoch": 1.1480418878209726, + "grad_norm": 21.25, + "learning_rate": 1.7125382262996945e-05, + "loss": 0.3415, + "step": 1001 + }, + { + "epoch": 1.1491894993544685, + "grad_norm": 39.5, + "learning_rate": 1.712028542303772e-05, + "loss": 0.6746, + "step": 1002 + }, + { + "epoch": 1.1503371108879645, + "grad_norm": 16.875, + "learning_rate": 1.7115188583078493e-05, + "loss": 0.7315, + "step": 1003 + }, + { + "epoch": 1.1514847224214604, + "grad_norm": 13.6875, + "learning_rate": 1.7110091743119267e-05, + "loss": 0.5293, + "step": 1004 + }, + { + "epoch": 1.1526323339549562, + "grad_norm": 10.4375, + "learning_rate": 1.710499490316004e-05, + "loss": 0.4509, + "step": 1005 + }, + { + "epoch": 1.153779945488452, + "grad_norm": 18.375, + "learning_rate": 1.7099898063200818e-05, + "loss": 0.3469, + "step": 1006 + }, + { + "epoch": 1.1549275570219482, + "grad_norm": 12.375, + "learning_rate": 1.709480122324159e-05, + "loss": 0.4868, + "step": 1007 + }, + { + "epoch": 1.156075168555444, + "grad_norm": 57.5, + "learning_rate": 1.7089704383282366e-05, + "loss": 0.5211, + "step": 1008 + }, + { + "epoch": 1.1572227800889399, + "grad_norm": 13.875, + "learning_rate": 1.7084607543323143e-05, + "loss": 0.3623, + "step": 1009 + }, + { + "epoch": 1.1583703916224357, + "grad_norm": 69.0, + "learning_rate": 1.7079510703363917e-05, + "loss": 0.274, + "step": 1010 + }, + { + "epoch": 1.1595180031559318, + "grad_norm": 18.5, + "learning_rate": 1.707441386340469e-05, + "loss": 0.2365, + "step": 1011 + }, + { + "epoch": 1.1606656146894276, + "grad_norm": 40.25, + "learning_rate": 1.7069317023445465e-05, + "loss": 0.2999, + "step": 1012 + }, + { + "epoch": 1.1618132262229235, + "grad_norm": 57.5, + "learning_rate": 1.706422018348624e-05, + "loss": 0.5137, + "step": 1013 + }, + { + "epoch": 1.1629608377564193, + "grad_norm": 20.875, + "learning_rate": 1.7059123343527016e-05, + "loss": 0.6691, + "step": 1014 + }, + { + "epoch": 1.1641084492899154, + "grad_norm": 30.875, + "learning_rate": 1.705402650356779e-05, + "loss": 0.6642, + "step": 1015 + }, + { + "epoch": 1.1652560608234113, + "grad_norm": 15.9375, + "learning_rate": 1.7048929663608564e-05, + "loss": 0.2695, + "step": 1016 + }, + { + "epoch": 1.1664036723569071, + "grad_norm": 88.5, + "learning_rate": 1.7043832823649338e-05, + "loss": 0.8211, + "step": 1017 + }, + { + "epoch": 1.1675512838904032, + "grad_norm": 45.5, + "learning_rate": 1.7038735983690115e-05, + "loss": 0.7956, + "step": 1018 + }, + { + "epoch": 1.168698895423899, + "grad_norm": 80.0, + "learning_rate": 1.703363914373089e-05, + "loss": 0.8805, + "step": 1019 + }, + { + "epoch": 1.169846506957395, + "grad_norm": 15.1875, + "learning_rate": 1.7028542303771663e-05, + "loss": 0.5262, + "step": 1020 + }, + { + "epoch": 1.1709941184908907, + "grad_norm": 60.75, + "learning_rate": 1.7023445463812437e-05, + "loss": 1.1968, + "step": 1021 + }, + { + "epoch": 1.1721417300243868, + "grad_norm": 31.375, + "learning_rate": 1.701834862385321e-05, + "loss": 0.744, + "step": 1022 + }, + { + "epoch": 1.1732893415578827, + "grad_norm": 20.625, + "learning_rate": 1.7013251783893988e-05, + "loss": 0.3321, + "step": 1023 + }, + { + "epoch": 1.1744369530913785, + "grad_norm": 19.875, + "learning_rate": 1.700815494393476e-05, + "loss": 0.4447, + "step": 1024 + }, + { + "epoch": 1.1755845646248746, + "grad_norm": 23.625, + "learning_rate": 1.7003058103975536e-05, + "loss": 0.3697, + "step": 1025 + }, + { + "epoch": 1.1767321761583704, + "grad_norm": 20.625, + "learning_rate": 1.6997961264016313e-05, + "loss": 0.2759, + "step": 1026 + }, + { + "epoch": 1.1778797876918663, + "grad_norm": 58.75, + "learning_rate": 1.6992864424057087e-05, + "loss": 0.7182, + "step": 1027 + }, + { + "epoch": 1.1790273992253621, + "grad_norm": 6.96875, + "learning_rate": 1.698776758409786e-05, + "loss": 0.1403, + "step": 1028 + }, + { + "epoch": 1.1801750107588582, + "grad_norm": 24.875, + "learning_rate": 1.6982670744138638e-05, + "loss": 0.513, + "step": 1029 + }, + { + "epoch": 1.181322622292354, + "grad_norm": 15.8125, + "learning_rate": 1.697757390417941e-05, + "loss": 0.5238, + "step": 1030 + }, + { + "epoch": 1.18247023382585, + "grad_norm": 22.0, + "learning_rate": 1.6972477064220186e-05, + "loss": 0.445, + "step": 1031 + }, + { + "epoch": 1.1836178453593458, + "grad_norm": 86.5, + "learning_rate": 1.696738022426096e-05, + "loss": 0.7085, + "step": 1032 + }, + { + "epoch": 1.1847654568928419, + "grad_norm": 63.75, + "learning_rate": 1.6962283384301734e-05, + "loss": 1.0473, + "step": 1033 + }, + { + "epoch": 1.1859130684263377, + "grad_norm": 63.5, + "learning_rate": 1.695718654434251e-05, + "loss": 0.3947, + "step": 1034 + }, + { + "epoch": 1.1870606799598336, + "grad_norm": 12.375, + "learning_rate": 1.6952089704383285e-05, + "loss": 0.3453, + "step": 1035 + }, + { + "epoch": 1.1882082914933294, + "grad_norm": 9.1875, + "learning_rate": 1.694699286442406e-05, + "loss": 0.2802, + "step": 1036 + }, + { + "epoch": 1.1893559030268255, + "grad_norm": 29.75, + "learning_rate": 1.6941896024464833e-05, + "loss": 0.543, + "step": 1037 + }, + { + "epoch": 1.1905035145603213, + "grad_norm": 60.75, + "learning_rate": 1.6936799184505606e-05, + "loss": 0.7737, + "step": 1038 + }, + { + "epoch": 1.1916511260938172, + "grad_norm": 38.0, + "learning_rate": 1.6931702344546384e-05, + "loss": 0.7725, + "step": 1039 + }, + { + "epoch": 1.1927987376273133, + "grad_norm": 9.9375, + "learning_rate": 1.6926605504587158e-05, + "loss": 0.2875, + "step": 1040 + }, + { + "epoch": 1.193946349160809, + "grad_norm": 53.5, + "learning_rate": 1.692150866462793e-05, + "loss": 0.6683, + "step": 1041 + }, + { + "epoch": 1.195093960694305, + "grad_norm": 58.25, + "learning_rate": 1.6916411824668705e-05, + "loss": 0.4406, + "step": 1042 + }, + { + "epoch": 1.1962415722278008, + "grad_norm": 46.0, + "learning_rate": 1.6911314984709483e-05, + "loss": 0.6739, + "step": 1043 + }, + { + "epoch": 1.1973891837612969, + "grad_norm": 47.25, + "learning_rate": 1.6906218144750257e-05, + "loss": 0.278, + "step": 1044 + }, + { + "epoch": 1.1985367952947927, + "grad_norm": 42.5, + "learning_rate": 1.690112130479103e-05, + "loss": 0.4348, + "step": 1045 + }, + { + "epoch": 1.1996844068282886, + "grad_norm": 20.0, + "learning_rate": 1.6896024464831804e-05, + "loss": 0.1145, + "step": 1046 + }, + { + "epoch": 1.2008320183617847, + "grad_norm": 36.0, + "learning_rate": 1.689092762487258e-05, + "loss": 0.7542, + "step": 1047 + }, + { + "epoch": 1.2019796298952805, + "grad_norm": 7.5625, + "learning_rate": 1.6885830784913356e-05, + "loss": 0.1475, + "step": 1048 + }, + { + "epoch": 1.2031272414287764, + "grad_norm": 54.75, + "learning_rate": 1.688073394495413e-05, + "loss": 0.3082, + "step": 1049 + }, + { + "epoch": 1.2042748529622722, + "grad_norm": 24.5, + "learning_rate": 1.6875637104994903e-05, + "loss": 0.8289, + "step": 1050 + }, + { + "epoch": 1.205422464495768, + "grad_norm": 69.5, + "learning_rate": 1.687054026503568e-05, + "loss": 0.7198, + "step": 1051 + }, + { + "epoch": 1.2065700760292641, + "grad_norm": 57.25, + "learning_rate": 1.6865443425076455e-05, + "loss": 0.3022, + "step": 1052 + }, + { + "epoch": 1.20771768756276, + "grad_norm": 43.5, + "learning_rate": 1.686034658511723e-05, + "loss": 0.5085, + "step": 1053 + }, + { + "epoch": 1.2088652990962558, + "grad_norm": 16.75, + "learning_rate": 1.6855249745158006e-05, + "loss": 0.4784, + "step": 1054 + }, + { + "epoch": 1.210012910629752, + "grad_norm": 14.3125, + "learning_rate": 1.6850152905198776e-05, + "loss": 0.3837, + "step": 1055 + }, + { + "epoch": 1.2111605221632478, + "grad_norm": 9.625, + "learning_rate": 1.6845056065239554e-05, + "loss": 0.2057, + "step": 1056 + }, + { + "epoch": 1.2123081336967436, + "grad_norm": 23.625, + "learning_rate": 1.6839959225280328e-05, + "loss": 0.9273, + "step": 1057 + }, + { + "epoch": 1.2134557452302395, + "grad_norm": 27.125, + "learning_rate": 1.68348623853211e-05, + "loss": 0.5371, + "step": 1058 + }, + { + "epoch": 1.2146033567637355, + "grad_norm": 55.75, + "learning_rate": 1.682976554536188e-05, + "loss": 0.5682, + "step": 1059 + }, + { + "epoch": 1.2157509682972314, + "grad_norm": 55.25, + "learning_rate": 1.6824668705402653e-05, + "loss": 0.4674, + "step": 1060 + }, + { + "epoch": 1.2168985798307272, + "grad_norm": 18.625, + "learning_rate": 1.6819571865443427e-05, + "loss": 0.515, + "step": 1061 + }, + { + "epoch": 1.2180461913642233, + "grad_norm": 118.5, + "learning_rate": 1.68144750254842e-05, + "loss": 1.1109, + "step": 1062 + }, + { + "epoch": 1.2191938028977192, + "grad_norm": 45.5, + "learning_rate": 1.6809378185524974e-05, + "loss": 0.2941, + "step": 1063 + }, + { + "epoch": 1.220341414431215, + "grad_norm": 21.0, + "learning_rate": 1.6804281345565752e-05, + "loss": 0.4562, + "step": 1064 + }, + { + "epoch": 1.2214890259647109, + "grad_norm": 9.375, + "learning_rate": 1.6799184505606526e-05, + "loss": 0.1801, + "step": 1065 + }, + { + "epoch": 1.222636637498207, + "grad_norm": 12.5, + "learning_rate": 1.67940876656473e-05, + "loss": 0.2672, + "step": 1066 + }, + { + "epoch": 1.2237842490317028, + "grad_norm": 77.5, + "learning_rate": 1.6788990825688073e-05, + "loss": 0.6601, + "step": 1067 + }, + { + "epoch": 1.2249318605651986, + "grad_norm": 12.0625, + "learning_rate": 1.678389398572885e-05, + "loss": 0.1519, + "step": 1068 + }, + { + "epoch": 1.2260794720986945, + "grad_norm": 24.75, + "learning_rate": 1.6778797145769625e-05, + "loss": 0.5777, + "step": 1069 + }, + { + "epoch": 1.2272270836321906, + "grad_norm": 31.125, + "learning_rate": 1.67737003058104e-05, + "loss": 0.7135, + "step": 1070 + }, + { + "epoch": 1.2283746951656864, + "grad_norm": 63.5, + "learning_rate": 1.6768603465851176e-05, + "loss": 0.8834, + "step": 1071 + }, + { + "epoch": 1.2295223066991823, + "grad_norm": 64.0, + "learning_rate": 1.6763506625891946e-05, + "loss": 0.8991, + "step": 1072 + }, + { + "epoch": 1.2306699182326781, + "grad_norm": 21.375, + "learning_rate": 1.6758409785932724e-05, + "loss": 0.2697, + "step": 1073 + }, + { + "epoch": 1.2318175297661742, + "grad_norm": 44.0, + "learning_rate": 1.6753312945973498e-05, + "loss": 0.4755, + "step": 1074 + }, + { + "epoch": 1.23296514129967, + "grad_norm": 28.875, + "learning_rate": 1.674821610601427e-05, + "loss": 0.3531, + "step": 1075 + }, + { + "epoch": 1.234112752833166, + "grad_norm": 33.0, + "learning_rate": 1.674311926605505e-05, + "loss": 0.1501, + "step": 1076 + }, + { + "epoch": 1.235260364366662, + "grad_norm": 23.0, + "learning_rate": 1.6738022426095823e-05, + "loss": 0.7386, + "step": 1077 + }, + { + "epoch": 1.2364079759001578, + "grad_norm": 18.75, + "learning_rate": 1.6732925586136597e-05, + "loss": 0.1371, + "step": 1078 + }, + { + "epoch": 1.2375555874336537, + "grad_norm": 32.25, + "learning_rate": 1.672782874617737e-05, + "loss": 0.4283, + "step": 1079 + }, + { + "epoch": 1.2387031989671495, + "grad_norm": 15.25, + "learning_rate": 1.6722731906218144e-05, + "loss": 0.221, + "step": 1080 + }, + { + "epoch": 1.2398508105006456, + "grad_norm": 30.625, + "learning_rate": 1.671763506625892e-05, + "loss": 0.4018, + "step": 1081 + }, + { + "epoch": 1.2409984220341415, + "grad_norm": 67.0, + "learning_rate": 1.6712538226299696e-05, + "loss": 0.9727, + "step": 1082 + }, + { + "epoch": 1.2421460335676373, + "grad_norm": 31.375, + "learning_rate": 1.670744138634047e-05, + "loss": 0.4461, + "step": 1083 + }, + { + "epoch": 1.2432936451011334, + "grad_norm": 15.9375, + "learning_rate": 1.6702344546381243e-05, + "loss": 0.4182, + "step": 1084 + }, + { + "epoch": 1.2444412566346292, + "grad_norm": 58.25, + "learning_rate": 1.669724770642202e-05, + "loss": 0.6867, + "step": 1085 + }, + { + "epoch": 1.245588868168125, + "grad_norm": 30.75, + "learning_rate": 1.6692150866462795e-05, + "loss": 0.3318, + "step": 1086 + }, + { + "epoch": 1.246736479701621, + "grad_norm": 52.5, + "learning_rate": 1.668705402650357e-05, + "loss": 0.4148, + "step": 1087 + }, + { + "epoch": 1.247884091235117, + "grad_norm": 22.125, + "learning_rate": 1.6681957186544346e-05, + "loss": 0.5934, + "step": 1088 + }, + { + "epoch": 1.2490317027686129, + "grad_norm": 33.5, + "learning_rate": 1.6676860346585116e-05, + "loss": 0.2049, + "step": 1089 + }, + { + "epoch": 1.2501793143021087, + "grad_norm": 37.5, + "learning_rate": 1.6671763506625894e-05, + "loss": 0.3963, + "step": 1090 + }, + { + "epoch": 1.2513269258356048, + "grad_norm": 18.5, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.2542, + "step": 1091 + }, + { + "epoch": 1.2524745373691006, + "grad_norm": 22.75, + "learning_rate": 1.666156982670744e-05, + "loss": 0.608, + "step": 1092 + }, + { + "epoch": 1.2536221489025965, + "grad_norm": 29.625, + "learning_rate": 1.665647298674822e-05, + "loss": 0.6878, + "step": 1093 + }, + { + "epoch": 1.2547697604360923, + "grad_norm": 30.125, + "learning_rate": 1.6651376146788993e-05, + "loss": 0.172, + "step": 1094 + }, + { + "epoch": 1.2559173719695882, + "grad_norm": 32.75, + "learning_rate": 1.6646279306829766e-05, + "loss": 0.535, + "step": 1095 + }, + { + "epoch": 1.2570649835030843, + "grad_norm": 53.25, + "learning_rate": 1.6641182466870544e-05, + "loss": 0.5492, + "step": 1096 + }, + { + "epoch": 1.2582125950365801, + "grad_norm": 29.0, + "learning_rate": 1.6636085626911314e-05, + "loss": 0.3528, + "step": 1097 + }, + { + "epoch": 1.259360206570076, + "grad_norm": 72.5, + "learning_rate": 1.663098878695209e-05, + "loss": 0.5466, + "step": 1098 + }, + { + "epoch": 1.260507818103572, + "grad_norm": 33.25, + "learning_rate": 1.6625891946992865e-05, + "loss": 0.2994, + "step": 1099 + }, + { + "epoch": 1.261655429637068, + "grad_norm": 44.5, + "learning_rate": 1.662079510703364e-05, + "loss": 0.5398, + "step": 1100 + }, + { + "epoch": 1.261655429637068, + "eval_accuracy": 0.56, + "eval_loss": 0.5154783129692078, + "eval_runtime": 49.6732, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 2.013, + "step": 1100 + }, + { + "epoch": 1.2628030411705637, + "grad_norm": 23.5, + "learning_rate": 1.6615698267074417e-05, + "loss": 0.6216, + "step": 1101 + }, + { + "epoch": 1.2639506527040596, + "grad_norm": 42.0, + "learning_rate": 1.661060142711519e-05, + "loss": 0.2358, + "step": 1102 + }, + { + "epoch": 1.2650982642375557, + "grad_norm": 32.75, + "learning_rate": 1.6605504587155964e-05, + "loss": 0.2636, + "step": 1103 + }, + { + "epoch": 1.2662458757710515, + "grad_norm": 19.875, + "learning_rate": 1.660040774719674e-05, + "loss": 0.2472, + "step": 1104 + }, + { + "epoch": 1.2673934873045474, + "grad_norm": 11.8125, + "learning_rate": 1.6595310907237516e-05, + "loss": 0.1907, + "step": 1105 + }, + { + "epoch": 1.2685410988380434, + "grad_norm": 74.0, + "learning_rate": 1.659021406727829e-05, + "loss": 0.8738, + "step": 1106 + }, + { + "epoch": 1.2696887103715393, + "grad_norm": 39.5, + "learning_rate": 1.6585117227319063e-05, + "loss": 0.4113, + "step": 1107 + }, + { + "epoch": 1.2708363219050351, + "grad_norm": 34.25, + "learning_rate": 1.6580020387359837e-05, + "loss": 0.7458, + "step": 1108 + }, + { + "epoch": 1.271983933438531, + "grad_norm": 130.0, + "learning_rate": 1.657492354740061e-05, + "loss": 1.2238, + "step": 1109 + }, + { + "epoch": 1.2731315449720269, + "grad_norm": 37.5, + "learning_rate": 1.656982670744139e-05, + "loss": 0.6994, + "step": 1110 + }, + { + "epoch": 1.274279156505523, + "grad_norm": 83.5, + "learning_rate": 1.6564729867482163e-05, + "loss": 1.1055, + "step": 1111 + }, + { + "epoch": 1.2754267680390188, + "grad_norm": 7.15625, + "learning_rate": 1.6559633027522936e-05, + "loss": 0.1926, + "step": 1112 + }, + { + "epoch": 1.2765743795725146, + "grad_norm": 151.0, + "learning_rate": 1.6554536187563714e-05, + "loss": 0.4989, + "step": 1113 + }, + { + "epoch": 1.2777219911060107, + "grad_norm": 43.75, + "learning_rate": 1.6549439347604484e-05, + "loss": 0.4593, + "step": 1114 + }, + { + "epoch": 1.2788696026395066, + "grad_norm": 23.5, + "learning_rate": 1.654434250764526e-05, + "loss": 0.2898, + "step": 1115 + }, + { + "epoch": 1.2800172141730024, + "grad_norm": 37.5, + "learning_rate": 1.6539245667686035e-05, + "loss": 0.3342, + "step": 1116 + }, + { + "epoch": 1.2811648257064983, + "grad_norm": 35.25, + "learning_rate": 1.653414882772681e-05, + "loss": 0.4059, + "step": 1117 + }, + { + "epoch": 1.2823124372399943, + "grad_norm": 17.875, + "learning_rate": 1.6529051987767587e-05, + "loss": 0.3272, + "step": 1118 + }, + { + "epoch": 1.2834600487734902, + "grad_norm": 59.25, + "learning_rate": 1.652395514780836e-05, + "loss": 0.5725, + "step": 1119 + }, + { + "epoch": 1.284607660306986, + "grad_norm": 66.0, + "learning_rate": 1.6518858307849134e-05, + "loss": 0.8477, + "step": 1120 + }, + { + "epoch": 1.285755271840482, + "grad_norm": 67.0, + "learning_rate": 1.6513761467889912e-05, + "loss": 0.5421, + "step": 1121 + }, + { + "epoch": 1.286902883373978, + "grad_norm": 23.75, + "learning_rate": 1.6508664627930682e-05, + "loss": 0.457, + "step": 1122 + }, + { + "epoch": 1.2880504949074738, + "grad_norm": 22.875, + "learning_rate": 1.650356778797146e-05, + "loss": 0.5799, + "step": 1123 + }, + { + "epoch": 1.2891981064409697, + "grad_norm": 48.25, + "learning_rate": 1.6498470948012233e-05, + "loss": 0.5672, + "step": 1124 + }, + { + "epoch": 1.2903457179744655, + "grad_norm": 31.625, + "learning_rate": 1.6493374108053007e-05, + "loss": 0.6196, + "step": 1125 + }, + { + "epoch": 1.2914933295079616, + "grad_norm": 79.5, + "learning_rate": 1.6488277268093785e-05, + "loss": 0.6727, + "step": 1126 + }, + { + "epoch": 1.2926409410414574, + "grad_norm": 55.25, + "learning_rate": 1.648318042813456e-05, + "loss": 0.6848, + "step": 1127 + }, + { + "epoch": 1.2937885525749535, + "grad_norm": 49.5, + "learning_rate": 1.6478083588175332e-05, + "loss": 1.015, + "step": 1128 + }, + { + "epoch": 1.2949361641084494, + "grad_norm": 30.375, + "learning_rate": 1.6472986748216106e-05, + "loss": 0.9048, + "step": 1129 + }, + { + "epoch": 1.2960837756419452, + "grad_norm": 65.5, + "learning_rate": 1.6467889908256884e-05, + "loss": 0.7712, + "step": 1130 + }, + { + "epoch": 1.297231387175441, + "grad_norm": 14.8125, + "learning_rate": 1.6462793068297658e-05, + "loss": 0.1942, + "step": 1131 + }, + { + "epoch": 1.298378998708937, + "grad_norm": 57.0, + "learning_rate": 1.645769622833843e-05, + "loss": 0.5278, + "step": 1132 + }, + { + "epoch": 1.299526610242433, + "grad_norm": 20.125, + "learning_rate": 1.6452599388379205e-05, + "loss": 0.3787, + "step": 1133 + }, + { + "epoch": 1.3006742217759288, + "grad_norm": 20.875, + "learning_rate": 1.644750254841998e-05, + "loss": 0.347, + "step": 1134 + }, + { + "epoch": 1.3018218333094247, + "grad_norm": 36.25, + "learning_rate": 1.6442405708460757e-05, + "loss": 0.613, + "step": 1135 + }, + { + "epoch": 1.3029694448429208, + "grad_norm": 46.75, + "learning_rate": 1.643730886850153e-05, + "loss": 0.3531, + "step": 1136 + }, + { + "epoch": 1.3041170563764166, + "grad_norm": 51.5, + "learning_rate": 1.6432212028542304e-05, + "loss": 0.4654, + "step": 1137 + }, + { + "epoch": 1.3052646679099125, + "grad_norm": 59.5, + "learning_rate": 1.642711518858308e-05, + "loss": 0.6825, + "step": 1138 + }, + { + "epoch": 1.3064122794434083, + "grad_norm": 20.125, + "learning_rate": 1.6422018348623852e-05, + "loss": 0.5258, + "step": 1139 + }, + { + "epoch": 1.3075598909769044, + "grad_norm": 21.375, + "learning_rate": 1.641692150866463e-05, + "loss": 0.2334, + "step": 1140 + }, + { + "epoch": 1.3087075025104002, + "grad_norm": 57.5, + "learning_rate": 1.6411824668705403e-05, + "loss": 0.9003, + "step": 1141 + }, + { + "epoch": 1.309855114043896, + "grad_norm": 61.5, + "learning_rate": 1.6406727828746177e-05, + "loss": 0.6237, + "step": 1142 + }, + { + "epoch": 1.3110027255773922, + "grad_norm": 20.875, + "learning_rate": 1.6401630988786955e-05, + "loss": 0.3164, + "step": 1143 + }, + { + "epoch": 1.312150337110888, + "grad_norm": 48.5, + "learning_rate": 1.639653414882773e-05, + "loss": 0.4018, + "step": 1144 + }, + { + "epoch": 1.3132979486443839, + "grad_norm": 56.0, + "learning_rate": 1.6391437308868502e-05, + "loss": 0.7092, + "step": 1145 + }, + { + "epoch": 1.3144455601778797, + "grad_norm": 38.75, + "learning_rate": 1.638634046890928e-05, + "loss": 0.5181, + "step": 1146 + }, + { + "epoch": 1.3155931717113756, + "grad_norm": 40.5, + "learning_rate": 1.6381243628950054e-05, + "loss": 0.3165, + "step": 1147 + }, + { + "epoch": 1.3167407832448716, + "grad_norm": 32.25, + "learning_rate": 1.6376146788990827e-05, + "loss": 0.7836, + "step": 1148 + }, + { + "epoch": 1.3178883947783675, + "grad_norm": 61.5, + "learning_rate": 1.63710499490316e-05, + "loss": 0.7627, + "step": 1149 + }, + { + "epoch": 1.3190360063118634, + "grad_norm": 79.5, + "learning_rate": 1.6365953109072375e-05, + "loss": 0.9356, + "step": 1150 + }, + { + "epoch": 1.3201836178453594, + "grad_norm": 189.0, + "learning_rate": 1.6360856269113153e-05, + "loss": 1.0391, + "step": 1151 + }, + { + "epoch": 1.3213312293788553, + "grad_norm": 20.25, + "learning_rate": 1.6355759429153926e-05, + "loss": 0.6406, + "step": 1152 + }, + { + "epoch": 1.3224788409123511, + "grad_norm": 26.375, + "learning_rate": 1.63506625891947e-05, + "loss": 0.3832, + "step": 1153 + }, + { + "epoch": 1.323626452445847, + "grad_norm": 33.0, + "learning_rate": 1.6345565749235474e-05, + "loss": 0.4041, + "step": 1154 + }, + { + "epoch": 1.324774063979343, + "grad_norm": 23.625, + "learning_rate": 1.634046890927625e-05, + "loss": 0.3527, + "step": 1155 + }, + { + "epoch": 1.325921675512839, + "grad_norm": 99.5, + "learning_rate": 1.6335372069317022e-05, + "loss": 0.9746, + "step": 1156 + }, + { + "epoch": 1.3270692870463348, + "grad_norm": 45.0, + "learning_rate": 1.63302752293578e-05, + "loss": 0.2891, + "step": 1157 + }, + { + "epoch": 1.3282168985798308, + "grad_norm": 56.5, + "learning_rate": 1.6325178389398573e-05, + "loss": 0.8078, + "step": 1158 + }, + { + "epoch": 1.3293645101133267, + "grad_norm": 16.125, + "learning_rate": 1.6320081549439347e-05, + "loss": 0.6181, + "step": 1159 + }, + { + "epoch": 1.3305121216468225, + "grad_norm": 31.5, + "learning_rate": 1.6314984709480125e-05, + "loss": 0.3313, + "step": 1160 + }, + { + "epoch": 1.3316597331803184, + "grad_norm": 11.75, + "learning_rate": 1.63098878695209e-05, + "loss": 0.2764, + "step": 1161 + }, + { + "epoch": 1.3328073447138145, + "grad_norm": 32.0, + "learning_rate": 1.6304791029561672e-05, + "loss": 0.6992, + "step": 1162 + }, + { + "epoch": 1.3339549562473103, + "grad_norm": 40.25, + "learning_rate": 1.629969418960245e-05, + "loss": 0.4695, + "step": 1163 + }, + { + "epoch": 1.3351025677808062, + "grad_norm": 60.75, + "learning_rate": 1.6294597349643224e-05, + "loss": 0.5952, + "step": 1164 + }, + { + "epoch": 1.3362501793143022, + "grad_norm": 52.75, + "learning_rate": 1.6289500509683997e-05, + "loss": 0.4987, + "step": 1165 + }, + { + "epoch": 1.337397790847798, + "grad_norm": 28.25, + "learning_rate": 1.628440366972477e-05, + "loss": 0.31, + "step": 1166 + }, + { + "epoch": 1.338545402381294, + "grad_norm": 21.25, + "learning_rate": 1.6279306829765545e-05, + "loss": 0.4244, + "step": 1167 + }, + { + "epoch": 1.3396930139147898, + "grad_norm": 30.75, + "learning_rate": 1.6274209989806323e-05, + "loss": 0.5522, + "step": 1168 + }, + { + "epoch": 1.3408406254482856, + "grad_norm": 18.5, + "learning_rate": 1.6269113149847096e-05, + "loss": 0.3786, + "step": 1169 + }, + { + "epoch": 1.3419882369817817, + "grad_norm": 14.6875, + "learning_rate": 1.626401630988787e-05, + "loss": 0.0966, + "step": 1170 + }, + { + "epoch": 1.3431358485152776, + "grad_norm": 50.0, + "learning_rate": 1.6258919469928644e-05, + "loss": 0.4607, + "step": 1171 + }, + { + "epoch": 1.3442834600487734, + "grad_norm": 27.375, + "learning_rate": 1.625382262996942e-05, + "loss": 0.847, + "step": 1172 + }, + { + "epoch": 1.3454310715822695, + "grad_norm": 13.0625, + "learning_rate": 1.6248725790010195e-05, + "loss": 0.4091, + "step": 1173 + }, + { + "epoch": 1.3465786831157653, + "grad_norm": 16.0, + "learning_rate": 1.624362895005097e-05, + "loss": 0.2403, + "step": 1174 + }, + { + "epoch": 1.3477262946492612, + "grad_norm": 37.25, + "learning_rate": 1.6238532110091743e-05, + "loss": 0.421, + "step": 1175 + }, + { + "epoch": 1.348873906182757, + "grad_norm": 55.25, + "learning_rate": 1.6233435270132517e-05, + "loss": 0.662, + "step": 1176 + }, + { + "epoch": 1.3500215177162531, + "grad_norm": 40.5, + "learning_rate": 1.6228338430173294e-05, + "loss": 0.4565, + "step": 1177 + }, + { + "epoch": 1.351169129249749, + "grad_norm": 14.3125, + "learning_rate": 1.622324159021407e-05, + "loss": 0.4465, + "step": 1178 + }, + { + "epoch": 1.3523167407832448, + "grad_norm": 28.625, + "learning_rate": 1.6218144750254842e-05, + "loss": 0.3729, + "step": 1179 + }, + { + "epoch": 1.353464352316741, + "grad_norm": 55.25, + "learning_rate": 1.621304791029562e-05, + "loss": 0.3222, + "step": 1180 + }, + { + "epoch": 1.3546119638502367, + "grad_norm": 22.875, + "learning_rate": 1.6207951070336393e-05, + "loss": 0.437, + "step": 1181 + }, + { + "epoch": 1.3557595753837326, + "grad_norm": 38.0, + "learning_rate": 1.6202854230377167e-05, + "loss": 0.651, + "step": 1182 + }, + { + "epoch": 1.3569071869172284, + "grad_norm": 21.625, + "learning_rate": 1.6197757390417945e-05, + "loss": 0.4508, + "step": 1183 + }, + { + "epoch": 1.3580547984507243, + "grad_norm": 27.875, + "learning_rate": 1.6192660550458715e-05, + "loss": 0.3005, + "step": 1184 + }, + { + "epoch": 1.3592024099842204, + "grad_norm": 42.0, + "learning_rate": 1.6187563710499492e-05, + "loss": 0.2964, + "step": 1185 + }, + { + "epoch": 1.3603500215177162, + "grad_norm": 26.125, + "learning_rate": 1.6182466870540266e-05, + "loss": 0.6497, + "step": 1186 + }, + { + "epoch": 1.3614976330512123, + "grad_norm": 20.375, + "learning_rate": 1.617737003058104e-05, + "loss": 0.3097, + "step": 1187 + }, + { + "epoch": 1.3626452445847081, + "grad_norm": 56.5, + "learning_rate": 1.6172273190621818e-05, + "loss": 0.437, + "step": 1188 + }, + { + "epoch": 1.363792856118204, + "grad_norm": 69.5, + "learning_rate": 1.616717635066259e-05, + "loss": 0.4491, + "step": 1189 + }, + { + "epoch": 1.3649404676516999, + "grad_norm": 58.75, + "learning_rate": 1.6162079510703365e-05, + "loss": 0.4697, + "step": 1190 + }, + { + "epoch": 1.3660880791851957, + "grad_norm": 15.0, + "learning_rate": 1.615698267074414e-05, + "loss": 0.2935, + "step": 1191 + }, + { + "epoch": 1.3672356907186918, + "grad_norm": 69.0, + "learning_rate": 1.6151885830784913e-05, + "loss": 0.8532, + "step": 1192 + }, + { + "epoch": 1.3683833022521876, + "grad_norm": 27.5, + "learning_rate": 1.614678899082569e-05, + "loss": 0.3305, + "step": 1193 + }, + { + "epoch": 1.3695309137856835, + "grad_norm": 65.5, + "learning_rate": 1.6141692150866464e-05, + "loss": 0.6747, + "step": 1194 + }, + { + "epoch": 1.3706785253191796, + "grad_norm": 39.5, + "learning_rate": 1.6136595310907238e-05, + "loss": 0.4378, + "step": 1195 + }, + { + "epoch": 1.3718261368526754, + "grad_norm": 36.25, + "learning_rate": 1.6131498470948012e-05, + "loss": 0.4756, + "step": 1196 + }, + { + "epoch": 1.3729737483861713, + "grad_norm": 25.75, + "learning_rate": 1.612640163098879e-05, + "loss": 0.2116, + "step": 1197 + }, + { + "epoch": 1.374121359919667, + "grad_norm": 51.5, + "learning_rate": 1.6121304791029563e-05, + "loss": 0.6976, + "step": 1198 + }, + { + "epoch": 1.3752689714531632, + "grad_norm": 28.25, + "learning_rate": 1.6116207951070337e-05, + "loss": 0.3644, + "step": 1199 + }, + { + "epoch": 1.376416582986659, + "grad_norm": 21.25, + "learning_rate": 1.6111111111111115e-05, + "loss": 0.4288, + "step": 1200 + }, + { + "epoch": 1.376416582986659, + "eval_accuracy": 0.61, + "eval_loss": 0.5443911552429199, + "eval_runtime": 49.3817, + "eval_samples_per_second": 2.025, + "eval_steps_per_second": 2.025, + "step": 1200 + }, + { + "epoch": 1.3775641945201549, + "grad_norm": 16.75, + "learning_rate": 1.6106014271151885e-05, + "loss": 0.5041, + "step": 1201 + }, + { + "epoch": 1.378711806053651, + "grad_norm": 50.25, + "learning_rate": 1.6100917431192662e-05, + "loss": 0.5077, + "step": 1202 + }, + { + "epoch": 1.3798594175871468, + "grad_norm": 15.875, + "learning_rate": 1.6095820591233436e-05, + "loss": 0.2509, + "step": 1203 + }, + { + "epoch": 1.3810070291206427, + "grad_norm": 52.5, + "learning_rate": 1.609072375127421e-05, + "loss": 0.6619, + "step": 1204 + }, + { + "epoch": 1.3821546406541385, + "grad_norm": 27.0, + "learning_rate": 1.6085626911314988e-05, + "loss": 0.3906, + "step": 1205 + }, + { + "epoch": 1.3833022521876344, + "grad_norm": 28.5, + "learning_rate": 1.608053007135576e-05, + "loss": 0.506, + "step": 1206 + }, + { + "epoch": 1.3844498637211304, + "grad_norm": 34.25, + "learning_rate": 1.6075433231396535e-05, + "loss": 0.3932, + "step": 1207 + }, + { + "epoch": 1.3855974752546263, + "grad_norm": 36.75, + "learning_rate": 1.6070336391437313e-05, + "loss": 0.5362, + "step": 1208 + }, + { + "epoch": 1.3867450867881221, + "grad_norm": 52.5, + "learning_rate": 1.6065239551478083e-05, + "loss": 0.5699, + "step": 1209 + }, + { + "epoch": 1.3878926983216182, + "grad_norm": 45.5, + "learning_rate": 1.606014271151886e-05, + "loss": 0.5685, + "step": 1210 + }, + { + "epoch": 1.389040309855114, + "grad_norm": 60.25, + "learning_rate": 1.6055045871559634e-05, + "loss": 0.9313, + "step": 1211 + }, + { + "epoch": 1.39018792138861, + "grad_norm": 38.0, + "learning_rate": 1.6049949031600408e-05, + "loss": 0.5542, + "step": 1212 + }, + { + "epoch": 1.3913355329221058, + "grad_norm": 40.75, + "learning_rate": 1.6044852191641186e-05, + "loss": 0.8328, + "step": 1213 + }, + { + "epoch": 1.3924831444556018, + "grad_norm": 42.25, + "learning_rate": 1.603975535168196e-05, + "loss": 0.2783, + "step": 1214 + }, + { + "epoch": 1.3936307559890977, + "grad_norm": 39.5, + "learning_rate": 1.6034658511722733e-05, + "loss": 0.5385, + "step": 1215 + }, + { + "epoch": 1.3947783675225935, + "grad_norm": 42.75, + "learning_rate": 1.6029561671763507e-05, + "loss": 0.5375, + "step": 1216 + }, + { + "epoch": 1.3959259790560896, + "grad_norm": 22.25, + "learning_rate": 1.602446483180428e-05, + "loss": 0.6028, + "step": 1217 + }, + { + "epoch": 1.3970735905895855, + "grad_norm": 34.0, + "learning_rate": 1.601936799184506e-05, + "loss": 0.5153, + "step": 1218 + }, + { + "epoch": 1.3982212021230813, + "grad_norm": 79.5, + "learning_rate": 1.6014271151885832e-05, + "loss": 0.7959, + "step": 1219 + }, + { + "epoch": 1.3993688136565772, + "grad_norm": 28.0, + "learning_rate": 1.6009174311926606e-05, + "loss": 0.271, + "step": 1220 + }, + { + "epoch": 1.400516425190073, + "grad_norm": 76.5, + "learning_rate": 1.600407747196738e-05, + "loss": 0.6952, + "step": 1221 + }, + { + "epoch": 1.401664036723569, + "grad_norm": 21.875, + "learning_rate": 1.5998980632008157e-05, + "loss": 0.452, + "step": 1222 + }, + { + "epoch": 1.402811648257065, + "grad_norm": 70.5, + "learning_rate": 1.599388379204893e-05, + "loss": 0.5592, + "step": 1223 + }, + { + "epoch": 1.403959259790561, + "grad_norm": 17.875, + "learning_rate": 1.5988786952089705e-05, + "loss": 0.451, + "step": 1224 + }, + { + "epoch": 1.4051068713240569, + "grad_norm": 30.125, + "learning_rate": 1.5983690112130483e-05, + "loss": 0.4143, + "step": 1225 + }, + { + "epoch": 1.4062544828575527, + "grad_norm": 25.625, + "learning_rate": 1.5978593272171253e-05, + "loss": 0.454, + "step": 1226 + }, + { + "epoch": 1.4074020943910486, + "grad_norm": 24.625, + "learning_rate": 1.597349643221203e-05, + "loss": 0.4827, + "step": 1227 + }, + { + "epoch": 1.4085497059245444, + "grad_norm": 14.6875, + "learning_rate": 1.5968399592252804e-05, + "loss": 0.1517, + "step": 1228 + }, + { + "epoch": 1.4096973174580405, + "grad_norm": 12.3125, + "learning_rate": 1.5963302752293578e-05, + "loss": 0.4015, + "step": 1229 + }, + { + "epoch": 1.4108449289915364, + "grad_norm": 59.0, + "learning_rate": 1.5958205912334355e-05, + "loss": 0.5366, + "step": 1230 + }, + { + "epoch": 1.4119925405250322, + "grad_norm": 11.1875, + "learning_rate": 1.595310907237513e-05, + "loss": 0.3743, + "step": 1231 + }, + { + "epoch": 1.4131401520585283, + "grad_norm": 18.75, + "learning_rate": 1.5948012232415903e-05, + "loss": 0.4668, + "step": 1232 + }, + { + "epoch": 1.4142877635920241, + "grad_norm": 50.75, + "learning_rate": 1.5942915392456677e-05, + "loss": 0.3211, + "step": 1233 + }, + { + "epoch": 1.41543537512552, + "grad_norm": 41.5, + "learning_rate": 1.593781855249745e-05, + "loss": 0.5208, + "step": 1234 + }, + { + "epoch": 1.4165829866590158, + "grad_norm": 16.5, + "learning_rate": 1.593272171253823e-05, + "loss": 0.2334, + "step": 1235 + }, + { + "epoch": 1.417730598192512, + "grad_norm": 72.0, + "learning_rate": 1.5927624872579002e-05, + "loss": 0.4065, + "step": 1236 + }, + { + "epoch": 1.4188782097260078, + "grad_norm": 21.0, + "learning_rate": 1.5922528032619776e-05, + "loss": 0.4257, + "step": 1237 + }, + { + "epoch": 1.4200258212595036, + "grad_norm": 18.75, + "learning_rate": 1.591743119266055e-05, + "loss": 0.3615, + "step": 1238 + }, + { + "epoch": 1.4211734327929997, + "grad_norm": 54.5, + "learning_rate": 1.5912334352701327e-05, + "loss": 0.2902, + "step": 1239 + }, + { + "epoch": 1.4223210443264955, + "grad_norm": 8.3125, + "learning_rate": 1.59072375127421e-05, + "loss": 0.1653, + "step": 1240 + }, + { + "epoch": 1.4234686558599914, + "grad_norm": 18.125, + "learning_rate": 1.5902140672782875e-05, + "loss": 0.3842, + "step": 1241 + }, + { + "epoch": 1.4246162673934872, + "grad_norm": 85.0, + "learning_rate": 1.5897043832823652e-05, + "loss": 0.7718, + "step": 1242 + }, + { + "epoch": 1.425763878926983, + "grad_norm": 27.125, + "learning_rate": 1.5891946992864423e-05, + "loss": 0.195, + "step": 1243 + }, + { + "epoch": 1.4269114904604792, + "grad_norm": 31.125, + "learning_rate": 1.58868501529052e-05, + "loss": 0.5963, + "step": 1244 + }, + { + "epoch": 1.428059101993975, + "grad_norm": 67.0, + "learning_rate": 1.5881753312945974e-05, + "loss": 0.709, + "step": 1245 + }, + { + "epoch": 1.429206713527471, + "grad_norm": 20.25, + "learning_rate": 1.5876656472986748e-05, + "loss": 0.3003, + "step": 1246 + }, + { + "epoch": 1.430354325060967, + "grad_norm": 40.25, + "learning_rate": 1.5871559633027525e-05, + "loss": 0.7344, + "step": 1247 + }, + { + "epoch": 1.4315019365944628, + "grad_norm": 26.75, + "learning_rate": 1.58664627930683e-05, + "loss": 1.0281, + "step": 1248 + }, + { + "epoch": 1.4326495481279586, + "grad_norm": 49.25, + "learning_rate": 1.5861365953109073e-05, + "loss": 0.3, + "step": 1249 + }, + { + "epoch": 1.4337971596614545, + "grad_norm": 27.125, + "learning_rate": 1.585626911314985e-05, + "loss": 0.5945, + "step": 1250 + }, + { + "epoch": 1.4349447711949506, + "grad_norm": 41.5, + "learning_rate": 1.585117227319062e-05, + "loss": 0.6692, + "step": 1251 + }, + { + "epoch": 1.4360923827284464, + "grad_norm": 14.375, + "learning_rate": 1.58460754332314e-05, + "loss": 0.3908, + "step": 1252 + }, + { + "epoch": 1.4372399942619423, + "grad_norm": 77.5, + "learning_rate": 1.5840978593272172e-05, + "loss": 0.7376, + "step": 1253 + }, + { + "epoch": 1.4383876057954383, + "grad_norm": 29.75, + "learning_rate": 1.5835881753312946e-05, + "loss": 0.355, + "step": 1254 + }, + { + "epoch": 1.4395352173289342, + "grad_norm": 32.0, + "learning_rate": 1.5830784913353723e-05, + "loss": 0.7525, + "step": 1255 + }, + { + "epoch": 1.44068282886243, + "grad_norm": 42.75, + "learning_rate": 1.5825688073394497e-05, + "loss": 0.2832, + "step": 1256 + }, + { + "epoch": 1.441830440395926, + "grad_norm": 21.125, + "learning_rate": 1.582059123343527e-05, + "loss": 0.3375, + "step": 1257 + }, + { + "epoch": 1.442978051929422, + "grad_norm": 33.25, + "learning_rate": 1.5815494393476045e-05, + "loss": 0.3517, + "step": 1258 + }, + { + "epoch": 1.4441256634629178, + "grad_norm": 35.0, + "learning_rate": 1.5810397553516822e-05, + "loss": 0.382, + "step": 1259 + }, + { + "epoch": 1.4452732749964137, + "grad_norm": 53.75, + "learning_rate": 1.5805300713557596e-05, + "loss": 0.3113, + "step": 1260 + }, + { + "epoch": 1.4464208865299097, + "grad_norm": 43.75, + "learning_rate": 1.580020387359837e-05, + "loss": 0.3177, + "step": 1261 + }, + { + "epoch": 1.4475684980634056, + "grad_norm": 35.0, + "learning_rate": 1.5795107033639144e-05, + "loss": 0.3791, + "step": 1262 + }, + { + "epoch": 1.4487161095969014, + "grad_norm": 45.0, + "learning_rate": 1.5790010193679918e-05, + "loss": 0.4492, + "step": 1263 + }, + { + "epoch": 1.4498637211303973, + "grad_norm": 27.875, + "learning_rate": 1.5784913353720695e-05, + "loss": 0.3343, + "step": 1264 + }, + { + "epoch": 1.4510113326638931, + "grad_norm": 19.125, + "learning_rate": 1.577981651376147e-05, + "loss": 0.8559, + "step": 1265 + }, + { + "epoch": 1.4521589441973892, + "grad_norm": 8.0, + "learning_rate": 1.5774719673802243e-05, + "loss": 0.1379, + "step": 1266 + }, + { + "epoch": 1.453306555730885, + "grad_norm": 49.5, + "learning_rate": 1.576962283384302e-05, + "loss": 0.4941, + "step": 1267 + }, + { + "epoch": 1.454454167264381, + "grad_norm": 84.5, + "learning_rate": 1.576452599388379e-05, + "loss": 1.4308, + "step": 1268 + }, + { + "epoch": 1.455601778797877, + "grad_norm": 92.0, + "learning_rate": 1.5759429153924568e-05, + "loss": 0.9692, + "step": 1269 + }, + { + "epoch": 1.4567493903313729, + "grad_norm": 88.0, + "learning_rate": 1.5754332313965342e-05, + "loss": 0.9589, + "step": 1270 + }, + { + "epoch": 1.4578970018648687, + "grad_norm": 50.25, + "learning_rate": 1.5749235474006116e-05, + "loss": 0.5352, + "step": 1271 + }, + { + "epoch": 1.4590446133983646, + "grad_norm": 42.0, + "learning_rate": 1.5744138634046893e-05, + "loss": 0.3708, + "step": 1272 + }, + { + "epoch": 1.4601922249318606, + "grad_norm": 35.0, + "learning_rate": 1.5739041794087667e-05, + "loss": 0.7022, + "step": 1273 + }, + { + "epoch": 1.4613398364653565, + "grad_norm": 13.375, + "learning_rate": 1.573394495412844e-05, + "loss": 0.3201, + "step": 1274 + }, + { + "epoch": 1.4624874479988523, + "grad_norm": 87.5, + "learning_rate": 1.572884811416922e-05, + "loss": 0.576, + "step": 1275 + }, + { + "epoch": 1.4636350595323484, + "grad_norm": 68.5, + "learning_rate": 1.5723751274209992e-05, + "loss": 0.5697, + "step": 1276 + }, + { + "epoch": 1.4647826710658443, + "grad_norm": 31.75, + "learning_rate": 1.5718654434250766e-05, + "loss": 0.4631, + "step": 1277 + }, + { + "epoch": 1.46593028259934, + "grad_norm": 19.5, + "learning_rate": 1.571355759429154e-05, + "loss": 0.4516, + "step": 1278 + }, + { + "epoch": 1.467077894132836, + "grad_norm": 52.0, + "learning_rate": 1.5708460754332314e-05, + "loss": 0.6808, + "step": 1279 + }, + { + "epoch": 1.4682255056663318, + "grad_norm": 17.875, + "learning_rate": 1.570336391437309e-05, + "loss": 0.3936, + "step": 1280 + }, + { + "epoch": 1.4693731171998279, + "grad_norm": 24.25, + "learning_rate": 1.5698267074413865e-05, + "loss": 0.4196, + "step": 1281 + }, + { + "epoch": 1.4705207287333237, + "grad_norm": 111.0, + "learning_rate": 1.569317023445464e-05, + "loss": 0.8228, + "step": 1282 + }, + { + "epoch": 1.4716683402668198, + "grad_norm": 36.5, + "learning_rate": 1.5688073394495413e-05, + "loss": 0.5546, + "step": 1283 + }, + { + "epoch": 1.4728159518003157, + "grad_norm": 40.5, + "learning_rate": 1.568297655453619e-05, + "loss": 0.4347, + "step": 1284 + }, + { + "epoch": 1.4739635633338115, + "grad_norm": 59.75, + "learning_rate": 1.5677879714576964e-05, + "loss": 0.8506, + "step": 1285 + }, + { + "epoch": 1.4751111748673074, + "grad_norm": 58.25, + "learning_rate": 1.5672782874617738e-05, + "loss": 0.4958, + "step": 1286 + }, + { + "epoch": 1.4762587864008032, + "grad_norm": 41.5, + "learning_rate": 1.5667686034658512e-05, + "loss": 0.6571, + "step": 1287 + }, + { + "epoch": 1.4774063979342993, + "grad_norm": 20.75, + "learning_rate": 1.5662589194699286e-05, + "loss": 0.2749, + "step": 1288 + }, + { + "epoch": 1.4785540094677951, + "grad_norm": 24.875, + "learning_rate": 1.5657492354740063e-05, + "loss": 0.552, + "step": 1289 + }, + { + "epoch": 1.479701621001291, + "grad_norm": 24.625, + "learning_rate": 1.5652395514780837e-05, + "loss": 0.5655, + "step": 1290 + }, + { + "epoch": 1.480849232534787, + "grad_norm": 71.0, + "learning_rate": 1.564729867482161e-05, + "loss": 1.1072, + "step": 1291 + }, + { + "epoch": 1.481996844068283, + "grad_norm": 56.5, + "learning_rate": 1.564220183486239e-05, + "loss": 0.9029, + "step": 1292 + }, + { + "epoch": 1.4831444556017788, + "grad_norm": 75.0, + "learning_rate": 1.563710499490316e-05, + "loss": 0.8671, + "step": 1293 + }, + { + "epoch": 1.4842920671352746, + "grad_norm": 68.5, + "learning_rate": 1.5632008154943936e-05, + "loss": 0.6165, + "step": 1294 + }, + { + "epoch": 1.4854396786687707, + "grad_norm": 57.5, + "learning_rate": 1.5626911314984713e-05, + "loss": 0.4413, + "step": 1295 + }, + { + "epoch": 1.4865872902022665, + "grad_norm": 38.25, + "learning_rate": 1.5621814475025484e-05, + "loss": 0.4508, + "step": 1296 + }, + { + "epoch": 1.4877349017357624, + "grad_norm": 22.375, + "learning_rate": 1.561671763506626e-05, + "loss": 0.4694, + "step": 1297 + }, + { + "epoch": 1.4888825132692585, + "grad_norm": 19.625, + "learning_rate": 1.5611620795107035e-05, + "loss": 0.4833, + "step": 1298 + }, + { + "epoch": 1.4900301248027543, + "grad_norm": 74.0, + "learning_rate": 1.560652395514781e-05, + "loss": 0.6443, + "step": 1299 + }, + { + "epoch": 1.4911777363362502, + "grad_norm": 30.25, + "learning_rate": 1.5601427115188586e-05, + "loss": 0.5003, + "step": 1300 + }, + { + "epoch": 1.4911777363362502, + "eval_accuracy": 0.64, + "eval_loss": 0.5184877514839172, + "eval_runtime": 49.6613, + "eval_samples_per_second": 2.014, + "eval_steps_per_second": 2.014, + "step": 1300 + }, + { + "epoch": 1.492325347869746, + "grad_norm": 26.5, + "learning_rate": 1.559633027522936e-05, + "loss": 0.4356, + "step": 1301 + }, + { + "epoch": 1.4934729594032419, + "grad_norm": 93.0, + "learning_rate": 1.5591233435270134e-05, + "loss": 0.6945, + "step": 1302 + }, + { + "epoch": 1.494620570936738, + "grad_norm": 84.0, + "learning_rate": 1.5586136595310908e-05, + "loss": 0.7059, + "step": 1303 + }, + { + "epoch": 1.4957681824702338, + "grad_norm": 84.5, + "learning_rate": 1.5581039755351682e-05, + "loss": 0.8654, + "step": 1304 + }, + { + "epoch": 1.4969157940037299, + "grad_norm": 79.5, + "learning_rate": 1.5575942915392456e-05, + "loss": 0.8112, + "step": 1305 + }, + { + "epoch": 1.4980634055372257, + "grad_norm": 41.75, + "learning_rate": 1.5570846075433233e-05, + "loss": 1.0995, + "step": 1306 + }, + { + "epoch": 1.4992110170707216, + "grad_norm": 28.625, + "learning_rate": 1.5565749235474007e-05, + "loss": 0.8355, + "step": 1307 + }, + { + "epoch": 1.5003586286042174, + "grad_norm": 67.5, + "learning_rate": 1.556065239551478e-05, + "loss": 0.7727, + "step": 1308 + }, + { + "epoch": 1.5015062401377133, + "grad_norm": 17.5, + "learning_rate": 1.555555555555556e-05, + "loss": 0.4682, + "step": 1309 + }, + { + "epoch": 1.5026538516712094, + "grad_norm": 18.625, + "learning_rate": 1.555045871559633e-05, + "loss": 0.2126, + "step": 1310 + }, + { + "epoch": 1.5038014632047052, + "grad_norm": 15.75, + "learning_rate": 1.5545361875637106e-05, + "loss": 0.4916, + "step": 1311 + }, + { + "epoch": 1.5049490747382013, + "grad_norm": 31.625, + "learning_rate": 1.554026503567788e-05, + "loss": 0.2308, + "step": 1312 + }, + { + "epoch": 1.5060966862716971, + "grad_norm": 51.75, + "learning_rate": 1.5535168195718654e-05, + "loss": 1.0898, + "step": 1313 + }, + { + "epoch": 1.507244297805193, + "grad_norm": 31.75, + "learning_rate": 1.553007135575943e-05, + "loss": 0.4099, + "step": 1314 + }, + { + "epoch": 1.5083919093386888, + "grad_norm": 88.0, + "learning_rate": 1.5524974515800205e-05, + "loss": 0.9649, + "step": 1315 + }, + { + "epoch": 1.5095395208721847, + "grad_norm": 24.75, + "learning_rate": 1.551987767584098e-05, + "loss": 1.0352, + "step": 1316 + }, + { + "epoch": 1.5106871324056805, + "grad_norm": 13.625, + "learning_rate": 1.5514780835881756e-05, + "loss": 0.3537, + "step": 1317 + }, + { + "epoch": 1.5118347439391766, + "grad_norm": 94.0, + "learning_rate": 1.550968399592253e-05, + "loss": 0.9038, + "step": 1318 + }, + { + "epoch": 1.5129823554726725, + "grad_norm": 26.0, + "learning_rate": 1.5504587155963304e-05, + "loss": 0.346, + "step": 1319 + }, + { + "epoch": 1.5141299670061685, + "grad_norm": 44.25, + "learning_rate": 1.5499490316004078e-05, + "loss": 0.7941, + "step": 1320 + }, + { + "epoch": 1.5152775785396644, + "grad_norm": 27.75, + "learning_rate": 1.5494393476044852e-05, + "loss": 0.3747, + "step": 1321 + }, + { + "epoch": 1.5164251900731602, + "grad_norm": 97.5, + "learning_rate": 1.548929663608563e-05, + "loss": 0.9651, + "step": 1322 + }, + { + "epoch": 1.517572801606656, + "grad_norm": 10.6875, + "learning_rate": 1.5484199796126403e-05, + "loss": 0.2523, + "step": 1323 + }, + { + "epoch": 1.518720413140152, + "grad_norm": 23.25, + "learning_rate": 1.5479102956167177e-05, + "loss": 0.5667, + "step": 1324 + }, + { + "epoch": 1.519868024673648, + "grad_norm": 22.25, + "learning_rate": 1.547400611620795e-05, + "loss": 0.3108, + "step": 1325 + }, + { + "epoch": 1.5210156362071439, + "grad_norm": 29.125, + "learning_rate": 1.5468909276248728e-05, + "loss": 0.5994, + "step": 1326 + }, + { + "epoch": 1.52216324774064, + "grad_norm": 37.0, + "learning_rate": 1.5463812436289502e-05, + "loss": 0.6835, + "step": 1327 + }, + { + "epoch": 1.5233108592741358, + "grad_norm": 35.0, + "learning_rate": 1.5458715596330276e-05, + "loss": 0.3934, + "step": 1328 + }, + { + "epoch": 1.5244584708076316, + "grad_norm": 42.5, + "learning_rate": 1.545361875637105e-05, + "loss": 0.4904, + "step": 1329 + }, + { + "epoch": 1.5256060823411275, + "grad_norm": 79.5, + "learning_rate": 1.5448521916411824e-05, + "loss": 0.8999, + "step": 1330 + }, + { + "epoch": 1.5267536938746233, + "grad_norm": 51.25, + "learning_rate": 1.54434250764526e-05, + "loss": 0.5231, + "step": 1331 + }, + { + "epoch": 1.5279013054081192, + "grad_norm": 53.5, + "learning_rate": 1.5438328236493375e-05, + "loss": 0.6297, + "step": 1332 + }, + { + "epoch": 1.5290489169416153, + "grad_norm": 65.5, + "learning_rate": 1.543323139653415e-05, + "loss": 0.5863, + "step": 1333 + }, + { + "epoch": 1.5301965284751113, + "grad_norm": 44.0, + "learning_rate": 1.5428134556574926e-05, + "loss": 0.402, + "step": 1334 + }, + { + "epoch": 1.5313441400086072, + "grad_norm": 54.0, + "learning_rate": 1.54230377166157e-05, + "loss": 0.5476, + "step": 1335 + }, + { + "epoch": 1.532491751542103, + "grad_norm": 40.5, + "learning_rate": 1.5417940876656474e-05, + "loss": 0.4921, + "step": 1336 + }, + { + "epoch": 1.533639363075599, + "grad_norm": 15.125, + "learning_rate": 1.541284403669725e-05, + "loss": 0.4748, + "step": 1337 + }, + { + "epoch": 1.5347869746090947, + "grad_norm": 35.25, + "learning_rate": 1.5407747196738022e-05, + "loss": 0.5071, + "step": 1338 + }, + { + "epoch": 1.5359345861425906, + "grad_norm": 26.5, + "learning_rate": 1.54026503567788e-05, + "loss": 0.2151, + "step": 1339 + }, + { + "epoch": 1.5370821976760867, + "grad_norm": 32.5, + "learning_rate": 1.5397553516819573e-05, + "loss": 0.4312, + "step": 1340 + }, + { + "epoch": 1.5382298092095825, + "grad_norm": 80.5, + "learning_rate": 1.5392456676860347e-05, + "loss": 0.6625, + "step": 1341 + }, + { + "epoch": 1.5393774207430786, + "grad_norm": 46.25, + "learning_rate": 1.5387359836901124e-05, + "loss": 0.3488, + "step": 1342 + }, + { + "epoch": 1.5405250322765744, + "grad_norm": 41.75, + "learning_rate": 1.5382262996941898e-05, + "loss": 0.5342, + "step": 1343 + }, + { + "epoch": 1.5416726438100703, + "grad_norm": 44.0, + "learning_rate": 1.5377166156982672e-05, + "loss": 0.4736, + "step": 1344 + }, + { + "epoch": 1.5428202553435661, + "grad_norm": 11.625, + "learning_rate": 1.5372069317023446e-05, + "loss": 0.3527, + "step": 1345 + }, + { + "epoch": 1.543967866877062, + "grad_norm": 31.75, + "learning_rate": 1.536697247706422e-05, + "loss": 0.6221, + "step": 1346 + }, + { + "epoch": 1.545115478410558, + "grad_norm": 47.0, + "learning_rate": 1.5361875637104997e-05, + "loss": 0.6081, + "step": 1347 + }, + { + "epoch": 1.546263089944054, + "grad_norm": 22.5, + "learning_rate": 1.535677879714577e-05, + "loss": 0.4948, + "step": 1348 + }, + { + "epoch": 1.54741070147755, + "grad_norm": 82.5, + "learning_rate": 1.5351681957186545e-05, + "loss": 0.7993, + "step": 1349 + }, + { + "epoch": 1.5485583130110459, + "grad_norm": 45.5, + "learning_rate": 1.534658511722732e-05, + "loss": 0.6413, + "step": 1350 + }, + { + "epoch": 1.5497059245445417, + "grad_norm": 14.875, + "learning_rate": 1.5341488277268096e-05, + "loss": 0.4485, + "step": 1351 + }, + { + "epoch": 1.5508535360780376, + "grad_norm": 50.0, + "learning_rate": 1.533639143730887e-05, + "loss": 1.0687, + "step": 1352 + }, + { + "epoch": 1.5520011476115334, + "grad_norm": 58.75, + "learning_rate": 1.5331294597349644e-05, + "loss": 1.0185, + "step": 1353 + }, + { + "epoch": 1.5531487591450293, + "grad_norm": 43.0, + "learning_rate": 1.532619775739042e-05, + "loss": 0.5074, + "step": 1354 + }, + { + "epoch": 1.5542963706785253, + "grad_norm": 47.25, + "learning_rate": 1.5321100917431192e-05, + "loss": 0.3446, + "step": 1355 + }, + { + "epoch": 1.5554439822120212, + "grad_norm": 37.25, + "learning_rate": 1.531600407747197e-05, + "loss": 0.3883, + "step": 1356 + }, + { + "epoch": 1.5565915937455173, + "grad_norm": 79.0, + "learning_rate": 1.5310907237512743e-05, + "loss": 0.8577, + "step": 1357 + }, + { + "epoch": 1.557739205279013, + "grad_norm": 45.0, + "learning_rate": 1.5305810397553517e-05, + "loss": 0.5937, + "step": 1358 + }, + { + "epoch": 1.558886816812509, + "grad_norm": 56.25, + "learning_rate": 1.5300713557594294e-05, + "loss": 0.8568, + "step": 1359 + }, + { + "epoch": 1.5600344283460048, + "grad_norm": 33.25, + "learning_rate": 1.5295616717635068e-05, + "loss": 0.4064, + "step": 1360 + }, + { + "epoch": 1.5611820398795007, + "grad_norm": 59.0, + "learning_rate": 1.5290519877675842e-05, + "loss": 0.6067, + "step": 1361 + }, + { + "epoch": 1.5623296514129967, + "grad_norm": 11.75, + "learning_rate": 1.528542303771662e-05, + "loss": 0.3149, + "step": 1362 + }, + { + "epoch": 1.5634772629464926, + "grad_norm": 38.0, + "learning_rate": 1.528032619775739e-05, + "loss": 0.5482, + "step": 1363 + }, + { + "epoch": 1.5646248744799887, + "grad_norm": 43.0, + "learning_rate": 1.5275229357798167e-05, + "loss": 0.3758, + "step": 1364 + }, + { + "epoch": 1.5657724860134845, + "grad_norm": 17.625, + "learning_rate": 1.527013251783894e-05, + "loss": 0.0865, + "step": 1365 + }, + { + "epoch": 1.5669200975469804, + "grad_norm": 79.0, + "learning_rate": 1.5265035677879715e-05, + "loss": 1.07, + "step": 1366 + }, + { + "epoch": 1.5680677090804762, + "grad_norm": 86.5, + "learning_rate": 1.5259938837920492e-05, + "loss": 1.2776, + "step": 1367 + }, + { + "epoch": 1.569215320613972, + "grad_norm": 65.0, + "learning_rate": 1.5254841997961264e-05, + "loss": 1.0829, + "step": 1368 + }, + { + "epoch": 1.570362932147468, + "grad_norm": 11.25, + "learning_rate": 1.524974515800204e-05, + "loss": 0.1616, + "step": 1369 + }, + { + "epoch": 1.571510543680964, + "grad_norm": 175.0, + "learning_rate": 1.5244648318042814e-05, + "loss": 0.788, + "step": 1370 + }, + { + "epoch": 1.57265815521446, + "grad_norm": 52.25, + "learning_rate": 1.523955147808359e-05, + "loss": 0.6801, + "step": 1371 + }, + { + "epoch": 1.573805766747956, + "grad_norm": 90.0, + "learning_rate": 1.5234454638124365e-05, + "loss": 1.1125, + "step": 1372 + }, + { + "epoch": 1.5749533782814518, + "grad_norm": 69.5, + "learning_rate": 1.5229357798165139e-05, + "loss": 0.7275, + "step": 1373 + }, + { + "epoch": 1.5761009898149476, + "grad_norm": 21.625, + "learning_rate": 1.5224260958205915e-05, + "loss": 0.2809, + "step": 1374 + }, + { + "epoch": 1.5772486013484435, + "grad_norm": 41.5, + "learning_rate": 1.5219164118246687e-05, + "loss": 1.0073, + "step": 1375 + }, + { + "epoch": 1.5783962128819393, + "grad_norm": 65.5, + "learning_rate": 1.5214067278287462e-05, + "loss": 0.6342, + "step": 1376 + }, + { + "epoch": 1.5795438244154354, + "grad_norm": 19.5, + "learning_rate": 1.5208970438328238e-05, + "loss": 0.2352, + "step": 1377 + }, + { + "epoch": 1.5806914359489312, + "grad_norm": 17.125, + "learning_rate": 1.5203873598369012e-05, + "loss": 0.5829, + "step": 1378 + }, + { + "epoch": 1.5818390474824273, + "grad_norm": 40.25, + "learning_rate": 1.5198776758409788e-05, + "loss": 0.5567, + "step": 1379 + }, + { + "epoch": 1.5829866590159232, + "grad_norm": 20.0, + "learning_rate": 1.5193679918450561e-05, + "loss": 0.6663, + "step": 1380 + }, + { + "epoch": 1.584134270549419, + "grad_norm": 84.0, + "learning_rate": 1.5188583078491337e-05, + "loss": 0.6399, + "step": 1381 + }, + { + "epoch": 1.5852818820829149, + "grad_norm": 13.5, + "learning_rate": 1.5183486238532111e-05, + "loss": 0.3913, + "step": 1382 + }, + { + "epoch": 1.5864294936164107, + "grad_norm": 39.75, + "learning_rate": 1.5178389398572887e-05, + "loss": 0.4537, + "step": 1383 + }, + { + "epoch": 1.5875771051499068, + "grad_norm": 10.3125, + "learning_rate": 1.5173292558613662e-05, + "loss": 0.5198, + "step": 1384 + }, + { + "epoch": 1.5887247166834026, + "grad_norm": 26.75, + "learning_rate": 1.5168195718654434e-05, + "loss": 0.4686, + "step": 1385 + }, + { + "epoch": 1.5898723282168987, + "grad_norm": 57.25, + "learning_rate": 1.516309887869521e-05, + "loss": 0.5172, + "step": 1386 + }, + { + "epoch": 1.5910199397503946, + "grad_norm": 82.0, + "learning_rate": 1.5158002038735984e-05, + "loss": 0.9411, + "step": 1387 + }, + { + "epoch": 1.5921675512838904, + "grad_norm": 31.0, + "learning_rate": 1.515290519877676e-05, + "loss": 0.3182, + "step": 1388 + }, + { + "epoch": 1.5933151628173863, + "grad_norm": 79.5, + "learning_rate": 1.5147808358817535e-05, + "loss": 0.7013, + "step": 1389 + }, + { + "epoch": 1.5944627743508821, + "grad_norm": 17.875, + "learning_rate": 1.5142711518858309e-05, + "loss": 0.5569, + "step": 1390 + }, + { + "epoch": 1.595610385884378, + "grad_norm": 23.375, + "learning_rate": 1.5137614678899085e-05, + "loss": 0.5306, + "step": 1391 + }, + { + "epoch": 1.596757997417874, + "grad_norm": 11.5, + "learning_rate": 1.5132517838939857e-05, + "loss": 0.2887, + "step": 1392 + }, + { + "epoch": 1.5979056089513701, + "grad_norm": 22.5, + "learning_rate": 1.5127420998980632e-05, + "loss": 0.5286, + "step": 1393 + }, + { + "epoch": 1.599053220484866, + "grad_norm": 21.0, + "learning_rate": 1.5122324159021408e-05, + "loss": 0.3716, + "step": 1394 + }, + { + "epoch": 1.6002008320183618, + "grad_norm": 63.25, + "learning_rate": 1.5117227319062182e-05, + "loss": 0.7257, + "step": 1395 + }, + { + "epoch": 1.6013484435518577, + "grad_norm": 7.6875, + "learning_rate": 1.5112130479102958e-05, + "loss": 0.127, + "step": 1396 + }, + { + "epoch": 1.6024960550853535, + "grad_norm": 17.0, + "learning_rate": 1.5107033639143731e-05, + "loss": 0.2272, + "step": 1397 + }, + { + "epoch": 1.6036436666188494, + "grad_norm": 30.875, + "learning_rate": 1.5101936799184507e-05, + "loss": 0.4778, + "step": 1398 + }, + { + "epoch": 1.6047912781523455, + "grad_norm": 19.5, + "learning_rate": 1.5096839959225283e-05, + "loss": 0.5537, + "step": 1399 + }, + { + "epoch": 1.6059388896858413, + "grad_norm": 57.5, + "learning_rate": 1.5091743119266057e-05, + "loss": 0.6817, + "step": 1400 + }, + { + "epoch": 1.6059388896858413, + "eval_accuracy": 0.63, + "eval_loss": 0.49080872535705566, + "eval_runtime": 49.7511, + "eval_samples_per_second": 2.01, + "eval_steps_per_second": 2.01, + "step": 1400 + }, + { + "epoch": 1.6070865012193374, + "grad_norm": 14.125, + "learning_rate": 1.5086646279306832e-05, + "loss": 0.6062, + "step": 1401 + }, + { + "epoch": 1.6082341127528332, + "grad_norm": 9.625, + "learning_rate": 1.5081549439347604e-05, + "loss": 0.2577, + "step": 1402 + }, + { + "epoch": 1.609381724286329, + "grad_norm": 19.25, + "learning_rate": 1.507645259938838e-05, + "loss": 0.4531, + "step": 1403 + }, + { + "epoch": 1.610529335819825, + "grad_norm": 41.0, + "learning_rate": 1.5071355759429156e-05, + "loss": 0.6092, + "step": 1404 + }, + { + "epoch": 1.6116769473533208, + "grad_norm": 34.0, + "learning_rate": 1.506625891946993e-05, + "loss": 0.4515, + "step": 1405 + }, + { + "epoch": 1.6128245588868169, + "grad_norm": 23.75, + "learning_rate": 1.5061162079510705e-05, + "loss": 0.5269, + "step": 1406 + }, + { + "epoch": 1.6139721704203127, + "grad_norm": 31.75, + "learning_rate": 1.5056065239551479e-05, + "loss": 0.4641, + "step": 1407 + }, + { + "epoch": 1.6151197819538088, + "grad_norm": 33.75, + "learning_rate": 1.5050968399592255e-05, + "loss": 0.3172, + "step": 1408 + }, + { + "epoch": 1.6162673934873046, + "grad_norm": 9.9375, + "learning_rate": 1.504587155963303e-05, + "loss": 0.231, + "step": 1409 + }, + { + "epoch": 1.6174150050208005, + "grad_norm": 10.5, + "learning_rate": 1.5040774719673802e-05, + "loss": 0.19, + "step": 1410 + }, + { + "epoch": 1.6185626165542963, + "grad_norm": 26.375, + "learning_rate": 1.5035677879714578e-05, + "loss": 0.6969, + "step": 1411 + }, + { + "epoch": 1.6197102280877922, + "grad_norm": 11.5625, + "learning_rate": 1.5030581039755352e-05, + "loss": 0.3084, + "step": 1412 + }, + { + "epoch": 1.620857839621288, + "grad_norm": 33.5, + "learning_rate": 1.5025484199796127e-05, + "loss": 0.9029, + "step": 1413 + }, + { + "epoch": 1.6220054511547841, + "grad_norm": 26.75, + "learning_rate": 1.5020387359836903e-05, + "loss": 0.7869, + "step": 1414 + }, + { + "epoch": 1.62315306268828, + "grad_norm": 18.5, + "learning_rate": 1.5015290519877677e-05, + "loss": 0.555, + "step": 1415 + }, + { + "epoch": 1.624300674221776, + "grad_norm": 26.25, + "learning_rate": 1.5010193679918453e-05, + "loss": 0.8343, + "step": 1416 + }, + { + "epoch": 1.625448285755272, + "grad_norm": 18.625, + "learning_rate": 1.5005096839959225e-05, + "loss": 0.4117, + "step": 1417 + }, + { + "epoch": 1.6265958972887677, + "grad_norm": 30.75, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.3806, + "step": 1418 + }, + { + "epoch": 1.6277435088222636, + "grad_norm": 24.375, + "learning_rate": 1.4994903160040778e-05, + "loss": 0.4463, + "step": 1419 + }, + { + "epoch": 1.6288911203557594, + "grad_norm": 14.5, + "learning_rate": 1.498980632008155e-05, + "loss": 0.1973, + "step": 1420 + }, + { + "epoch": 1.6300387318892555, + "grad_norm": 19.25, + "learning_rate": 1.4984709480122325e-05, + "loss": 0.689, + "step": 1421 + }, + { + "epoch": 1.6311863434227514, + "grad_norm": 21.125, + "learning_rate": 1.49796126401631e-05, + "loss": 0.5135, + "step": 1422 + }, + { + "epoch": 1.6323339549562474, + "grad_norm": 29.5, + "learning_rate": 1.4974515800203875e-05, + "loss": 0.3149, + "step": 1423 + }, + { + "epoch": 1.6334815664897433, + "grad_norm": 13.1875, + "learning_rate": 1.496941896024465e-05, + "loss": 0.2246, + "step": 1424 + }, + { + "epoch": 1.6346291780232391, + "grad_norm": 76.0, + "learning_rate": 1.4964322120285424e-05, + "loss": 0.7469, + "step": 1425 + }, + { + "epoch": 1.635776789556735, + "grad_norm": 90.5, + "learning_rate": 1.49592252803262e-05, + "loss": 0.8995, + "step": 1426 + }, + { + "epoch": 1.6369244010902309, + "grad_norm": 23.25, + "learning_rate": 1.4954128440366972e-05, + "loss": 0.7444, + "step": 1427 + }, + { + "epoch": 1.6380720126237267, + "grad_norm": 18.0, + "learning_rate": 1.4949031600407748e-05, + "loss": 0.4238, + "step": 1428 + }, + { + "epoch": 1.6392196241572228, + "grad_norm": 31.25, + "learning_rate": 1.4943934760448523e-05, + "loss": 0.4251, + "step": 1429 + }, + { + "epoch": 1.6403672356907189, + "grad_norm": 20.0, + "learning_rate": 1.4938837920489297e-05, + "loss": 0.6363, + "step": 1430 + }, + { + "epoch": 1.6415148472242147, + "grad_norm": 42.0, + "learning_rate": 1.4933741080530073e-05, + "loss": 0.6076, + "step": 1431 + }, + { + "epoch": 1.6426624587577106, + "grad_norm": 62.0, + "learning_rate": 1.4928644240570847e-05, + "loss": 0.5137, + "step": 1432 + }, + { + "epoch": 1.6438100702912064, + "grad_norm": 26.125, + "learning_rate": 1.4923547400611623e-05, + "loss": 0.4482, + "step": 1433 + }, + { + "epoch": 1.6449576818247023, + "grad_norm": 66.0, + "learning_rate": 1.4918450560652398e-05, + "loss": 0.9944, + "step": 1434 + }, + { + "epoch": 1.646105293358198, + "grad_norm": 21.25, + "learning_rate": 1.491335372069317e-05, + "loss": 0.2451, + "step": 1435 + }, + { + "epoch": 1.6472529048916942, + "grad_norm": 18.0, + "learning_rate": 1.4908256880733946e-05, + "loss": 0.699, + "step": 1436 + }, + { + "epoch": 1.64840051642519, + "grad_norm": 17.125, + "learning_rate": 1.490316004077472e-05, + "loss": 0.4074, + "step": 1437 + }, + { + "epoch": 1.649548127958686, + "grad_norm": 59.25, + "learning_rate": 1.4898063200815495e-05, + "loss": 0.4132, + "step": 1438 + }, + { + "epoch": 1.650695739492182, + "grad_norm": 20.5, + "learning_rate": 1.4892966360856271e-05, + "loss": 0.339, + "step": 1439 + }, + { + "epoch": 1.6518433510256778, + "grad_norm": 27.0, + "learning_rate": 1.4887869520897045e-05, + "loss": 0.3662, + "step": 1440 + }, + { + "epoch": 1.6529909625591737, + "grad_norm": 9.5625, + "learning_rate": 1.488277268093782e-05, + "loss": 0.3819, + "step": 1441 + }, + { + "epoch": 1.6541385740926695, + "grad_norm": 130.0, + "learning_rate": 1.4877675840978594e-05, + "loss": 0.4787, + "step": 1442 + }, + { + "epoch": 1.6552861856261656, + "grad_norm": 29.375, + "learning_rate": 1.487257900101937e-05, + "loss": 0.4502, + "step": 1443 + }, + { + "epoch": 1.6564337971596614, + "grad_norm": 28.125, + "learning_rate": 1.4867482161060146e-05, + "loss": 0.4953, + "step": 1444 + }, + { + "epoch": 1.6575814086931575, + "grad_norm": 36.0, + "learning_rate": 1.4862385321100918e-05, + "loss": 0.9421, + "step": 1445 + }, + { + "epoch": 1.6587290202266534, + "grad_norm": 35.0, + "learning_rate": 1.4857288481141693e-05, + "loss": 0.3018, + "step": 1446 + }, + { + "epoch": 1.6598766317601492, + "grad_norm": 18.625, + "learning_rate": 1.4852191641182467e-05, + "loss": 0.2527, + "step": 1447 + }, + { + "epoch": 1.661024243293645, + "grad_norm": 12.5, + "learning_rate": 1.4847094801223243e-05, + "loss": 0.3482, + "step": 1448 + }, + { + "epoch": 1.662171854827141, + "grad_norm": 18.75, + "learning_rate": 1.4841997961264019e-05, + "loss": 0.1798, + "step": 1449 + }, + { + "epoch": 1.6633194663606368, + "grad_norm": 6.34375, + "learning_rate": 1.4836901121304792e-05, + "loss": 0.1118, + "step": 1450 + }, + { + "epoch": 1.6644670778941328, + "grad_norm": 21.375, + "learning_rate": 1.4831804281345568e-05, + "loss": 0.5154, + "step": 1451 + }, + { + "epoch": 1.665614689427629, + "grad_norm": 57.75, + "learning_rate": 1.482670744138634e-05, + "loss": 0.845, + "step": 1452 + }, + { + "epoch": 1.6667623009611248, + "grad_norm": 31.875, + "learning_rate": 1.4821610601427116e-05, + "loss": 0.6743, + "step": 1453 + }, + { + "epoch": 1.6679099124946206, + "grad_norm": 30.5, + "learning_rate": 1.4816513761467891e-05, + "loss": 0.6286, + "step": 1454 + }, + { + "epoch": 1.6690575240281165, + "grad_norm": 26.25, + "learning_rate": 1.4811416921508665e-05, + "loss": 0.2807, + "step": 1455 + }, + { + "epoch": 1.6702051355616123, + "grad_norm": 21.25, + "learning_rate": 1.4806320081549441e-05, + "loss": 0.5438, + "step": 1456 + }, + { + "epoch": 1.6713527470951082, + "grad_norm": 15.875, + "learning_rate": 1.4801223241590215e-05, + "loss": 0.4873, + "step": 1457 + }, + { + "epoch": 1.6725003586286042, + "grad_norm": 12.3125, + "learning_rate": 1.479612640163099e-05, + "loss": 0.2455, + "step": 1458 + }, + { + "epoch": 1.6736479701621, + "grad_norm": 36.25, + "learning_rate": 1.4791029561671764e-05, + "loss": 0.6741, + "step": 1459 + }, + { + "epoch": 1.6747955816955962, + "grad_norm": 36.25, + "learning_rate": 1.478593272171254e-05, + "loss": 0.2113, + "step": 1460 + }, + { + "epoch": 1.675943193229092, + "grad_norm": 15.0625, + "learning_rate": 1.4780835881753316e-05, + "loss": 0.3223, + "step": 1461 + }, + { + "epoch": 1.6770908047625879, + "grad_norm": 70.5, + "learning_rate": 1.4775739041794088e-05, + "loss": 0.7413, + "step": 1462 + }, + { + "epoch": 1.6782384162960837, + "grad_norm": 50.25, + "learning_rate": 1.4770642201834863e-05, + "loss": 0.5802, + "step": 1463 + }, + { + "epoch": 1.6793860278295796, + "grad_norm": 13.3125, + "learning_rate": 1.4765545361875637e-05, + "loss": 0.3988, + "step": 1464 + }, + { + "epoch": 1.6805336393630756, + "grad_norm": 22.125, + "learning_rate": 1.4760448521916413e-05, + "loss": 0.2763, + "step": 1465 + }, + { + "epoch": 1.6816812508965715, + "grad_norm": 63.5, + "learning_rate": 1.4755351681957188e-05, + "loss": 0.5855, + "step": 1466 + }, + { + "epoch": 1.6828288624300676, + "grad_norm": 41.75, + "learning_rate": 1.4750254841997962e-05, + "loss": 0.413, + "step": 1467 + }, + { + "epoch": 1.6839764739635634, + "grad_norm": 75.0, + "learning_rate": 1.4745158002038738e-05, + "loss": 1.2905, + "step": 1468 + }, + { + "epoch": 1.6851240854970593, + "grad_norm": 35.0, + "learning_rate": 1.474006116207951e-05, + "loss": 0.5774, + "step": 1469 + }, + { + "epoch": 1.6862716970305551, + "grad_norm": 28.25, + "learning_rate": 1.4734964322120286e-05, + "loss": 0.8901, + "step": 1470 + }, + { + "epoch": 1.687419308564051, + "grad_norm": 35.0, + "learning_rate": 1.4729867482161061e-05, + "loss": 0.457, + "step": 1471 + }, + { + "epoch": 1.6885669200975468, + "grad_norm": 34.0, + "learning_rate": 1.4724770642201835e-05, + "loss": 0.4638, + "step": 1472 + }, + { + "epoch": 1.689714531631043, + "grad_norm": 37.5, + "learning_rate": 1.4719673802242611e-05, + "loss": 0.5084, + "step": 1473 + }, + { + "epoch": 1.6908621431645388, + "grad_norm": 26.625, + "learning_rate": 1.4714576962283385e-05, + "loss": 0.2821, + "step": 1474 + }, + { + "epoch": 1.6920097546980348, + "grad_norm": 34.25, + "learning_rate": 1.470948012232416e-05, + "loss": 0.3812, + "step": 1475 + }, + { + "epoch": 1.6931573662315307, + "grad_norm": 59.0, + "learning_rate": 1.4704383282364936e-05, + "loss": 0.5477, + "step": 1476 + }, + { + "epoch": 1.6943049777650265, + "grad_norm": 28.25, + "learning_rate": 1.469928644240571e-05, + "loss": 0.6984, + "step": 1477 + }, + { + "epoch": 1.6954525892985224, + "grad_norm": 69.5, + "learning_rate": 1.4694189602446486e-05, + "loss": 0.7855, + "step": 1478 + }, + { + "epoch": 1.6966002008320182, + "grad_norm": 49.0, + "learning_rate": 1.4689092762487258e-05, + "loss": 0.984, + "step": 1479 + }, + { + "epoch": 1.6977478123655143, + "grad_norm": 22.875, + "learning_rate": 1.4683995922528033e-05, + "loss": 0.6088, + "step": 1480 + }, + { + "epoch": 1.6988954238990102, + "grad_norm": 17.875, + "learning_rate": 1.4678899082568809e-05, + "loss": 0.1793, + "step": 1481 + }, + { + "epoch": 1.7000430354325062, + "grad_norm": 22.375, + "learning_rate": 1.4673802242609583e-05, + "loss": 0.4399, + "step": 1482 + }, + { + "epoch": 1.701190646966002, + "grad_norm": 44.75, + "learning_rate": 1.4668705402650358e-05, + "loss": 0.8196, + "step": 1483 + }, + { + "epoch": 1.702338258499498, + "grad_norm": 17.75, + "learning_rate": 1.4663608562691132e-05, + "loss": 0.3481, + "step": 1484 + }, + { + "epoch": 1.7034858700329938, + "grad_norm": 30.75, + "learning_rate": 1.4658511722731908e-05, + "loss": 0.5881, + "step": 1485 + }, + { + "epoch": 1.7046334815664896, + "grad_norm": 54.5, + "learning_rate": 1.4653414882772684e-05, + "loss": 0.9103, + "step": 1486 + }, + { + "epoch": 1.7057810930999855, + "grad_norm": 22.0, + "learning_rate": 1.4648318042813456e-05, + "loss": 0.9757, + "step": 1487 + }, + { + "epoch": 1.7069287046334816, + "grad_norm": 41.25, + "learning_rate": 1.4643221202854231e-05, + "loss": 0.2791, + "step": 1488 + }, + { + "epoch": 1.7080763161669776, + "grad_norm": 72.5, + "learning_rate": 1.4638124362895005e-05, + "loss": 0.6413, + "step": 1489 + }, + { + "epoch": 1.7092239277004735, + "grad_norm": 31.25, + "learning_rate": 1.463302752293578e-05, + "loss": 0.6097, + "step": 1490 + }, + { + "epoch": 1.7103715392339693, + "grad_norm": 31.625, + "learning_rate": 1.4627930682976556e-05, + "loss": 0.6532, + "step": 1491 + }, + { + "epoch": 1.7115191507674652, + "grad_norm": 23.75, + "learning_rate": 1.462283384301733e-05, + "loss": 0.5511, + "step": 1492 + }, + { + "epoch": 1.712666762300961, + "grad_norm": 44.25, + "learning_rate": 1.4617737003058106e-05, + "loss": 0.5933, + "step": 1493 + }, + { + "epoch": 1.713814373834457, + "grad_norm": 175.0, + "learning_rate": 1.461264016309888e-05, + "loss": 0.8476, + "step": 1494 + }, + { + "epoch": 1.714961985367953, + "grad_norm": 12.0625, + "learning_rate": 1.4607543323139655e-05, + "loss": 0.2916, + "step": 1495 + }, + { + "epoch": 1.7161095969014488, + "grad_norm": 40.0, + "learning_rate": 1.4602446483180431e-05, + "loss": 0.4779, + "step": 1496 + }, + { + "epoch": 1.717257208434945, + "grad_norm": 19.25, + "learning_rate": 1.4597349643221203e-05, + "loss": 0.3403, + "step": 1497 + }, + { + "epoch": 1.7184048199684407, + "grad_norm": 20.125, + "learning_rate": 1.4592252803261979e-05, + "loss": 0.4278, + "step": 1498 + }, + { + "epoch": 1.7195524315019366, + "grad_norm": 11.125, + "learning_rate": 1.4587155963302753e-05, + "loss": 0.4435, + "step": 1499 + }, + { + "epoch": 1.7207000430354324, + "grad_norm": 47.75, + "learning_rate": 1.4582059123343528e-05, + "loss": 0.6405, + "step": 1500 + }, + { + "epoch": 1.7207000430354324, + "eval_accuracy": 0.64, + "eval_loss": 0.4719592034816742, + "eval_runtime": 49.6324, + "eval_samples_per_second": 2.015, + "eval_steps_per_second": 2.015, + "step": 1500 + }, + { + "epoch": 1.7218476545689283, + "grad_norm": 11.5, + "learning_rate": 1.4576962283384304e-05, + "loss": 0.3975, + "step": 1501 + }, + { + "epoch": 1.7229952661024244, + "grad_norm": 20.875, + "learning_rate": 1.4571865443425078e-05, + "loss": 0.3939, + "step": 1502 + }, + { + "epoch": 1.7241428776359202, + "grad_norm": 44.25, + "learning_rate": 1.4566768603465853e-05, + "loss": 0.7124, + "step": 1503 + }, + { + "epoch": 1.7252904891694163, + "grad_norm": 33.0, + "learning_rate": 1.4561671763506626e-05, + "loss": 0.5179, + "step": 1504 + }, + { + "epoch": 1.7264381007029121, + "grad_norm": 13.9375, + "learning_rate": 1.4556574923547401e-05, + "loss": 0.6342, + "step": 1505 + }, + { + "epoch": 1.727585712236408, + "grad_norm": 20.75, + "learning_rate": 1.4551478083588177e-05, + "loss": 0.397, + "step": 1506 + }, + { + "epoch": 1.7287333237699039, + "grad_norm": 12.375, + "learning_rate": 1.454638124362895e-05, + "loss": 0.3495, + "step": 1507 + }, + { + "epoch": 1.7298809353033997, + "grad_norm": 53.75, + "learning_rate": 1.4541284403669726e-05, + "loss": 0.5092, + "step": 1508 + }, + { + "epoch": 1.7310285468368956, + "grad_norm": 14.25, + "learning_rate": 1.45361875637105e-05, + "loss": 0.1927, + "step": 1509 + }, + { + "epoch": 1.7321761583703916, + "grad_norm": 20.875, + "learning_rate": 1.4531090723751276e-05, + "loss": 0.7156, + "step": 1510 + }, + { + "epoch": 1.7333237699038875, + "grad_norm": 8.6875, + "learning_rate": 1.4525993883792051e-05, + "loss": 0.3399, + "step": 1511 + }, + { + "epoch": 1.7344713814373836, + "grad_norm": 16.125, + "learning_rate": 1.4520897043832824e-05, + "loss": 0.5978, + "step": 1512 + }, + { + "epoch": 1.7356189929708794, + "grad_norm": 42.0, + "learning_rate": 1.45158002038736e-05, + "loss": 0.9311, + "step": 1513 + }, + { + "epoch": 1.7367666045043753, + "grad_norm": 70.5, + "learning_rate": 1.4510703363914373e-05, + "loss": 0.7334, + "step": 1514 + }, + { + "epoch": 1.737914216037871, + "grad_norm": 16.625, + "learning_rate": 1.4505606523955149e-05, + "loss": 0.4106, + "step": 1515 + }, + { + "epoch": 1.739061827571367, + "grad_norm": 12.0, + "learning_rate": 1.4500509683995924e-05, + "loss": 0.2984, + "step": 1516 + }, + { + "epoch": 1.740209439104863, + "grad_norm": 27.125, + "learning_rate": 1.4495412844036698e-05, + "loss": 0.3245, + "step": 1517 + }, + { + "epoch": 1.7413570506383589, + "grad_norm": 40.25, + "learning_rate": 1.4490316004077474e-05, + "loss": 0.5248, + "step": 1518 + }, + { + "epoch": 1.742504662171855, + "grad_norm": 15.5, + "learning_rate": 1.4485219164118248e-05, + "loss": 0.3244, + "step": 1519 + }, + { + "epoch": 1.7436522737053508, + "grad_norm": 70.5, + "learning_rate": 1.4480122324159023e-05, + "loss": 0.9236, + "step": 1520 + }, + { + "epoch": 1.7447998852388467, + "grad_norm": 30.625, + "learning_rate": 1.4475025484199799e-05, + "loss": 0.8874, + "step": 1521 + }, + { + "epoch": 1.7459474967723425, + "grad_norm": 11.6875, + "learning_rate": 1.4469928644240571e-05, + "loss": 0.3286, + "step": 1522 + }, + { + "epoch": 1.7470951083058384, + "grad_norm": 26.875, + "learning_rate": 1.4464831804281347e-05, + "loss": 0.3404, + "step": 1523 + }, + { + "epoch": 1.7482427198393344, + "grad_norm": 15.375, + "learning_rate": 1.445973496432212e-05, + "loss": 0.4482, + "step": 1524 + }, + { + "epoch": 1.7493903313728303, + "grad_norm": 27.0, + "learning_rate": 1.4454638124362896e-05, + "loss": 0.476, + "step": 1525 + }, + { + "epoch": 1.7505379429063264, + "grad_norm": 20.5, + "learning_rate": 1.4449541284403672e-05, + "loss": 0.3796, + "step": 1526 + }, + { + "epoch": 1.7516855544398222, + "grad_norm": 47.75, + "learning_rate": 1.4444444444444446e-05, + "loss": 0.5618, + "step": 1527 + }, + { + "epoch": 1.752833165973318, + "grad_norm": 29.5, + "learning_rate": 1.4439347604485221e-05, + "loss": 0.4359, + "step": 1528 + }, + { + "epoch": 1.753980777506814, + "grad_norm": 52.25, + "learning_rate": 1.4434250764525994e-05, + "loss": 0.6163, + "step": 1529 + }, + { + "epoch": 1.7551283890403098, + "grad_norm": 19.125, + "learning_rate": 1.442915392456677e-05, + "loss": 0.5202, + "step": 1530 + }, + { + "epoch": 1.7562760005738056, + "grad_norm": 14.0, + "learning_rate": 1.4424057084607545e-05, + "loss": 0.3921, + "step": 1531 + }, + { + "epoch": 1.7574236121073017, + "grad_norm": 64.0, + "learning_rate": 1.4418960244648319e-05, + "loss": 0.7896, + "step": 1532 + }, + { + "epoch": 1.7585712236407975, + "grad_norm": 23.5, + "learning_rate": 1.4413863404689094e-05, + "loss": 0.4141, + "step": 1533 + }, + { + "epoch": 1.7597188351742936, + "grad_norm": 39.75, + "learning_rate": 1.4408766564729868e-05, + "loss": 0.8279, + "step": 1534 + }, + { + "epoch": 1.7608664467077895, + "grad_norm": 60.5, + "learning_rate": 1.4403669724770644e-05, + "loss": 0.6541, + "step": 1535 + }, + { + "epoch": 1.7620140582412853, + "grad_norm": 22.375, + "learning_rate": 1.4398572884811418e-05, + "loss": 0.4579, + "step": 1536 + }, + { + "epoch": 1.7631616697747812, + "grad_norm": 34.5, + "learning_rate": 1.4393476044852193e-05, + "loss": 0.3177, + "step": 1537 + }, + { + "epoch": 1.764309281308277, + "grad_norm": 13.9375, + "learning_rate": 1.4388379204892969e-05, + "loss": 0.405, + "step": 1538 + }, + { + "epoch": 1.765456892841773, + "grad_norm": 45.25, + "learning_rate": 1.4383282364933741e-05, + "loss": 0.4536, + "step": 1539 + }, + { + "epoch": 1.766604504375269, + "grad_norm": 15.0, + "learning_rate": 1.4378185524974517e-05, + "loss": 0.658, + "step": 1540 + }, + { + "epoch": 1.767752115908765, + "grad_norm": 23.125, + "learning_rate": 1.437308868501529e-05, + "loss": 0.5647, + "step": 1541 + }, + { + "epoch": 1.7688997274422609, + "grad_norm": 49.5, + "learning_rate": 1.4367991845056066e-05, + "loss": 0.6544, + "step": 1542 + }, + { + "epoch": 1.7700473389757567, + "grad_norm": 14.625, + "learning_rate": 1.4362895005096842e-05, + "loss": 0.3288, + "step": 1543 + }, + { + "epoch": 1.7711949505092526, + "grad_norm": 14.875, + "learning_rate": 1.4357798165137616e-05, + "loss": 0.5407, + "step": 1544 + }, + { + "epoch": 1.7723425620427484, + "grad_norm": 69.0, + "learning_rate": 1.4352701325178391e-05, + "loss": 0.4395, + "step": 1545 + }, + { + "epoch": 1.7734901735762443, + "grad_norm": 32.5, + "learning_rate": 1.4347604485219164e-05, + "loss": 0.4165, + "step": 1546 + }, + { + "epoch": 1.7746377851097404, + "grad_norm": 52.25, + "learning_rate": 1.434250764525994e-05, + "loss": 0.455, + "step": 1547 + }, + { + "epoch": 1.7757853966432364, + "grad_norm": 26.875, + "learning_rate": 1.4337410805300715e-05, + "loss": 0.5133, + "step": 1548 + }, + { + "epoch": 1.7769330081767323, + "grad_norm": 63.75, + "learning_rate": 1.4332313965341489e-05, + "loss": 0.8173, + "step": 1549 + }, + { + "epoch": 1.7780806197102281, + "grad_norm": 69.5, + "learning_rate": 1.4327217125382264e-05, + "loss": 0.7585, + "step": 1550 + }, + { + "epoch": 1.779228231243724, + "grad_norm": 12.25, + "learning_rate": 1.4322120285423038e-05, + "loss": 0.4586, + "step": 1551 + }, + { + "epoch": 1.7803758427772198, + "grad_norm": 76.0, + "learning_rate": 1.4317023445463814e-05, + "loss": 0.6924, + "step": 1552 + }, + { + "epoch": 1.7815234543107157, + "grad_norm": 12.4375, + "learning_rate": 1.431192660550459e-05, + "loss": 0.3333, + "step": 1553 + }, + { + "epoch": 1.7826710658442118, + "grad_norm": 23.5, + "learning_rate": 1.4306829765545363e-05, + "loss": 0.7329, + "step": 1554 + }, + { + "epoch": 1.7838186773777076, + "grad_norm": 22.875, + "learning_rate": 1.4301732925586139e-05, + "loss": 0.2949, + "step": 1555 + }, + { + "epoch": 1.7849662889112037, + "grad_norm": 52.0, + "learning_rate": 1.4296636085626911e-05, + "loss": 0.6708, + "step": 1556 + }, + { + "epoch": 1.7861139004446995, + "grad_norm": 75.0, + "learning_rate": 1.4291539245667687e-05, + "loss": 0.6416, + "step": 1557 + }, + { + "epoch": 1.7872615119781954, + "grad_norm": 16.0, + "learning_rate": 1.4286442405708462e-05, + "loss": 0.1615, + "step": 1558 + }, + { + "epoch": 1.7884091235116912, + "grad_norm": 13.8125, + "learning_rate": 1.4281345565749236e-05, + "loss": 0.2567, + "step": 1559 + }, + { + "epoch": 1.789556735045187, + "grad_norm": 27.125, + "learning_rate": 1.4276248725790012e-05, + "loss": 0.3011, + "step": 1560 + }, + { + "epoch": 1.7907043465786832, + "grad_norm": 37.5, + "learning_rate": 1.4271151885830786e-05, + "loss": 0.4136, + "step": 1561 + }, + { + "epoch": 1.791851958112179, + "grad_norm": 64.0, + "learning_rate": 1.4266055045871561e-05, + "loss": 0.5132, + "step": 1562 + }, + { + "epoch": 1.792999569645675, + "grad_norm": 23.5, + "learning_rate": 1.4260958205912337e-05, + "loss": 0.8581, + "step": 1563 + }, + { + "epoch": 1.794147181179171, + "grad_norm": 35.75, + "learning_rate": 1.4255861365953109e-05, + "loss": 0.4336, + "step": 1564 + }, + { + "epoch": 1.7952947927126668, + "grad_norm": 34.5, + "learning_rate": 1.4250764525993885e-05, + "loss": 0.7922, + "step": 1565 + }, + { + "epoch": 1.7964424042461626, + "grad_norm": 12.375, + "learning_rate": 1.4245667686034659e-05, + "loss": 0.385, + "step": 1566 + }, + { + "epoch": 1.7975900157796585, + "grad_norm": 22.125, + "learning_rate": 1.4240570846075434e-05, + "loss": 0.1375, + "step": 1567 + }, + { + "epoch": 1.7987376273131543, + "grad_norm": 49.25, + "learning_rate": 1.423547400611621e-05, + "loss": 0.2854, + "step": 1568 + }, + { + "epoch": 1.7998852388466504, + "grad_norm": 74.5, + "learning_rate": 1.4230377166156984e-05, + "loss": 0.8727, + "step": 1569 + }, + { + "epoch": 1.8010328503801463, + "grad_norm": 5.875, + "learning_rate": 1.422528032619776e-05, + "loss": 0.0896, + "step": 1570 + }, + { + "epoch": 1.8021804619136423, + "grad_norm": 23.0, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.709, + "step": 1571 + }, + { + "epoch": 1.8033280734471382, + "grad_norm": 8.5, + "learning_rate": 1.4215086646279309e-05, + "loss": 0.1583, + "step": 1572 + }, + { + "epoch": 1.804475684980634, + "grad_norm": 45.25, + "learning_rate": 1.4209989806320084e-05, + "loss": 0.3676, + "step": 1573 + }, + { + "epoch": 1.80562329651413, + "grad_norm": 14.125, + "learning_rate": 1.4204892966360857e-05, + "loss": 0.2197, + "step": 1574 + }, + { + "epoch": 1.8067709080476257, + "grad_norm": 8.6875, + "learning_rate": 1.4199796126401632e-05, + "loss": 0.2591, + "step": 1575 + }, + { + "epoch": 1.8079185195811218, + "grad_norm": 43.25, + "learning_rate": 1.4194699286442406e-05, + "loss": 0.4846, + "step": 1576 + }, + { + "epoch": 1.8090661311146177, + "grad_norm": 32.0, + "learning_rate": 1.4189602446483182e-05, + "loss": 0.2703, + "step": 1577 + }, + { + "epoch": 1.8102137426481137, + "grad_norm": 46.25, + "learning_rate": 1.4184505606523957e-05, + "loss": 0.6256, + "step": 1578 + }, + { + "epoch": 1.8113613541816096, + "grad_norm": 38.25, + "learning_rate": 1.4179408766564731e-05, + "loss": 1.0764, + "step": 1579 + }, + { + "epoch": 1.8125089657151054, + "grad_norm": 21.625, + "learning_rate": 1.4174311926605507e-05, + "loss": 0.1879, + "step": 1580 + }, + { + "epoch": 1.8136565772486013, + "grad_norm": 25.375, + "learning_rate": 1.4169215086646279e-05, + "loss": 0.8602, + "step": 1581 + }, + { + "epoch": 1.8148041887820971, + "grad_norm": 73.0, + "learning_rate": 1.4164118246687055e-05, + "loss": 0.6298, + "step": 1582 + }, + { + "epoch": 1.8159518003155932, + "grad_norm": 33.0, + "learning_rate": 1.415902140672783e-05, + "loss": 0.3714, + "step": 1583 + }, + { + "epoch": 1.817099411849089, + "grad_norm": 13.9375, + "learning_rate": 1.4153924566768604e-05, + "loss": 0.2252, + "step": 1584 + }, + { + "epoch": 1.8182470233825851, + "grad_norm": 42.5, + "learning_rate": 1.414882772680938e-05, + "loss": 0.577, + "step": 1585 + }, + { + "epoch": 1.819394634916081, + "grad_norm": 28.375, + "learning_rate": 1.4143730886850154e-05, + "loss": 0.5294, + "step": 1586 + }, + { + "epoch": 1.8205422464495769, + "grad_norm": 29.25, + "learning_rate": 1.413863404689093e-05, + "loss": 0.4661, + "step": 1587 + }, + { + "epoch": 1.8216898579830727, + "grad_norm": 15.6875, + "learning_rate": 1.4133537206931705e-05, + "loss": 0.358, + "step": 1588 + }, + { + "epoch": 1.8228374695165686, + "grad_norm": 42.0, + "learning_rate": 1.4128440366972477e-05, + "loss": 0.5276, + "step": 1589 + }, + { + "epoch": 1.8239850810500644, + "grad_norm": 98.5, + "learning_rate": 1.4123343527013254e-05, + "loss": 0.6566, + "step": 1590 + }, + { + "epoch": 1.8251326925835605, + "grad_norm": 37.0, + "learning_rate": 1.4118246687054027e-05, + "loss": 0.2234, + "step": 1591 + }, + { + "epoch": 1.8262803041170563, + "grad_norm": 49.5, + "learning_rate": 1.4113149847094802e-05, + "loss": 0.5727, + "step": 1592 + }, + { + "epoch": 1.8274279156505524, + "grad_norm": 31.75, + "learning_rate": 1.4108053007135578e-05, + "loss": 0.7391, + "step": 1593 + }, + { + "epoch": 1.8285755271840483, + "grad_norm": 81.0, + "learning_rate": 1.4102956167176352e-05, + "loss": 0.762, + "step": 1594 + }, + { + "epoch": 1.829723138717544, + "grad_norm": 56.0, + "learning_rate": 1.4097859327217127e-05, + "loss": 0.371, + "step": 1595 + }, + { + "epoch": 1.83087075025104, + "grad_norm": 33.75, + "learning_rate": 1.4092762487257901e-05, + "loss": 0.5857, + "step": 1596 + }, + { + "epoch": 1.8320183617845358, + "grad_norm": 15.0625, + "learning_rate": 1.4087665647298677e-05, + "loss": 0.2163, + "step": 1597 + }, + { + "epoch": 1.8331659733180319, + "grad_norm": 21.25, + "learning_rate": 1.4082568807339452e-05, + "loss": 0.4766, + "step": 1598 + }, + { + "epoch": 1.8343135848515277, + "grad_norm": 49.75, + "learning_rate": 1.4077471967380225e-05, + "loss": 0.3923, + "step": 1599 + }, + { + "epoch": 1.8354611963850238, + "grad_norm": 38.25, + "learning_rate": 1.4072375127421e-05, + "loss": 0.445, + "step": 1600 + }, + { + "epoch": 1.8354611963850238, + "eval_accuracy": 0.69, + "eval_loss": 0.5018435120582581, + "eval_runtime": 49.4827, + "eval_samples_per_second": 2.021, + "eval_steps_per_second": 2.021, + "step": 1600 + }, + { + "epoch": 1.8366088079185197, + "grad_norm": 36.25, + "learning_rate": 1.4067278287461774e-05, + "loss": 0.7721, + "step": 1601 + }, + { + "epoch": 1.8377564194520155, + "grad_norm": 26.875, + "learning_rate": 1.406218144750255e-05, + "loss": 0.9496, + "step": 1602 + }, + { + "epoch": 1.8389040309855114, + "grad_norm": 47.5, + "learning_rate": 1.4057084607543325e-05, + "loss": 0.5079, + "step": 1603 + }, + { + "epoch": 1.8400516425190072, + "grad_norm": 14.0625, + "learning_rate": 1.40519877675841e-05, + "loss": 0.2523, + "step": 1604 + }, + { + "epoch": 1.841199254052503, + "grad_norm": 36.5, + "learning_rate": 1.4046890927624875e-05, + "loss": 0.6013, + "step": 1605 + }, + { + "epoch": 1.8423468655859991, + "grad_norm": 33.25, + "learning_rate": 1.4041794087665647e-05, + "loss": 0.4822, + "step": 1606 + }, + { + "epoch": 1.8434944771194952, + "grad_norm": 12.6875, + "learning_rate": 1.4036697247706423e-05, + "loss": 0.4923, + "step": 1607 + }, + { + "epoch": 1.844642088652991, + "grad_norm": 15.0625, + "learning_rate": 1.4031600407747196e-05, + "loss": 0.2454, + "step": 1608 + }, + { + "epoch": 1.845789700186487, + "grad_norm": 20.5, + "learning_rate": 1.4026503567787972e-05, + "loss": 0.2125, + "step": 1609 + }, + { + "epoch": 1.8469373117199828, + "grad_norm": 18.875, + "learning_rate": 1.4021406727828748e-05, + "loss": 0.6114, + "step": 1610 + }, + { + "epoch": 1.8480849232534786, + "grad_norm": 37.25, + "learning_rate": 1.4016309887869522e-05, + "loss": 0.4515, + "step": 1611 + }, + { + "epoch": 1.8492325347869745, + "grad_norm": 10.5, + "learning_rate": 1.4011213047910297e-05, + "loss": 0.2101, + "step": 1612 + }, + { + "epoch": 1.8503801463204705, + "grad_norm": 33.5, + "learning_rate": 1.4006116207951071e-05, + "loss": 0.5889, + "step": 1613 + }, + { + "epoch": 1.8515277578539664, + "grad_norm": 19.875, + "learning_rate": 1.4001019367991847e-05, + "loss": 0.4676, + "step": 1614 + }, + { + "epoch": 1.8526753693874625, + "grad_norm": 51.0, + "learning_rate": 1.3995922528032622e-05, + "loss": 0.5021, + "step": 1615 + }, + { + "epoch": 1.8538229809209583, + "grad_norm": 38.0, + "learning_rate": 1.3990825688073395e-05, + "loss": 0.6099, + "step": 1616 + }, + { + "epoch": 1.8549705924544542, + "grad_norm": 49.75, + "learning_rate": 1.398572884811417e-05, + "loss": 0.6493, + "step": 1617 + }, + { + "epoch": 1.85611820398795, + "grad_norm": 14.5625, + "learning_rate": 1.3980632008154944e-05, + "loss": 0.1851, + "step": 1618 + }, + { + "epoch": 1.8572658155214459, + "grad_norm": 74.5, + "learning_rate": 1.397553516819572e-05, + "loss": 0.625, + "step": 1619 + }, + { + "epoch": 1.858413427054942, + "grad_norm": 49.25, + "learning_rate": 1.3970438328236495e-05, + "loss": 0.4501, + "step": 1620 + }, + { + "epoch": 1.8595610385884378, + "grad_norm": 36.0, + "learning_rate": 1.3965341488277269e-05, + "loss": 0.4769, + "step": 1621 + }, + { + "epoch": 1.8607086501219339, + "grad_norm": 72.5, + "learning_rate": 1.3960244648318045e-05, + "loss": 0.6018, + "step": 1622 + }, + { + "epoch": 1.8618562616554297, + "grad_norm": 28.75, + "learning_rate": 1.3955147808358817e-05, + "loss": 0.4446, + "step": 1623 + }, + { + "epoch": 1.8630038731889256, + "grad_norm": 58.5, + "learning_rate": 1.3950050968399593e-05, + "loss": 0.7133, + "step": 1624 + }, + { + "epoch": 1.8641514847224214, + "grad_norm": 14.6875, + "learning_rate": 1.3944954128440368e-05, + "loss": 0.2074, + "step": 1625 + }, + { + "epoch": 1.8652990962559173, + "grad_norm": 54.25, + "learning_rate": 1.3939857288481142e-05, + "loss": 0.3376, + "step": 1626 + }, + { + "epoch": 1.8664467077894131, + "grad_norm": 23.0, + "learning_rate": 1.3934760448521918e-05, + "loss": 0.5169, + "step": 1627 + }, + { + "epoch": 1.8675943193229092, + "grad_norm": 55.5, + "learning_rate": 1.3929663608562692e-05, + "loss": 0.3699, + "step": 1628 + }, + { + "epoch": 1.868741930856405, + "grad_norm": 18.125, + "learning_rate": 1.3924566768603467e-05, + "loss": 0.207, + "step": 1629 + }, + { + "epoch": 1.8698895423899011, + "grad_norm": 26.5, + "learning_rate": 1.3919469928644243e-05, + "loss": 0.4332, + "step": 1630 + }, + { + "epoch": 1.871037153923397, + "grad_norm": 22.75, + "learning_rate": 1.3914373088685017e-05, + "loss": 0.4837, + "step": 1631 + }, + { + "epoch": 1.8721847654568928, + "grad_norm": 69.5, + "learning_rate": 1.3909276248725792e-05, + "loss": 0.5754, + "step": 1632 + }, + { + "epoch": 1.8733323769903887, + "grad_norm": 16.25, + "learning_rate": 1.3904179408766564e-05, + "loss": 0.2141, + "step": 1633 + }, + { + "epoch": 1.8744799885238845, + "grad_norm": 35.75, + "learning_rate": 1.389908256880734e-05, + "loss": 0.3208, + "step": 1634 + }, + { + "epoch": 1.8756276000573806, + "grad_norm": 29.75, + "learning_rate": 1.3893985728848116e-05, + "loss": 0.6767, + "step": 1635 + }, + { + "epoch": 1.8767752115908765, + "grad_norm": 19.375, + "learning_rate": 1.388888888888889e-05, + "loss": 0.1118, + "step": 1636 + }, + { + "epoch": 1.8779228231243725, + "grad_norm": 15.8125, + "learning_rate": 1.3883792048929665e-05, + "loss": 0.1238, + "step": 1637 + }, + { + "epoch": 1.8790704346578684, + "grad_norm": 31.875, + "learning_rate": 1.3878695208970439e-05, + "loss": 0.5031, + "step": 1638 + }, + { + "epoch": 1.8802180461913642, + "grad_norm": 40.25, + "learning_rate": 1.3873598369011215e-05, + "loss": 0.8107, + "step": 1639 + }, + { + "epoch": 1.88136565772486, + "grad_norm": 25.0, + "learning_rate": 1.386850152905199e-05, + "loss": 0.3873, + "step": 1640 + }, + { + "epoch": 1.882513269258356, + "grad_norm": 78.0, + "learning_rate": 1.3863404689092762e-05, + "loss": 1.1926, + "step": 1641 + }, + { + "epoch": 1.883660880791852, + "grad_norm": 34.25, + "learning_rate": 1.3858307849133538e-05, + "loss": 0.5274, + "step": 1642 + }, + { + "epoch": 1.8848084923253479, + "grad_norm": 15.125, + "learning_rate": 1.3853211009174312e-05, + "loss": 0.4215, + "step": 1643 + }, + { + "epoch": 1.885956103858844, + "grad_norm": 28.0, + "learning_rate": 1.3848114169215088e-05, + "loss": 0.2697, + "step": 1644 + }, + { + "epoch": 1.8871037153923398, + "grad_norm": 34.25, + "learning_rate": 1.3843017329255863e-05, + "loss": 0.2025, + "step": 1645 + }, + { + "epoch": 1.8882513269258356, + "grad_norm": 91.5, + "learning_rate": 1.3837920489296637e-05, + "loss": 0.7438, + "step": 1646 + }, + { + "epoch": 1.8893989384593315, + "grad_norm": 88.0, + "learning_rate": 1.3832823649337413e-05, + "loss": 0.9659, + "step": 1647 + }, + { + "epoch": 1.8905465499928273, + "grad_norm": 26.875, + "learning_rate": 1.3827726809378187e-05, + "loss": 0.2307, + "step": 1648 + }, + { + "epoch": 1.8916941615263232, + "grad_norm": 13.375, + "learning_rate": 1.3822629969418962e-05, + "loss": 0.3359, + "step": 1649 + }, + { + "epoch": 1.8928417730598193, + "grad_norm": 72.0, + "learning_rate": 1.3817533129459738e-05, + "loss": 0.5043, + "step": 1650 + }, + { + "epoch": 1.8939893845933151, + "grad_norm": 46.75, + "learning_rate": 1.381243628950051e-05, + "loss": 0.4365, + "step": 1651 + }, + { + "epoch": 1.8951369961268112, + "grad_norm": 27.0, + "learning_rate": 1.3807339449541286e-05, + "loss": 0.4578, + "step": 1652 + }, + { + "epoch": 1.896284607660307, + "grad_norm": 49.75, + "learning_rate": 1.380224260958206e-05, + "loss": 0.645, + "step": 1653 + }, + { + "epoch": 1.897432219193803, + "grad_norm": 58.25, + "learning_rate": 1.3797145769622835e-05, + "loss": 0.7014, + "step": 1654 + }, + { + "epoch": 1.8985798307272987, + "grad_norm": 41.75, + "learning_rate": 1.379204892966361e-05, + "loss": 0.6419, + "step": 1655 + }, + { + "epoch": 1.8997274422607946, + "grad_norm": 49.5, + "learning_rate": 1.3786952089704385e-05, + "loss": 0.6695, + "step": 1656 + }, + { + "epoch": 1.9008750537942907, + "grad_norm": 34.5, + "learning_rate": 1.378185524974516e-05, + "loss": 0.324, + "step": 1657 + }, + { + "epoch": 1.9020226653277865, + "grad_norm": 14.25, + "learning_rate": 1.3776758409785932e-05, + "loss": 0.289, + "step": 1658 + }, + { + "epoch": 1.9031702768612826, + "grad_norm": 20.875, + "learning_rate": 1.3771661569826708e-05, + "loss": 0.2563, + "step": 1659 + }, + { + "epoch": 1.9043178883947784, + "grad_norm": 26.0, + "learning_rate": 1.3766564729867484e-05, + "loss": 0.7482, + "step": 1660 + }, + { + "epoch": 1.9054654999282743, + "grad_norm": 14.4375, + "learning_rate": 1.3761467889908258e-05, + "loss": 0.4048, + "step": 1661 + }, + { + "epoch": 1.9066131114617701, + "grad_norm": 51.25, + "learning_rate": 1.3756371049949033e-05, + "loss": 0.6209, + "step": 1662 + }, + { + "epoch": 1.907760722995266, + "grad_norm": 30.875, + "learning_rate": 1.3751274209989807e-05, + "loss": 0.7158, + "step": 1663 + }, + { + "epoch": 1.9089083345287619, + "grad_norm": 9.4375, + "learning_rate": 1.3746177370030583e-05, + "loss": 0.1511, + "step": 1664 + }, + { + "epoch": 1.910055946062258, + "grad_norm": 19.125, + "learning_rate": 1.3741080530071358e-05, + "loss": 0.3676, + "step": 1665 + }, + { + "epoch": 1.911203557595754, + "grad_norm": 19.875, + "learning_rate": 1.3735983690112132e-05, + "loss": 0.3149, + "step": 1666 + }, + { + "epoch": 1.9123511691292499, + "grad_norm": 22.25, + "learning_rate": 1.3730886850152908e-05, + "loss": 0.2507, + "step": 1667 + }, + { + "epoch": 1.9134987806627457, + "grad_norm": 13.0, + "learning_rate": 1.372579001019368e-05, + "loss": 0.5281, + "step": 1668 + }, + { + "epoch": 1.9146463921962416, + "grad_norm": 22.625, + "learning_rate": 1.3720693170234456e-05, + "loss": 0.3352, + "step": 1669 + }, + { + "epoch": 1.9157940037297374, + "grad_norm": 25.625, + "learning_rate": 1.3715596330275231e-05, + "loss": 0.3003, + "step": 1670 + }, + { + "epoch": 1.9169416152632333, + "grad_norm": 37.5, + "learning_rate": 1.3710499490316005e-05, + "loss": 0.2462, + "step": 1671 + }, + { + "epoch": 1.9180892267967293, + "grad_norm": 20.75, + "learning_rate": 1.370540265035678e-05, + "loss": 0.6685, + "step": 1672 + }, + { + "epoch": 1.9192368383302252, + "grad_norm": 30.0, + "learning_rate": 1.3700305810397555e-05, + "loss": 0.5793, + "step": 1673 + }, + { + "epoch": 1.9203844498637213, + "grad_norm": 67.5, + "learning_rate": 1.369520897043833e-05, + "loss": 0.5628, + "step": 1674 + }, + { + "epoch": 1.921532061397217, + "grad_norm": 68.0, + "learning_rate": 1.3690112130479106e-05, + "loss": 0.3445, + "step": 1675 + }, + { + "epoch": 1.922679672930713, + "grad_norm": 18.375, + "learning_rate": 1.3685015290519878e-05, + "loss": 0.3626, + "step": 1676 + }, + { + "epoch": 1.9238272844642088, + "grad_norm": 26.875, + "learning_rate": 1.3679918450560654e-05, + "loss": 0.8984, + "step": 1677 + }, + { + "epoch": 1.9249748959977047, + "grad_norm": 27.125, + "learning_rate": 1.3674821610601427e-05, + "loss": 0.4586, + "step": 1678 + }, + { + "epoch": 1.9261225075312007, + "grad_norm": 62.0, + "learning_rate": 1.3669724770642203e-05, + "loss": 0.7513, + "step": 1679 + }, + { + "epoch": 1.9272701190646966, + "grad_norm": 38.25, + "learning_rate": 1.3664627930682979e-05, + "loss": 0.4712, + "step": 1680 + }, + { + "epoch": 1.9284177305981927, + "grad_norm": 19.125, + "learning_rate": 1.3659531090723753e-05, + "loss": 0.2701, + "step": 1681 + }, + { + "epoch": 1.9295653421316885, + "grad_norm": 16.625, + "learning_rate": 1.3654434250764528e-05, + "loss": 0.3847, + "step": 1682 + }, + { + "epoch": 1.9307129536651844, + "grad_norm": 52.0, + "learning_rate": 1.36493374108053e-05, + "loss": 0.4352, + "step": 1683 + }, + { + "epoch": 1.9318605651986802, + "grad_norm": 100.0, + "learning_rate": 1.3644240570846076e-05, + "loss": 1.0839, + "step": 1684 + }, + { + "epoch": 1.933008176732176, + "grad_norm": 53.25, + "learning_rate": 1.363914373088685e-05, + "loss": 0.5791, + "step": 1685 + }, + { + "epoch": 1.934155788265672, + "grad_norm": 55.5, + "learning_rate": 1.3634046890927625e-05, + "loss": 0.7248, + "step": 1686 + }, + { + "epoch": 1.935303399799168, + "grad_norm": 16.5, + "learning_rate": 1.3628950050968401e-05, + "loss": 0.3914, + "step": 1687 + }, + { + "epoch": 1.9364510113326638, + "grad_norm": 43.5, + "learning_rate": 1.3623853211009175e-05, + "loss": 0.348, + "step": 1688 + }, + { + "epoch": 1.93759862286616, + "grad_norm": 34.25, + "learning_rate": 1.361875637104995e-05, + "loss": 0.4504, + "step": 1689 + }, + { + "epoch": 1.9387462343996558, + "grad_norm": 32.5, + "learning_rate": 1.3613659531090724e-05, + "loss": 0.4256, + "step": 1690 + }, + { + "epoch": 1.9398938459331516, + "grad_norm": 17.125, + "learning_rate": 1.36085626911315e-05, + "loss": 0.2441, + "step": 1691 + }, + { + "epoch": 1.9410414574666475, + "grad_norm": 31.625, + "learning_rate": 1.3603465851172276e-05, + "loss": 0.5579, + "step": 1692 + }, + { + "epoch": 1.9421890690001433, + "grad_norm": 29.75, + "learning_rate": 1.3598369011213048e-05, + "loss": 0.8088, + "step": 1693 + }, + { + "epoch": 1.9433366805336394, + "grad_norm": 51.5, + "learning_rate": 1.3593272171253823e-05, + "loss": 0.6118, + "step": 1694 + }, + { + "epoch": 1.9444842920671352, + "grad_norm": 27.875, + "learning_rate": 1.3588175331294597e-05, + "loss": 0.2742, + "step": 1695 + }, + { + "epoch": 1.9456319036006313, + "grad_norm": 11.8125, + "learning_rate": 1.3583078491335373e-05, + "loss": 0.2417, + "step": 1696 + }, + { + "epoch": 1.9467795151341272, + "grad_norm": 30.75, + "learning_rate": 1.3577981651376149e-05, + "loss": 0.236, + "step": 1697 + }, + { + "epoch": 1.947927126667623, + "grad_norm": 28.5, + "learning_rate": 1.3572884811416922e-05, + "loss": 0.299, + "step": 1698 + }, + { + "epoch": 1.9490747382011189, + "grad_norm": 15.125, + "learning_rate": 1.3567787971457698e-05, + "loss": 0.1027, + "step": 1699 + }, + { + "epoch": 1.9502223497346147, + "grad_norm": 35.25, + "learning_rate": 1.356269113149847e-05, + "loss": 0.2869, + "step": 1700 + }, + { + "epoch": 1.9502223497346147, + "eval_accuracy": 0.72, + "eval_loss": 0.4552258551120758, + "eval_runtime": 49.3148, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 2.028, + "step": 1700 + }, + { + "epoch": 1.9513699612681108, + "grad_norm": 27.875, + "learning_rate": 1.3557594291539246e-05, + "loss": 0.9343, + "step": 1701 + }, + { + "epoch": 1.9525175728016066, + "grad_norm": 31.25, + "learning_rate": 1.3552497451580021e-05, + "loss": 0.5365, + "step": 1702 + }, + { + "epoch": 1.9536651843351027, + "grad_norm": 56.25, + "learning_rate": 1.3547400611620795e-05, + "loss": 0.6064, + "step": 1703 + }, + { + "epoch": 1.9548127958685986, + "grad_norm": 8.875, + "learning_rate": 1.3542303771661571e-05, + "loss": 0.2503, + "step": 1704 + }, + { + "epoch": 1.9559604074020944, + "grad_norm": 23.25, + "learning_rate": 1.3537206931702345e-05, + "loss": 0.6551, + "step": 1705 + }, + { + "epoch": 1.9571080189355903, + "grad_norm": 26.75, + "learning_rate": 1.353211009174312e-05, + "loss": 0.4402, + "step": 1706 + }, + { + "epoch": 1.9582556304690861, + "grad_norm": 19.75, + "learning_rate": 1.3527013251783896e-05, + "loss": 0.5219, + "step": 1707 + }, + { + "epoch": 1.959403242002582, + "grad_norm": 109.0, + "learning_rate": 1.352191641182467e-05, + "loss": 0.698, + "step": 1708 + }, + { + "epoch": 1.960550853536078, + "grad_norm": 81.5, + "learning_rate": 1.3516819571865446e-05, + "loss": 0.5249, + "step": 1709 + }, + { + "epoch": 1.961698465069574, + "grad_norm": 29.125, + "learning_rate": 1.3511722731906218e-05, + "loss": 0.5226, + "step": 1710 + }, + { + "epoch": 1.96284607660307, + "grad_norm": 59.75, + "learning_rate": 1.3506625891946993e-05, + "loss": 0.6405, + "step": 1711 + }, + { + "epoch": 1.9639936881365658, + "grad_norm": 21.5, + "learning_rate": 1.3501529051987769e-05, + "loss": 0.4448, + "step": 1712 + }, + { + "epoch": 1.9651412996700617, + "grad_norm": 34.75, + "learning_rate": 1.3496432212028543e-05, + "loss": 0.667, + "step": 1713 + }, + { + "epoch": 1.9662889112035575, + "grad_norm": 15.125, + "learning_rate": 1.3491335372069319e-05, + "loss": 0.4765, + "step": 1714 + }, + { + "epoch": 1.9674365227370534, + "grad_norm": 31.0, + "learning_rate": 1.3486238532110092e-05, + "loss": 0.2273, + "step": 1715 + }, + { + "epoch": 1.9685841342705495, + "grad_norm": 20.75, + "learning_rate": 1.3481141692150868e-05, + "loss": 0.3604, + "step": 1716 + }, + { + "epoch": 1.9697317458040453, + "grad_norm": 39.5, + "learning_rate": 1.3476044852191644e-05, + "loss": 0.4167, + "step": 1717 + }, + { + "epoch": 1.9708793573375414, + "grad_norm": 28.5, + "learning_rate": 1.3470948012232416e-05, + "loss": 0.4476, + "step": 1718 + }, + { + "epoch": 1.9720269688710372, + "grad_norm": 19.25, + "learning_rate": 1.3465851172273191e-05, + "loss": 0.5297, + "step": 1719 + }, + { + "epoch": 1.973174580404533, + "grad_norm": 65.0, + "learning_rate": 1.3460754332313965e-05, + "loss": 0.8327, + "step": 1720 + }, + { + "epoch": 1.974322191938029, + "grad_norm": 23.875, + "learning_rate": 1.3455657492354741e-05, + "loss": 0.1996, + "step": 1721 + }, + { + "epoch": 1.9754698034715248, + "grad_norm": 23.0, + "learning_rate": 1.3450560652395517e-05, + "loss": 0.4416, + "step": 1722 + }, + { + "epoch": 1.9766174150050206, + "grad_norm": 11.4375, + "learning_rate": 1.344546381243629e-05, + "loss": 0.2721, + "step": 1723 + }, + { + "epoch": 1.9777650265385167, + "grad_norm": 35.0, + "learning_rate": 1.3440366972477066e-05, + "loss": 0.5629, + "step": 1724 + }, + { + "epoch": 1.9789126380720128, + "grad_norm": 67.5, + "learning_rate": 1.343527013251784e-05, + "loss": 0.6305, + "step": 1725 + }, + { + "epoch": 1.9800602496055086, + "grad_norm": 32.75, + "learning_rate": 1.3430173292558616e-05, + "loss": 0.2927, + "step": 1726 + }, + { + "epoch": 1.9812078611390045, + "grad_norm": 35.25, + "learning_rate": 1.3425076452599391e-05, + "loss": 0.238, + "step": 1727 + }, + { + "epoch": 1.9823554726725003, + "grad_norm": 20.875, + "learning_rate": 1.3419979612640163e-05, + "loss": 0.4392, + "step": 1728 + }, + { + "epoch": 1.9835030842059962, + "grad_norm": 44.0, + "learning_rate": 1.3414882772680939e-05, + "loss": 0.398, + "step": 1729 + }, + { + "epoch": 1.984650695739492, + "grad_norm": 26.0, + "learning_rate": 1.3409785932721713e-05, + "loss": 0.7501, + "step": 1730 + }, + { + "epoch": 1.9857983072729881, + "grad_norm": 20.75, + "learning_rate": 1.3404689092762488e-05, + "loss": 0.3494, + "step": 1731 + }, + { + "epoch": 1.986945918806484, + "grad_norm": 30.875, + "learning_rate": 1.3399592252803264e-05, + "loss": 1.1064, + "step": 1732 + }, + { + "epoch": 1.98809353033998, + "grad_norm": 30.0, + "learning_rate": 1.3394495412844038e-05, + "loss": 0.6117, + "step": 1733 + }, + { + "epoch": 1.989241141873476, + "grad_norm": 16.875, + "learning_rate": 1.3389398572884814e-05, + "loss": 0.3173, + "step": 1734 + }, + { + "epoch": 1.9903887534069717, + "grad_norm": 15.75, + "learning_rate": 1.3384301732925586e-05, + "loss": 0.4467, + "step": 1735 + }, + { + "epoch": 1.9915363649404676, + "grad_norm": 49.0, + "learning_rate": 1.3379204892966361e-05, + "loss": 0.7462, + "step": 1736 + }, + { + "epoch": 1.9926839764739634, + "grad_norm": 22.25, + "learning_rate": 1.3374108053007137e-05, + "loss": 0.4648, + "step": 1737 + }, + { + "epoch": 1.9938315880074595, + "grad_norm": 41.25, + "learning_rate": 1.3369011213047911e-05, + "loss": 0.2781, + "step": 1738 + }, + { + "epoch": 1.9949791995409554, + "grad_norm": 22.625, + "learning_rate": 1.3363914373088686e-05, + "loss": 0.6798, + "step": 1739 + }, + { + "epoch": 1.9961268110744514, + "grad_norm": 61.5, + "learning_rate": 1.335881753312946e-05, + "loss": 0.4519, + "step": 1740 + }, + { + "epoch": 1.9972744226079473, + "grad_norm": 22.375, + "learning_rate": 1.3353720693170236e-05, + "loss": 0.7196, + "step": 1741 + }, + { + "epoch": 1.9984220341414431, + "grad_norm": 9.9375, + "learning_rate": 1.3348623853211012e-05, + "loss": 0.2518, + "step": 1742 + }, + { + "epoch": 1.999569645674939, + "grad_norm": 24.0, + "learning_rate": 1.3343527013251785e-05, + "loss": 0.5652, + "step": 1743 + }, + { + "epoch": 2.0, + "grad_norm": 43.0, + "learning_rate": 1.3338430173292561e-05, + "loss": 0.2228, + "step": 1744 + }, + { + "epoch": 2.001147611533496, + "grad_norm": 12.75, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2022, + "step": 1745 + }, + { + "epoch": 2.0022952230669917, + "grad_norm": 38.5, + "learning_rate": 1.3328236493374109e-05, + "loss": 0.5685, + "step": 1746 + }, + { + "epoch": 2.0034428346004876, + "grad_norm": 17.875, + "learning_rate": 1.3323139653414884e-05, + "loss": 0.4231, + "step": 1747 + }, + { + "epoch": 2.004590446133984, + "grad_norm": 30.125, + "learning_rate": 1.3318042813455658e-05, + "loss": 0.4095, + "step": 1748 + }, + { + "epoch": 2.0057380576674797, + "grad_norm": 9.125, + "learning_rate": 1.3312945973496434e-05, + "loss": 0.2246, + "step": 1749 + }, + { + "epoch": 2.0068856692009756, + "grad_norm": 13.3125, + "learning_rate": 1.3307849133537208e-05, + "loss": 0.3579, + "step": 1750 + }, + { + "epoch": 2.0080332807344714, + "grad_norm": 43.25, + "learning_rate": 1.3302752293577984e-05, + "loss": 0.2611, + "step": 1751 + }, + { + "epoch": 2.0091808922679673, + "grad_norm": 29.125, + "learning_rate": 1.3297655453618759e-05, + "loss": 0.1953, + "step": 1752 + }, + { + "epoch": 2.010328503801463, + "grad_norm": 25.625, + "learning_rate": 1.3292558613659531e-05, + "loss": 0.3513, + "step": 1753 + }, + { + "epoch": 2.011476115334959, + "grad_norm": 16.5, + "learning_rate": 1.3287461773700307e-05, + "loss": 0.2294, + "step": 1754 + }, + { + "epoch": 2.012623726868455, + "grad_norm": 7.34375, + "learning_rate": 1.328236493374108e-05, + "loss": 0.1434, + "step": 1755 + }, + { + "epoch": 2.013771338401951, + "grad_norm": 18.625, + "learning_rate": 1.3277268093781856e-05, + "loss": 0.5704, + "step": 1756 + }, + { + "epoch": 2.014918949935447, + "grad_norm": 47.25, + "learning_rate": 1.3272171253822632e-05, + "loss": 0.4283, + "step": 1757 + }, + { + "epoch": 2.016066561468943, + "grad_norm": 9.6875, + "learning_rate": 1.3267074413863406e-05, + "loss": 0.2391, + "step": 1758 + }, + { + "epoch": 2.0172141730024387, + "grad_norm": 34.5, + "learning_rate": 1.3261977573904182e-05, + "loss": 0.248, + "step": 1759 + }, + { + "epoch": 2.0183617845359345, + "grad_norm": 11.9375, + "learning_rate": 1.3256880733944954e-05, + "loss": 0.2023, + "step": 1760 + }, + { + "epoch": 2.0195093960694304, + "grad_norm": 27.375, + "learning_rate": 1.325178389398573e-05, + "loss": 0.261, + "step": 1761 + }, + { + "epoch": 2.020657007602926, + "grad_norm": 10.875, + "learning_rate": 1.3246687054026503e-05, + "loss": 0.224, + "step": 1762 + }, + { + "epoch": 2.0218046191364225, + "grad_norm": 24.375, + "learning_rate": 1.3241590214067279e-05, + "loss": 0.2463, + "step": 1763 + }, + { + "epoch": 2.0229522306699184, + "grad_norm": 18.5, + "learning_rate": 1.3236493374108054e-05, + "loss": 0.2944, + "step": 1764 + }, + { + "epoch": 2.024099842203414, + "grad_norm": 24.0, + "learning_rate": 1.3231396534148828e-05, + "loss": 0.315, + "step": 1765 + }, + { + "epoch": 2.02524745373691, + "grad_norm": 54.0, + "learning_rate": 1.3226299694189604e-05, + "loss": 0.3818, + "step": 1766 + }, + { + "epoch": 2.026395065270406, + "grad_norm": 18.5, + "learning_rate": 1.3221202854230378e-05, + "loss": 0.3524, + "step": 1767 + }, + { + "epoch": 2.0275426768039018, + "grad_norm": 19.75, + "learning_rate": 1.3216106014271153e-05, + "loss": 0.3522, + "step": 1768 + }, + { + "epoch": 2.0286902883373976, + "grad_norm": 35.0, + "learning_rate": 1.3211009174311929e-05, + "loss": 0.3279, + "step": 1769 + }, + { + "epoch": 2.029837899870894, + "grad_norm": 56.75, + "learning_rate": 1.3205912334352701e-05, + "loss": 1.3613, + "step": 1770 + }, + { + "epoch": 2.0309855114043898, + "grad_norm": 28.0, + "learning_rate": 1.3200815494393477e-05, + "loss": 0.2122, + "step": 1771 + }, + { + "epoch": 2.0321331229378856, + "grad_norm": 37.0, + "learning_rate": 1.319571865443425e-05, + "loss": 0.1997, + "step": 1772 + }, + { + "epoch": 2.0332807344713815, + "grad_norm": 6.0, + "learning_rate": 1.3190621814475026e-05, + "loss": 0.0679, + "step": 1773 + }, + { + "epoch": 2.0344283460048773, + "grad_norm": 16.75, + "learning_rate": 1.3185524974515802e-05, + "loss": 0.3065, + "step": 1774 + }, + { + "epoch": 2.035575957538373, + "grad_norm": 33.5, + "learning_rate": 1.3180428134556576e-05, + "loss": 0.2069, + "step": 1775 + }, + { + "epoch": 2.036723569071869, + "grad_norm": 27.25, + "learning_rate": 1.3175331294597351e-05, + "loss": 0.2496, + "step": 1776 + }, + { + "epoch": 2.037871180605365, + "grad_norm": 29.25, + "learning_rate": 1.3170234454638124e-05, + "loss": 0.3496, + "step": 1777 + }, + { + "epoch": 2.039018792138861, + "grad_norm": 22.375, + "learning_rate": 1.31651376146789e-05, + "loss": 0.302, + "step": 1778 + }, + { + "epoch": 2.040166403672357, + "grad_norm": 30.375, + "learning_rate": 1.3160040774719675e-05, + "loss": 0.5632, + "step": 1779 + }, + { + "epoch": 2.041314015205853, + "grad_norm": 61.0, + "learning_rate": 1.3154943934760449e-05, + "loss": 0.3705, + "step": 1780 + }, + { + "epoch": 2.0424616267393487, + "grad_norm": 9.1875, + "learning_rate": 1.3149847094801224e-05, + "loss": 0.1027, + "step": 1781 + }, + { + "epoch": 2.0436092382728446, + "grad_norm": 13.75, + "learning_rate": 1.3144750254841998e-05, + "loss": 0.0892, + "step": 1782 + }, + { + "epoch": 2.0447568498063404, + "grad_norm": 25.0, + "learning_rate": 1.3139653414882774e-05, + "loss": 0.6006, + "step": 1783 + }, + { + "epoch": 2.0459044613398363, + "grad_norm": 11.0625, + "learning_rate": 1.313455657492355e-05, + "loss": 0.1804, + "step": 1784 + }, + { + "epoch": 2.0470520728733326, + "grad_norm": 32.75, + "learning_rate": 1.3129459734964323e-05, + "loss": 0.3527, + "step": 1785 + }, + { + "epoch": 2.0481996844068284, + "grad_norm": 98.5, + "learning_rate": 1.3124362895005099e-05, + "loss": 0.8699, + "step": 1786 + }, + { + "epoch": 2.0493472959403243, + "grad_norm": 40.5, + "learning_rate": 1.3119266055045871e-05, + "loss": 0.193, + "step": 1787 + }, + { + "epoch": 2.05049490747382, + "grad_norm": 27.75, + "learning_rate": 1.3114169215086647e-05, + "loss": 0.3553, + "step": 1788 + }, + { + "epoch": 2.051642519007316, + "grad_norm": 14.4375, + "learning_rate": 1.3109072375127422e-05, + "loss": 0.1813, + "step": 1789 + }, + { + "epoch": 2.052790130540812, + "grad_norm": 17.75, + "learning_rate": 1.3103975535168196e-05, + "loss": 0.0898, + "step": 1790 + }, + { + "epoch": 2.0539377420743077, + "grad_norm": 9.25, + "learning_rate": 1.3098878695208972e-05, + "loss": 0.1149, + "step": 1791 + }, + { + "epoch": 2.0550853536078035, + "grad_norm": 46.0, + "learning_rate": 1.3093781855249746e-05, + "loss": 0.2937, + "step": 1792 + }, + { + "epoch": 2.0562329651413, + "grad_norm": 17.125, + "learning_rate": 1.3088685015290521e-05, + "loss": 0.3886, + "step": 1793 + }, + { + "epoch": 2.0573805766747957, + "grad_norm": 25.875, + "learning_rate": 1.3083588175331297e-05, + "loss": 0.2858, + "step": 1794 + }, + { + "epoch": 2.0585281882082915, + "grad_norm": 42.5, + "learning_rate": 1.307849133537207e-05, + "loss": 0.6463, + "step": 1795 + }, + { + "epoch": 2.0596757997417874, + "grad_norm": 97.5, + "learning_rate": 1.3073394495412845e-05, + "loss": 0.9309, + "step": 1796 + }, + { + "epoch": 2.0608234112752832, + "grad_norm": 27.25, + "learning_rate": 1.3068297655453619e-05, + "loss": 0.3763, + "step": 1797 + }, + { + "epoch": 2.061971022808779, + "grad_norm": 137.0, + "learning_rate": 1.3063200815494394e-05, + "loss": 1.1044, + "step": 1798 + }, + { + "epoch": 2.063118634342275, + "grad_norm": 12.1875, + "learning_rate": 1.305810397553517e-05, + "loss": 0.1574, + "step": 1799 + }, + { + "epoch": 2.0642662458757712, + "grad_norm": 22.875, + "learning_rate": 1.3053007135575944e-05, + "loss": 0.1174, + "step": 1800 + }, + { + "epoch": 2.0642662458757712, + "eval_accuracy": 0.74, + "eval_loss": 0.4835154712200165, + "eval_runtime": 49.2987, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 2.028, + "step": 1800 + } + ], + "logging_steps": 1, + "max_steps": 4360, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.358825065150048e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}