{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5728105382319644, "eval_steps": 250, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005145621076463929, "grad_norm": 10.373374938964844, "learning_rate": 9.600000000000001e-06, "loss": 4.8488, "step": 50 }, { "epoch": 0.010291242152927859, "grad_norm": 11.32767391204834, "learning_rate": 1.9600000000000002e-05, "loss": 2.2402, "step": 100 }, { "epoch": 0.015436863229391787, "grad_norm": 7.465405464172363, "learning_rate": 1.9990110229731125e-05, "loss": 1.8204, "step": 150 }, { "epoch": 0.020582484305855717, "grad_norm": 11.595544815063477, "learning_rate": 1.9979808385701043e-05, "loss": 1.7765, "step": 200 }, { "epoch": 0.025728105382319646, "grad_norm": 15.59911060333252, "learning_rate": 1.996950654167096e-05, "loss": 1.7482, "step": 250 }, { "epoch": 0.025728105382319646, "eval_accuracy": 0.9227184653282166, "eval_loss": 1.1625392436981201, "eval_runtime": 418.704, "eval_samples_per_second": 82.514, "eval_steps_per_second": 2.579, "step": 250 }, { "epoch": 0.030873726458783574, "grad_norm": 7.875217914581299, "learning_rate": 1.995920469764088e-05, "loss": 1.7821, "step": 300 }, { "epoch": 0.036019347535247506, "grad_norm": 11.855839729309082, "learning_rate": 1.99489028536108e-05, "loss": 1.6761, "step": 350 }, { "epoch": 0.041164968611711435, "grad_norm": 7.26309061050415, "learning_rate": 1.9938601009580717e-05, "loss": 1.4887, "step": 400 }, { "epoch": 0.04631058968817536, "grad_norm": 8.920960426330566, "learning_rate": 1.9928299165550635e-05, "loss": 1.6001, "step": 450 }, { "epoch": 0.05145621076463929, "grad_norm": 7.441282749176025, "learning_rate": 1.9917997321520554e-05, "loss": 1.7426, "step": 500 }, { "epoch": 0.05145621076463929, "eval_accuracy": 0.9316622614860535, "eval_loss": 1.1087791919708252, "eval_runtime": 417.9867, "eval_samples_per_second": 82.656, "eval_steps_per_second": 2.584, "step": 500 }, { "epoch": 0.05660183184110322, "grad_norm": 5.284599781036377, "learning_rate": 1.9907695477490472e-05, "loss": 1.5562, "step": 550 }, { "epoch": 0.06174745291756715, "grad_norm": 9.429953575134277, "learning_rate": 1.989739363346039e-05, "loss": 1.6811, "step": 600 }, { "epoch": 0.06689307399403108, "grad_norm": 13.159565925598145, "learning_rate": 1.988709178943031e-05, "loss": 1.5994, "step": 650 }, { "epoch": 0.07203869507049501, "grad_norm": 6.322610378265381, "learning_rate": 1.9876789945400228e-05, "loss": 1.5981, "step": 700 }, { "epoch": 0.07718431614695893, "grad_norm": 13.995536804199219, "learning_rate": 1.9866488101370146e-05, "loss": 1.5713, "step": 750 }, { "epoch": 0.07718431614695893, "eval_accuracy": 0.9368722438812256, "eval_loss": 1.0817046165466309, "eval_runtime": 418.366, "eval_samples_per_second": 82.581, "eval_steps_per_second": 2.581, "step": 750 }, { "epoch": 0.08232993722342287, "grad_norm": 10.23471450805664, "learning_rate": 1.9856186257340068e-05, "loss": 1.6516, "step": 800 }, { "epoch": 0.08747555829988679, "grad_norm": 9.448832511901855, "learning_rate": 1.9845884413309983e-05, "loss": 1.5768, "step": 850 }, { "epoch": 0.09262117937635073, "grad_norm": 9.08662223815918, "learning_rate": 1.9835582569279905e-05, "loss": 1.5902, "step": 900 }, { "epoch": 0.09776680045281465, "grad_norm": 9.396324157714844, "learning_rate": 1.982528072524982e-05, "loss": 1.4613, "step": 950 }, { "epoch": 0.10291242152927858, "grad_norm": 6.668619632720947, "learning_rate": 1.9814978881219738e-05, "loss": 1.6295, "step": 1000 }, { "epoch": 0.10291242152927858, "eval_accuracy": 0.9374222159385681, "eval_loss": 1.0676764249801636, "eval_runtime": 418.3645, "eval_samples_per_second": 82.581, "eval_steps_per_second": 2.581, "step": 1000 }, { "epoch": 0.10805804260574252, "grad_norm": 12.718502044677734, "learning_rate": 1.980467703718966e-05, "loss": 1.5301, "step": 1050 }, { "epoch": 0.11320366368220644, "grad_norm": 6.540752410888672, "learning_rate": 1.9794375193159575e-05, "loss": 1.6072, "step": 1100 }, { "epoch": 0.11834928475867038, "grad_norm": 11.122970581054688, "learning_rate": 1.9784073349129497e-05, "loss": 1.4644, "step": 1150 }, { "epoch": 0.1234949058351343, "grad_norm": 9.239429473876953, "learning_rate": 1.9773771505099415e-05, "loss": 1.6331, "step": 1200 }, { "epoch": 0.12864052691159822, "grad_norm": 7.708181858062744, "learning_rate": 1.9763469661069334e-05, "loss": 1.5464, "step": 1250 }, { "epoch": 0.12864052691159822, "eval_accuracy": 0.9407797455787659, "eval_loss": 1.0546813011169434, "eval_runtime": 418.2896, "eval_samples_per_second": 82.596, "eval_steps_per_second": 2.582, "step": 1250 }, { "epoch": 0.13378614798806215, "grad_norm": 8.665009498596191, "learning_rate": 1.9753167817039252e-05, "loss": 1.5406, "step": 1300 }, { "epoch": 0.1389317690645261, "grad_norm": 8.5620756149292, "learning_rate": 1.974286597300917e-05, "loss": 1.5471, "step": 1350 }, { "epoch": 0.14407739014099002, "grad_norm": 10.859761238098145, "learning_rate": 1.973256412897909e-05, "loss": 1.6685, "step": 1400 }, { "epoch": 0.14922301121745396, "grad_norm": 6.381153106689453, "learning_rate": 1.9722262284949007e-05, "loss": 1.5644, "step": 1450 }, { "epoch": 0.15436863229391787, "grad_norm": 3.7411134243011475, "learning_rate": 1.9711960440918926e-05, "loss": 1.6587, "step": 1500 }, { "epoch": 0.15436863229391787, "eval_accuracy": 0.9419954419136047, "eval_loss": 1.0589897632598877, "eval_runtime": 418.2875, "eval_samples_per_second": 82.596, "eval_steps_per_second": 2.582, "step": 1500 }, { "epoch": 0.1595142533703818, "grad_norm": 7.128472328186035, "learning_rate": 1.9701658596888844e-05, "loss": 1.5793, "step": 1550 }, { "epoch": 0.16465987444684574, "grad_norm": 7.605306625366211, "learning_rate": 1.9691356752858763e-05, "loss": 1.4877, "step": 1600 }, { "epoch": 0.16980549552330967, "grad_norm": 10.431309700012207, "learning_rate": 1.9681054908828684e-05, "loss": 1.5781, "step": 1650 }, { "epoch": 0.17495111659977358, "grad_norm": 7.39127254486084, "learning_rate": 1.96707530647986e-05, "loss": 1.5081, "step": 1700 }, { "epoch": 0.18009673767623752, "grad_norm": 8.452966690063477, "learning_rate": 1.9660451220768518e-05, "loss": 1.5434, "step": 1750 }, { "epoch": 0.18009673767623752, "eval_accuracy": 0.9395930171012878, "eval_loss": 1.0563884973526, "eval_runtime": 418.2001, "eval_samples_per_second": 82.614, "eval_steps_per_second": 2.582, "step": 1750 }, { "epoch": 0.18524235875270145, "grad_norm": 11.400819778442383, "learning_rate": 1.965014937673844e-05, "loss": 1.4617, "step": 1800 }, { "epoch": 0.1903879798291654, "grad_norm": 2.2198734283447266, "learning_rate": 1.9639847532708355e-05, "loss": 1.4531, "step": 1850 }, { "epoch": 0.1955336009056293, "grad_norm": 9.084458351135254, "learning_rate": 1.9629545688678276e-05, "loss": 1.5713, "step": 1900 }, { "epoch": 0.20067922198209323, "grad_norm": 9.126426696777344, "learning_rate": 1.961924384464819e-05, "loss": 1.5166, "step": 1950 }, { "epoch": 0.20582484305855717, "grad_norm": 18.784996032714844, "learning_rate": 1.9608942000618113e-05, "loss": 1.4771, "step": 2000 }, { "epoch": 0.20582484305855717, "eval_accuracy": 0.943095326423645, "eval_loss": 1.0343589782714844, "eval_runtime": 418.12, "eval_samples_per_second": 82.629, "eval_steps_per_second": 2.583, "step": 2000 }, { "epoch": 0.2109704641350211, "grad_norm": 4.355062961578369, "learning_rate": 1.9598640156588032e-05, "loss": 1.4706, "step": 2050 }, { "epoch": 0.21611608521148504, "grad_norm": 7.846619606018066, "learning_rate": 1.958833831255795e-05, "loss": 1.5276, "step": 2100 }, { "epoch": 0.22126170628794894, "grad_norm": 11.4408597946167, "learning_rate": 1.957803646852787e-05, "loss": 1.4002, "step": 2150 }, { "epoch": 0.22640732736441288, "grad_norm": 10.485089302062988, "learning_rate": 1.9567734624497787e-05, "loss": 1.5605, "step": 2200 }, { "epoch": 0.23155294844087682, "grad_norm": 5.763485431671143, "learning_rate": 1.9557432780467705e-05, "loss": 1.4871, "step": 2250 }, { "epoch": 0.23155294844087682, "eval_accuracy": 0.9440793991088867, "eval_loss": 1.035501480102539, "eval_runtime": 418.1856, "eval_samples_per_second": 82.616, "eval_steps_per_second": 2.583, "step": 2250 }, { "epoch": 0.23669856951734075, "grad_norm": 6.086212635040283, "learning_rate": 1.9547130936437624e-05, "loss": 1.56, "step": 2300 }, { "epoch": 0.24184419059380466, "grad_norm": 10.038729667663574, "learning_rate": 1.9536829092407542e-05, "loss": 1.4322, "step": 2350 }, { "epoch": 0.2469898116702686, "grad_norm": 8.869370460510254, "learning_rate": 1.952652724837746e-05, "loss": 1.4682, "step": 2400 }, { "epoch": 0.25213543274673256, "grad_norm": 9.509527206420898, "learning_rate": 1.951622540434738e-05, "loss": 1.4375, "step": 2450 }, { "epoch": 0.25728105382319644, "grad_norm": 14.525392532348633, "learning_rate": 1.95061295971979e-05, "loss": 1.4499, "step": 2500 }, { "epoch": 0.25728105382319644, "eval_accuracy": 0.9433557987213135, "eval_loss": 1.030641794204712, "eval_runtime": 418.9422, "eval_samples_per_second": 82.467, "eval_steps_per_second": 2.578, "step": 2500 }, { "epoch": 0.2624266748996604, "grad_norm": 10.151646614074707, "learning_rate": 1.949582775316782e-05, "loss": 1.5088, "step": 2550 }, { "epoch": 0.2675722959761243, "grad_norm": 9.49378776550293, "learning_rate": 1.948552590913774e-05, "loss": 1.5577, "step": 2600 }, { "epoch": 0.27271791705258824, "grad_norm": 11.339292526245117, "learning_rate": 1.9475224065107657e-05, "loss": 1.4221, "step": 2650 }, { "epoch": 0.2778635381290522, "grad_norm": 6.203045845031738, "learning_rate": 1.9464922221077572e-05, "loss": 1.5105, "step": 2700 }, { "epoch": 0.2830091592055161, "grad_norm": 8.609308242797852, "learning_rate": 1.9454620377047494e-05, "loss": 1.4681, "step": 2750 }, { "epoch": 0.2830091592055161, "eval_accuracy": 0.9453240633010864, "eval_loss": 1.0219130516052246, "eval_runtime": 418.4116, "eval_samples_per_second": 82.572, "eval_steps_per_second": 2.581, "step": 2750 }, { "epoch": 0.28815478028198005, "grad_norm": 5.019013404846191, "learning_rate": 1.9444318533017412e-05, "loss": 1.4354, "step": 2800 }, { "epoch": 0.293300401358444, "grad_norm": 11.37190055847168, "learning_rate": 1.943401668898733e-05, "loss": 1.4982, "step": 2850 }, { "epoch": 0.2984460224349079, "grad_norm": 4.953094005584717, "learning_rate": 1.942371484495725e-05, "loss": 1.5374, "step": 2900 }, { "epoch": 0.3035916435113718, "grad_norm": 12.19895076751709, "learning_rate": 1.9413413000927167e-05, "loss": 1.4769, "step": 2950 }, { "epoch": 0.30873726458783574, "grad_norm": 6.096263408660889, "learning_rate": 1.9403111156897086e-05, "loss": 1.5767, "step": 3000 }, { "epoch": 0.30873726458783574, "eval_accuracy": 0.9450345635414124, "eval_loss": 1.0167551040649414, "eval_runtime": 419.6576, "eval_samples_per_second": 82.327, "eval_steps_per_second": 2.574, "step": 3000 }, { "epoch": 0.31388288566429967, "grad_norm": 5.119435787200928, "learning_rate": 1.9392809312867004e-05, "loss": 1.3712, "step": 3050 }, { "epoch": 0.3190285067407636, "grad_norm": 3.375337600708008, "learning_rate": 1.9382507468836923e-05, "loss": 1.4979, "step": 3100 }, { "epoch": 0.32417412781722754, "grad_norm": 12.87149715423584, "learning_rate": 1.937220562480684e-05, "loss": 1.4633, "step": 3150 }, { "epoch": 0.3293197488936915, "grad_norm": 8.743680000305176, "learning_rate": 1.936190378077676e-05, "loss": 1.5025, "step": 3200 }, { "epoch": 0.3344653699701554, "grad_norm": 7.334400653839111, "learning_rate": 1.935160193674668e-05, "loss": 1.5206, "step": 3250 }, { "epoch": 0.3344653699701554, "eval_accuracy": 0.9457292556762695, "eval_loss": 1.016142725944519, "eval_runtime": 419.0299, "eval_samples_per_second": 82.45, "eval_steps_per_second": 2.577, "step": 3250 }, { "epoch": 0.33961099104661935, "grad_norm": 15.349682807922363, "learning_rate": 1.9341300092716596e-05, "loss": 1.5119, "step": 3300 }, { "epoch": 0.3447566121230833, "grad_norm": 10.006840705871582, "learning_rate": 1.9330998248686518e-05, "loss": 1.6285, "step": 3350 }, { "epoch": 0.34990223319954716, "grad_norm": 10.248564720153809, "learning_rate": 1.9320696404656437e-05, "loss": 1.4421, "step": 3400 }, { "epoch": 0.3550478542760111, "grad_norm": 10.41620922088623, "learning_rate": 1.931039456062635e-05, "loss": 1.4866, "step": 3450 }, { "epoch": 0.36019347535247503, "grad_norm": 11.682571411132812, "learning_rate": 1.9300092716596273e-05, "loss": 1.4651, "step": 3500 }, { "epoch": 0.36019347535247503, "eval_accuracy": 0.9464818239212036, "eval_loss": 1.0084654092788696, "eval_runtime": 418.3089, "eval_samples_per_second": 82.592, "eval_steps_per_second": 2.582, "step": 3500 }, { "epoch": 0.36533909642893897, "grad_norm": 4.340071678161621, "learning_rate": 1.9289790872566192e-05, "loss": 1.3777, "step": 3550 }, { "epoch": 0.3704847175054029, "grad_norm": 5.201279163360596, "learning_rate": 1.927948902853611e-05, "loss": 1.5256, "step": 3600 }, { "epoch": 0.37563033858186684, "grad_norm": 13.030069351196289, "learning_rate": 1.926918718450603e-05, "loss": 1.358, "step": 3650 }, { "epoch": 0.3807759596583308, "grad_norm": 6.489394664764404, "learning_rate": 1.9259091377356548e-05, "loss": 1.4384, "step": 3700 }, { "epoch": 0.3859215807347947, "grad_norm": 9.361295700073242, "learning_rate": 1.9248789533326466e-05, "loss": 1.4847, "step": 3750 }, { "epoch": 0.3859215807347947, "eval_accuracy": 0.9460765719413757, "eval_loss": 1.0093164443969727, "eval_runtime": 418.9794, "eval_samples_per_second": 82.46, "eval_steps_per_second": 2.578, "step": 3750 }, { "epoch": 0.3910672018112586, "grad_norm": 5.094785213470459, "learning_rate": 1.9238487689296385e-05, "loss": 1.327, "step": 3800 }, { "epoch": 0.3962128228877225, "grad_norm": 2.1664323806762695, "learning_rate": 1.9228185845266307e-05, "loss": 1.4463, "step": 3850 }, { "epoch": 0.40135844396418646, "grad_norm": 2.0584053993225098, "learning_rate": 1.921788400123622e-05, "loss": 1.3179, "step": 3900 }, { "epoch": 0.4065040650406504, "grad_norm": 7.324461936950684, "learning_rate": 1.920758215720614e-05, "loss": 1.4312, "step": 3950 }, { "epoch": 0.41164968611711433, "grad_norm": 13.046509742736816, "learning_rate": 1.9197280313176062e-05, "loss": 1.4179, "step": 4000 }, { "epoch": 0.41164968611711433, "eval_accuracy": 0.9460186958312988, "eval_loss": 1.0144544839859009, "eval_runtime": 418.3617, "eval_samples_per_second": 82.582, "eval_steps_per_second": 2.581, "step": 4000 }, { "epoch": 0.41679530719357827, "grad_norm": 4.131565093994141, "learning_rate": 1.9186978469145977e-05, "loss": 1.4828, "step": 4050 }, { "epoch": 0.4219409282700422, "grad_norm": 14.746273040771484, "learning_rate": 1.91766766251159e-05, "loss": 1.4568, "step": 4100 }, { "epoch": 0.42708654934650614, "grad_norm": 9.327777862548828, "learning_rate": 1.9166374781085817e-05, "loss": 1.4921, "step": 4150 }, { "epoch": 0.4322321704229701, "grad_norm": 11.792791366577148, "learning_rate": 1.9156072937055735e-05, "loss": 1.4485, "step": 4200 }, { "epoch": 0.43737779149943395, "grad_norm": 9.524967193603516, "learning_rate": 1.9145771093025654e-05, "loss": 1.4908, "step": 4250 }, { "epoch": 0.43737779149943395, "eval_accuracy": 0.9477843046188354, "eval_loss": 1.0120937824249268, "eval_runtime": 418.3734, "eval_samples_per_second": 82.579, "eval_steps_per_second": 2.581, "step": 4250 }, { "epoch": 0.4425234125758979, "grad_norm": 6.7903361320495605, "learning_rate": 1.9135469248995572e-05, "loss": 1.295, "step": 4300 }, { "epoch": 0.4476690336523618, "grad_norm": 7.337329387664795, "learning_rate": 1.912516740496549e-05, "loss": 1.4687, "step": 4350 }, { "epoch": 0.45281465472882576, "grad_norm": 8.49622631072998, "learning_rate": 1.911486556093541e-05, "loss": 1.3846, "step": 4400 }, { "epoch": 0.4579602758052897, "grad_norm": 4.385276794433594, "learning_rate": 1.9104563716905328e-05, "loss": 1.4704, "step": 4450 }, { "epoch": 0.46310589688175363, "grad_norm": 9.520790100097656, "learning_rate": 1.9094261872875246e-05, "loss": 1.3646, "step": 4500 }, { "epoch": 0.46310589688175363, "eval_accuracy": 0.9479579925537109, "eval_loss": 1.0055809020996094, "eval_runtime": 419.9183, "eval_samples_per_second": 82.276, "eval_steps_per_second": 2.572, "step": 4500 }, { "epoch": 0.46825151795821757, "grad_norm": 11.194172859191895, "learning_rate": 1.9083960028845164e-05, "loss": 1.4779, "step": 4550 }, { "epoch": 0.4733971390346815, "grad_norm": 8.098811149597168, "learning_rate": 1.9073658184815083e-05, "loss": 1.4581, "step": 4600 }, { "epoch": 0.47854276011114544, "grad_norm": 3.986377477645874, "learning_rate": 1.9063356340785e-05, "loss": 1.3786, "step": 4650 }, { "epoch": 0.4836883811876093, "grad_norm": 7.204378128051758, "learning_rate": 1.9053054496754923e-05, "loss": 1.56, "step": 4700 }, { "epoch": 0.48883400226407325, "grad_norm": 6.332393169403076, "learning_rate": 1.9042752652724838e-05, "loss": 1.4334, "step": 4750 }, { "epoch": 0.48883400226407325, "eval_accuracy": 0.947523832321167, "eval_loss": 1.0032474994659424, "eval_runtime": 418.1822, "eval_samples_per_second": 82.617, "eval_steps_per_second": 2.583, "step": 4750 }, { "epoch": 0.4939796233405372, "grad_norm": 5.785167694091797, "learning_rate": 1.9032450808694756e-05, "loss": 1.3877, "step": 4800 }, { "epoch": 0.4991252444170011, "grad_norm": 2.13838529586792, "learning_rate": 1.9022148964664678e-05, "loss": 1.3485, "step": 4850 }, { "epoch": 0.5042708654934651, "grad_norm": 5.9960618019104, "learning_rate": 1.9011847120634593e-05, "loss": 1.4509, "step": 4900 }, { "epoch": 0.509416486569929, "grad_norm": 3.8960604667663574, "learning_rate": 1.9001545276604515e-05, "loss": 1.3693, "step": 4950 }, { "epoch": 0.5145621076463929, "grad_norm": 5.9818115234375, "learning_rate": 1.8991243432574434e-05, "loss": 1.5226, "step": 5000 }, { "epoch": 0.5145621076463929, "eval_accuracy": 0.9477264285087585, "eval_loss": 0.9975742101669312, "eval_runtime": 418.0863, "eval_samples_per_second": 82.636, "eval_steps_per_second": 2.583, "step": 5000 }, { "epoch": 0.5197077287228569, "grad_norm": 9.811531066894531, "learning_rate": 1.8980941588544352e-05, "loss": 1.4423, "step": 5050 }, { "epoch": 0.5248533497993207, "grad_norm": 4.812816619873047, "learning_rate": 1.897063974451427e-05, "loss": 1.4191, "step": 5100 }, { "epoch": 0.5299989708757847, "grad_norm": 7.36176872253418, "learning_rate": 1.896033790048419e-05, "loss": 1.5109, "step": 5150 }, { "epoch": 0.5351445919522486, "grad_norm": 12.9472017288208, "learning_rate": 1.8950036056454107e-05, "loss": 1.4509, "step": 5200 }, { "epoch": 0.5402902130287126, "grad_norm": 6.96859073638916, "learning_rate": 1.8939734212424026e-05, "loss": 1.4351, "step": 5250 }, { "epoch": 0.5402902130287126, "eval_accuracy": 0.9485947489738464, "eval_loss": 1.000069499015808, "eval_runtime": 418.3411, "eval_samples_per_second": 82.586, "eval_steps_per_second": 2.582, "step": 5250 }, { "epoch": 0.5454358341051765, "grad_norm": 11.015822410583496, "learning_rate": 1.8929432368393944e-05, "loss": 1.3868, "step": 5300 }, { "epoch": 0.5505814551816405, "grad_norm": 7.5497050285339355, "learning_rate": 1.8919130524363863e-05, "loss": 1.4339, "step": 5350 }, { "epoch": 0.5557270762581044, "grad_norm": 11.352765083312988, "learning_rate": 1.890882868033378e-05, "loss": 1.365, "step": 5400 }, { "epoch": 0.5608726973345682, "grad_norm": 10.072178840637207, "learning_rate": 1.88985268363037e-05, "loss": 1.44, "step": 5450 }, { "epoch": 0.5660183184110322, "grad_norm": 5.5806803703308105, "learning_rate": 1.8888224992273618e-05, "loss": 1.2895, "step": 5500 }, { "epoch": 0.5660183184110322, "eval_accuracy": 0.9490578770637512, "eval_loss": 1.0065183639526367, "eval_runtime": 418.105, "eval_samples_per_second": 82.632, "eval_steps_per_second": 2.583, "step": 5500 }, { "epoch": 0.5711639394874961, "grad_norm": 12.096445083618164, "learning_rate": 1.8877923148243536e-05, "loss": 1.4253, "step": 5550 }, { "epoch": 0.5763095605639601, "grad_norm": 6.126964569091797, "learning_rate": 1.8867621304213455e-05, "loss": 1.4438, "step": 5600 }, { "epoch": 0.581455181640424, "grad_norm": 10.5121488571167, "learning_rate": 1.8857319460183373e-05, "loss": 1.3543, "step": 5650 }, { "epoch": 0.586600802716888, "grad_norm": 4.3215227127075195, "learning_rate": 1.8847017616153295e-05, "loss": 1.5587, "step": 5700 }, { "epoch": 0.5917464237933519, "grad_norm": 6.327254295349121, "learning_rate": 1.883671577212321e-05, "loss": 1.342, "step": 5750 }, { "epoch": 0.5917464237933519, "eval_accuracy": 0.9487684369087219, "eval_loss": 0.9927480816841125, "eval_runtime": 418.6843, "eval_samples_per_second": 82.518, "eval_steps_per_second": 2.58, "step": 5750 }, { "epoch": 0.5968920448698158, "grad_norm": 9.713254928588867, "learning_rate": 1.882641392809313e-05, "loss": 1.4503, "step": 5800 }, { "epoch": 0.6020376659462797, "grad_norm": 5.628683090209961, "learning_rate": 1.881611208406305e-05, "loss": 1.4045, "step": 5850 }, { "epoch": 0.6071832870227436, "grad_norm": 11.369056701660156, "learning_rate": 1.8805810240032965e-05, "loss": 1.4092, "step": 5900 }, { "epoch": 0.6123289080992076, "grad_norm": 2.5842366218566895, "learning_rate": 1.8795508396002887e-05, "loss": 1.3318, "step": 5950 }, { "epoch": 0.6174745291756715, "grad_norm": 11.178747177124023, "learning_rate": 1.8785206551972805e-05, "loss": 1.416, "step": 6000 }, { "epoch": 0.6174745291756715, "eval_accuracy": 0.9503603577613831, "eval_loss": 0.9909718632698059, "eval_runtime": 418.5904, "eval_samples_per_second": 82.537, "eval_steps_per_second": 2.58, "step": 6000 }, { "epoch": 0.6226201502521355, "grad_norm": 7.2673115730285645, "learning_rate": 1.8774904707942724e-05, "loss": 1.5132, "step": 6050 }, { "epoch": 0.6277657713285993, "grad_norm": 4.217124938964844, "learning_rate": 1.8764602863912642e-05, "loss": 1.3275, "step": 6100 }, { "epoch": 0.6329113924050633, "grad_norm": 2.112212896347046, "learning_rate": 1.875430101988256e-05, "loss": 1.4595, "step": 6150 }, { "epoch": 0.6380570134815272, "grad_norm": 5.421743392944336, "learning_rate": 1.874399917585248e-05, "loss": 1.5112, "step": 6200 }, { "epoch": 0.6432026345579912, "grad_norm": 9.458545684814453, "learning_rate": 1.8733697331822397e-05, "loss": 1.4435, "step": 6250 }, { "epoch": 0.6432026345579912, "eval_accuracy": 0.9514892101287842, "eval_loss": 0.9927791357040405, "eval_runtime": 418.0438, "eval_samples_per_second": 82.644, "eval_steps_per_second": 2.583, "step": 6250 }, { "epoch": 0.6483482556344551, "grad_norm": 8.746692657470703, "learning_rate": 1.8723395487792316e-05, "loss": 1.4268, "step": 6300 }, { "epoch": 0.653493876710919, "grad_norm": 6.339073657989502, "learning_rate": 1.8713093643762234e-05, "loss": 1.5071, "step": 6350 }, { "epoch": 0.658639497787383, "grad_norm": 10.726541519165039, "learning_rate": 1.8702791799732153e-05, "loss": 1.3817, "step": 6400 }, { "epoch": 0.6637851188638468, "grad_norm": 6.412696361541748, "learning_rate": 1.869248995570207e-05, "loss": 1.5101, "step": 6450 }, { "epoch": 0.6689307399403108, "grad_norm": 8.011473655700684, "learning_rate": 1.868218811167199e-05, "loss": 1.4014, "step": 6500 }, { "epoch": 0.6689307399403108, "eval_accuracy": 0.9489710330963135, "eval_loss": 0.9953876733779907, "eval_runtime": 418.8528, "eval_samples_per_second": 82.485, "eval_steps_per_second": 2.578, "step": 6500 }, { "epoch": 0.6740763610167747, "grad_norm": 2.333108901977539, "learning_rate": 1.867188626764191e-05, "loss": 1.2797, "step": 6550 }, { "epoch": 0.6792219820932387, "grad_norm": 13.239009857177734, "learning_rate": 1.8661584423611826e-05, "loss": 1.3829, "step": 6600 }, { "epoch": 0.6843676031697026, "grad_norm": 7.554291248321533, "learning_rate": 1.8651282579581745e-05, "loss": 1.4907, "step": 6650 }, { "epoch": 0.6895132242461666, "grad_norm": 8.046769142150879, "learning_rate": 1.8640980735551667e-05, "loss": 1.4098, "step": 6700 }, { "epoch": 0.6946588453226304, "grad_norm": 3.5291695594787598, "learning_rate": 1.863067889152158e-05, "loss": 1.482, "step": 6750 }, { "epoch": 0.6946588453226304, "eval_accuracy": 0.9492025971412659, "eval_loss": 0.9936777949333191, "eval_runtime": 418.0424, "eval_samples_per_second": 82.645, "eval_steps_per_second": 2.583, "step": 6750 }, { "epoch": 0.6998044663990943, "grad_norm": 5.854330539703369, "learning_rate": 1.8620377047491503e-05, "loss": 1.3779, "step": 6800 }, { "epoch": 0.7049500874755583, "grad_norm": 9.476693153381348, "learning_rate": 1.8610075203461422e-05, "loss": 1.3791, "step": 6850 }, { "epoch": 0.7100957085520222, "grad_norm": 9.646202087402344, "learning_rate": 1.859977335943134e-05, "loss": 1.5183, "step": 6900 }, { "epoch": 0.7152413296284862, "grad_norm": 3.42673397064209, "learning_rate": 1.858947151540126e-05, "loss": 1.4022, "step": 6950 }, { "epoch": 0.7203869507049501, "grad_norm": 9.239468574523926, "learning_rate": 1.8579169671371177e-05, "loss": 1.544, "step": 7000 }, { "epoch": 0.7203869507049501, "eval_accuracy": 0.9508234858512878, "eval_loss": 0.9934782385826111, "eval_runtime": 418.095, "eval_samples_per_second": 82.634, "eval_steps_per_second": 2.583, "step": 7000 }, { "epoch": 0.7255325717814141, "grad_norm": 5.876420021057129, "learning_rate": 1.8568867827341096e-05, "loss": 1.4566, "step": 7050 }, { "epoch": 0.7306781928578779, "grad_norm": 2.191608190536499, "learning_rate": 1.8558565983311014e-05, "loss": 1.4641, "step": 7100 }, { "epoch": 0.7358238139343419, "grad_norm": 10.467001914978027, "learning_rate": 1.8548264139280932e-05, "loss": 1.4208, "step": 7150 }, { "epoch": 0.7409694350108058, "grad_norm": 9.560342788696289, "learning_rate": 1.853796229525085e-05, "loss": 1.3391, "step": 7200 }, { "epoch": 0.7461150560872697, "grad_norm": 10.074899673461914, "learning_rate": 1.852766045122077e-05, "loss": 1.5002, "step": 7250 }, { "epoch": 0.7461150560872697, "eval_accuracy": 0.9496946334838867, "eval_loss": 0.9860528707504272, "eval_runtime": 418.2736, "eval_samples_per_second": 82.599, "eval_steps_per_second": 2.582, "step": 7250 }, { "epoch": 0.7512606771637337, "grad_norm": 5.140987873077393, "learning_rate": 1.851735860719069e-05, "loss": 1.2985, "step": 7300 }, { "epoch": 0.7564062982401976, "grad_norm": 4.276757717132568, "learning_rate": 1.8507056763160606e-05, "loss": 1.5496, "step": 7350 }, { "epoch": 0.7615519193166616, "grad_norm": 8.268556594848633, "learning_rate": 1.8496754919130528e-05, "loss": 1.5046, "step": 7400 }, { "epoch": 0.7666975403931254, "grad_norm": 7.343358516693115, "learning_rate": 1.8486453075100443e-05, "loss": 1.3687, "step": 7450 }, { "epoch": 0.7718431614695894, "grad_norm": 5.345001220703125, "learning_rate": 1.847615123107036e-05, "loss": 1.3841, "step": 7500 }, { "epoch": 0.7718431614695894, "eval_accuracy": 0.9501287937164307, "eval_loss": 0.9868325591087341, "eval_runtime": 418.192, "eval_samples_per_second": 82.615, "eval_steps_per_second": 2.583, "step": 7500 }, { "epoch": 0.7769887825460533, "grad_norm": 11.624256134033203, "learning_rate": 1.8465849387040283e-05, "loss": 1.3996, "step": 7550 }, { "epoch": 0.7821344036225172, "grad_norm": 6.849825859069824, "learning_rate": 1.8455547543010198e-05, "loss": 1.5112, "step": 7600 }, { "epoch": 0.7872800246989812, "grad_norm": 9.704992294311523, "learning_rate": 1.844524569898012e-05, "loss": 1.4335, "step": 7650 }, { "epoch": 0.792425645775445, "grad_norm": 5.669846534729004, "learning_rate": 1.843494385495004e-05, "loss": 1.3867, "step": 7700 }, { "epoch": 0.797571266851909, "grad_norm": 6.519596099853516, "learning_rate": 1.8424642010919957e-05, "loss": 1.3865, "step": 7750 }, { "epoch": 0.797571266851909, "eval_accuracy": 0.9511418342590332, "eval_loss": 0.986303448677063, "eval_runtime": 418.5501, "eval_samples_per_second": 82.544, "eval_steps_per_second": 2.58, "step": 7750 }, { "epoch": 0.8027168879283729, "grad_norm": 7.500890731811523, "learning_rate": 1.8414340166889875e-05, "loss": 1.4039, "step": 7800 }, { "epoch": 0.8078625090048369, "grad_norm": 8.141263961791992, "learning_rate": 1.8404038322859794e-05, "loss": 1.379, "step": 7850 }, { "epoch": 0.8130081300813008, "grad_norm": 8.75843620300293, "learning_rate": 1.8393736478829712e-05, "loss": 1.3459, "step": 7900 }, { "epoch": 0.8181537511577648, "grad_norm": 9.22071647644043, "learning_rate": 1.838343463479963e-05, "loss": 1.3996, "step": 7950 }, { "epoch": 0.8232993722342287, "grad_norm": 5.6009345054626465, "learning_rate": 1.837313279076955e-05, "loss": 1.4151, "step": 8000 }, { "epoch": 0.8232993722342287, "eval_accuracy": 0.9510839581489563, "eval_loss": 0.9821743369102478, "eval_runtime": 418.1692, "eval_samples_per_second": 82.62, "eval_steps_per_second": 2.583, "step": 8000 }, { "epoch": 0.8284449933106925, "grad_norm": 6.377861976623535, "learning_rate": 1.8362830946739467e-05, "loss": 1.3745, "step": 8050 }, { "epoch": 0.8335906143871565, "grad_norm": 6.7617902755737305, "learning_rate": 1.8352529102709386e-05, "loss": 1.4404, "step": 8100 }, { "epoch": 0.8387362354636204, "grad_norm": 10.52645492553711, "learning_rate": 1.8342227258679308e-05, "loss": 1.4776, "step": 8150 }, { "epoch": 0.8438818565400844, "grad_norm": 7.829946517944336, "learning_rate": 1.8331925414649223e-05, "loss": 1.398, "step": 8200 }, { "epoch": 0.8490274776165483, "grad_norm": 6.536490440368652, "learning_rate": 1.832162357061914e-05, "loss": 1.4482, "step": 8250 }, { "epoch": 0.8490274776165483, "eval_accuracy": 0.9505919218063354, "eval_loss": 0.9802690744400024, "eval_runtime": 417.9193, "eval_samples_per_second": 82.669, "eval_steps_per_second": 2.584, "step": 8250 }, { "epoch": 0.8541730986930123, "grad_norm": 8.002507209777832, "learning_rate": 1.8311321726589063e-05, "loss": 1.4551, "step": 8300 }, { "epoch": 0.8593187197694762, "grad_norm": 10.97170352935791, "learning_rate": 1.8301019882558978e-05, "loss": 1.46, "step": 8350 }, { "epoch": 0.8644643408459401, "grad_norm": 9.144811630249023, "learning_rate": 1.82907180385289e-05, "loss": 1.5179, "step": 8400 }, { "epoch": 0.869609961922404, "grad_norm": 11.398577690124512, "learning_rate": 1.8280416194498818e-05, "loss": 1.4067, "step": 8450 }, { "epoch": 0.8747555829988679, "grad_norm": 8.858057022094727, "learning_rate": 1.8270320387349337e-05, "loss": 1.4393, "step": 8500 }, { "epoch": 0.8747555829988679, "eval_accuracy": 0.9503893256187439, "eval_loss": 0.9808804392814636, "eval_runtime": 418.2126, "eval_samples_per_second": 82.611, "eval_steps_per_second": 2.582, "step": 8500 }, { "epoch": 0.8799012040753319, "grad_norm": 9.175712585449219, "learning_rate": 1.8260018543319256e-05, "loss": 1.4995, "step": 8550 }, { "epoch": 0.8850468251517958, "grad_norm": 9.636043548583984, "learning_rate": 1.8249716699289174e-05, "loss": 1.4077, "step": 8600 }, { "epoch": 0.8901924462282598, "grad_norm": 8.578084945678711, "learning_rate": 1.8239414855259093e-05, "loss": 1.4088, "step": 8650 }, { "epoch": 0.8953380673047237, "grad_norm": 7.253017425537109, "learning_rate": 1.822911301122901e-05, "loss": 1.3464, "step": 8700 }, { "epoch": 0.9004836883811876, "grad_norm": 8.55578899383545, "learning_rate": 1.821881116719893e-05, "loss": 1.3455, "step": 8750 }, { "epoch": 0.9004836883811876, "eval_accuracy": 0.9506208300590515, "eval_loss": 0.9797450304031372, "eval_runtime": 418.1418, "eval_samples_per_second": 82.625, "eval_steps_per_second": 2.583, "step": 8750 }, { "epoch": 0.9056293094576515, "grad_norm": 9.603639602661133, "learning_rate": 1.8208509323168848e-05, "loss": 1.5172, "step": 8800 }, { "epoch": 0.9107749305341155, "grad_norm": 5.811156272888184, "learning_rate": 1.8198207479138766e-05, "loss": 1.3922, "step": 8850 }, { "epoch": 0.9159205516105794, "grad_norm": 7.18412971496582, "learning_rate": 1.8187905635108688e-05, "loss": 1.3645, "step": 8900 }, { "epoch": 0.9210661726870433, "grad_norm": 10.653360366821289, "learning_rate": 1.8177603791078603e-05, "loss": 1.3627, "step": 8950 }, { "epoch": 0.9262117937635073, "grad_norm": 9.01271915435791, "learning_rate": 1.8167301947048525e-05, "loss": 1.3896, "step": 9000 }, { "epoch": 0.9262117937635073, "eval_accuracy": 0.9506497979164124, "eval_loss": 0.9806250929832458, "eval_runtime": 417.4847, "eval_samples_per_second": 82.755, "eval_steps_per_second": 2.587, "step": 9000 }, { "epoch": 0.9313574148399711, "grad_norm": 6.072149276733398, "learning_rate": 1.8157000103018443e-05, "loss": 1.433, "step": 9050 }, { "epoch": 0.9365030359164351, "grad_norm": 5.18344783782959, "learning_rate": 1.814669825898836e-05, "loss": 1.4678, "step": 9100 }, { "epoch": 0.941648656992899, "grad_norm": 12.650690078735352, "learning_rate": 1.813639641495828e-05, "loss": 1.3206, "step": 9150 }, { "epoch": 0.946794278069363, "grad_norm": 4.13425350189209, "learning_rate": 1.8126094570928195e-05, "loss": 1.4589, "step": 9200 }, { "epoch": 0.9519398991458269, "grad_norm": 9.408120155334473, "learning_rate": 1.8115792726898117e-05, "loss": 1.3494, "step": 9250 }, { "epoch": 0.9519398991458269, "eval_accuracy": 0.9509103298187256, "eval_loss": 0.9760673642158508, "eval_runtime": 418.3905, "eval_samples_per_second": 82.576, "eval_steps_per_second": 2.581, "step": 9250 }, { "epoch": 0.9570855202222909, "grad_norm": 8.437677383422852, "learning_rate": 1.8105490882868035e-05, "loss": 1.3768, "step": 9300 }, { "epoch": 0.9622311412987548, "grad_norm": 5.862843990325928, "learning_rate": 1.8095189038837954e-05, "loss": 1.4449, "step": 9350 }, { "epoch": 0.9673767623752186, "grad_norm": 5.639468193054199, "learning_rate": 1.8084887194807872e-05, "loss": 1.4187, "step": 9400 }, { "epoch": 0.9725223834516826, "grad_norm": 5.7434401512146, "learning_rate": 1.807458535077779e-05, "loss": 1.3046, "step": 9450 }, { "epoch": 0.9776680045281465, "grad_norm": 8.578060150146484, "learning_rate": 1.806428350674771e-05, "loss": 1.3586, "step": 9500 }, { "epoch": 0.9776680045281465, "eval_accuracy": 0.9511997699737549, "eval_loss": 0.9817301034927368, "eval_runtime": 417.7965, "eval_samples_per_second": 82.693, "eval_steps_per_second": 2.585, "step": 9500 }, { "epoch": 0.9828136256046105, "grad_norm": 6.870723247528076, "learning_rate": 1.8053981662717628e-05, "loss": 1.4631, "step": 9550 }, { "epoch": 0.9879592466810744, "grad_norm": 8.596879005432129, "learning_rate": 1.8043679818687546e-05, "loss": 1.3113, "step": 9600 }, { "epoch": 0.9931048677575384, "grad_norm": 5.606679439544678, "learning_rate": 1.8033377974657464e-05, "loss": 1.2972, "step": 9650 }, { "epoch": 0.9982504888340022, "grad_norm": 1.0621393918991089, "learning_rate": 1.8023076130627383e-05, "loss": 1.3793, "step": 9700 }, { "epoch": 1.0033961099104662, "grad_norm": 1.193249225616455, "learning_rate": 1.8012774286597305e-05, "loss": 1.1729, "step": 9750 }, { "epoch": 1.0033961099104662, "eval_accuracy": 0.9509392380714417, "eval_loss": 0.9846755266189575, "eval_runtime": 417.935, "eval_samples_per_second": 82.666, "eval_steps_per_second": 2.584, "step": 9750 }, { "epoch": 1.0085417309869302, "grad_norm": 10.64986515045166, "learning_rate": 1.800247244256722e-05, "loss": 1.2009, "step": 9800 }, { "epoch": 1.013687352063394, "grad_norm": 9.815643310546875, "learning_rate": 1.799217059853714e-05, "loss": 1.2576, "step": 9850 }, { "epoch": 1.018832973139858, "grad_norm": 9.344294548034668, "learning_rate": 1.798186875450706e-05, "loss": 1.3483, "step": 9900 }, { "epoch": 1.023978594216322, "grad_norm": 2.3761701583862305, "learning_rate": 1.7971566910476975e-05, "loss": 1.2609, "step": 9950 }, { "epoch": 1.0291242152927857, "grad_norm": 9.36589527130127, "learning_rate": 1.7961265066446897e-05, "loss": 1.3099, "step": 10000 }, { "epoch": 1.0291242152927857, "eval_accuracy": 0.9513155221939087, "eval_loss": 0.9894696474075317, "eval_runtime": 417.5493, "eval_samples_per_second": 82.742, "eval_steps_per_second": 2.587, "step": 10000 }, { "epoch": 1.0342698363692497, "grad_norm": 14.563089370727539, "learning_rate": 1.7950963222416815e-05, "loss": 1.2224, "step": 10050 }, { "epoch": 1.0394154574457137, "grad_norm": 11.867334365844727, "learning_rate": 1.7940661378386734e-05, "loss": 1.3552, "step": 10100 }, { "epoch": 1.0445610785221777, "grad_norm": 1.510968565940857, "learning_rate": 1.7930359534356652e-05, "loss": 1.3508, "step": 10150 }, { "epoch": 1.0497066995986415, "grad_norm": 0.8010023832321167, "learning_rate": 1.792005769032657e-05, "loss": 1.3242, "step": 10200 }, { "epoch": 1.0548523206751055, "grad_norm": 5.283142566680908, "learning_rate": 1.790975584629649e-05, "loss": 1.2287, "step": 10250 }, { "epoch": 1.0548523206751055, "eval_accuracy": 0.951170802116394, "eval_loss": 0.9977254867553711, "eval_runtime": 417.1052, "eval_samples_per_second": 82.83, "eval_steps_per_second": 2.589, "step": 10250 }, { "epoch": 1.0599979417515695, "grad_norm": 6.146693706512451, "learning_rate": 1.7899454002266407e-05, "loss": 1.2863, "step": 10300 }, { "epoch": 1.0651435628280332, "grad_norm": 9.038339614868164, "learning_rate": 1.7889152158236326e-05, "loss": 1.2377, "step": 10350 }, { "epoch": 1.0702891839044972, "grad_norm": 8.985528945922852, "learning_rate": 1.7878850314206244e-05, "loss": 1.3058, "step": 10400 }, { "epoch": 1.0754348049809612, "grad_norm": 6.91862154006958, "learning_rate": 1.7868548470176162e-05, "loss": 1.3013, "step": 10450 }, { "epoch": 1.0805804260574252, "grad_norm": 4.811442852020264, "learning_rate": 1.785824662614608e-05, "loss": 1.3233, "step": 10500 }, { "epoch": 1.0805804260574252, "eval_accuracy": 0.9488263130187988, "eval_loss": 0.9947823286056519, "eval_runtime": 418.0737, "eval_samples_per_second": 82.639, "eval_steps_per_second": 2.583, "step": 10500 }, { "epoch": 1.085726047133889, "grad_norm": 3.9576923847198486, "learning_rate": 1.7847944782116e-05, "loss": 1.334, "step": 10550 }, { "epoch": 1.090871668210353, "grad_norm": 11.280867576599121, "learning_rate": 1.783764293808592e-05, "loss": 1.246, "step": 10600 }, { "epoch": 1.096017289286817, "grad_norm": 10.32507038116455, "learning_rate": 1.7827341094055836e-05, "loss": 1.2298, "step": 10650 }, { "epoch": 1.1011629103632807, "grad_norm": 9.05435848236084, "learning_rate": 1.7817039250025755e-05, "loss": 1.2016, "step": 10700 }, { "epoch": 1.1063085314397447, "grad_norm": 10.334163665771484, "learning_rate": 1.7806737405995676e-05, "loss": 1.3035, "step": 10750 }, { "epoch": 1.1063085314397447, "eval_accuracy": 0.9506497979164124, "eval_loss": 0.9946981072425842, "eval_runtime": 417.8974, "eval_samples_per_second": 82.673, "eval_steps_per_second": 2.584, "step": 10750 }, { "epoch": 1.1114541525162087, "grad_norm": 11.852724075317383, "learning_rate": 1.779643556196559e-05, "loss": 1.2457, "step": 10800 }, { "epoch": 1.1165997735926727, "grad_norm": 10.447225570678711, "learning_rate": 1.7786133717935513e-05, "loss": 1.2882, "step": 10850 }, { "epoch": 1.1217453946691365, "grad_norm": 3.3465206623077393, "learning_rate": 1.777583187390543e-05, "loss": 1.2365, "step": 10900 }, { "epoch": 1.1268910157456005, "grad_norm": 6.849998950958252, "learning_rate": 1.776553002987535e-05, "loss": 1.19, "step": 10950 }, { "epoch": 1.1320366368220645, "grad_norm": 11.492406845092773, "learning_rate": 1.775522818584527e-05, "loss": 1.2377, "step": 11000 }, { "epoch": 1.1320366368220645, "eval_accuracy": 0.9511129260063171, "eval_loss": 0.9914972186088562, "eval_runtime": 417.922, "eval_samples_per_second": 82.669, "eval_steps_per_second": 2.584, "step": 11000 }, { "epoch": 1.1371822578985284, "grad_norm": 7.080196857452393, "learning_rate": 1.7744926341815187e-05, "loss": 1.3028, "step": 11050 }, { "epoch": 1.1423278789749922, "grad_norm": 3.5371875762939453, "learning_rate": 1.7734624497785105e-05, "loss": 1.319, "step": 11100 }, { "epoch": 1.1474735000514562, "grad_norm": 5.618402004241943, "learning_rate": 1.7724322653755024e-05, "loss": 1.3315, "step": 11150 }, { "epoch": 1.1526191211279202, "grad_norm": 6.200303554534912, "learning_rate": 1.7714020809724942e-05, "loss": 1.2161, "step": 11200 }, { "epoch": 1.157764742204384, "grad_norm": 8.898612976074219, "learning_rate": 1.770371896569486e-05, "loss": 1.3555, "step": 11250 }, { "epoch": 1.157764742204384, "eval_accuracy": 0.9510550498962402, "eval_loss": 0.990160346031189, "eval_runtime": 417.6517, "eval_samples_per_second": 82.722, "eval_steps_per_second": 2.586, "step": 11250 }, { "epoch": 1.162910363280848, "grad_norm": 4.882264137268066, "learning_rate": 1.769341712166478e-05, "loss": 1.1874, "step": 11300 }, { "epoch": 1.168055984357312, "grad_norm": 3.1759836673736572, "learning_rate": 1.7683115277634697e-05, "loss": 1.2373, "step": 11350 }, { "epoch": 1.173201605433776, "grad_norm": 13.944663047790527, "learning_rate": 1.7672813433604616e-05, "loss": 1.2474, "step": 11400 }, { "epoch": 1.1783472265102397, "grad_norm": 6.393034934997559, "learning_rate": 1.7662511589574534e-05, "loss": 1.2838, "step": 11450 }, { "epoch": 1.1834928475867037, "grad_norm": 9.834447860717773, "learning_rate": 1.7652209745544453e-05, "loss": 1.2242, "step": 11500 }, { "epoch": 1.1834928475867037, "eval_accuracy": 0.9518075585365295, "eval_loss": 0.992717444896698, "eval_runtime": 417.5743, "eval_samples_per_second": 82.737, "eval_steps_per_second": 2.586, "step": 11500 }, { "epoch": 1.1886384686631677, "grad_norm": 7.190279006958008, "learning_rate": 1.764190790151437e-05, "loss": 1.3123, "step": 11550 }, { "epoch": 1.1937840897396317, "grad_norm": 3.1001381874084473, "learning_rate": 1.7631606057484293e-05, "loss": 1.2874, "step": 11600 }, { "epoch": 1.1989297108160955, "grad_norm": 10.424577713012695, "learning_rate": 1.7621304213454208e-05, "loss": 1.2568, "step": 11650 }, { "epoch": 1.2040753318925594, "grad_norm": 2.9702112674713135, "learning_rate": 1.761100236942413e-05, "loss": 1.2526, "step": 11700 }, { "epoch": 1.2092209529690234, "grad_norm": 4.956679821014404, "learning_rate": 1.7600700525394048e-05, "loss": 1.347, "step": 11750 }, { "epoch": 1.2092209529690234, "eval_accuracy": 0.9508523941040039, "eval_loss": 0.9882821440696716, "eval_runtime": 417.5587, "eval_samples_per_second": 82.74, "eval_steps_per_second": 2.586, "step": 11750 }, { "epoch": 1.2143665740454872, "grad_norm": 7.329675674438477, "learning_rate": 1.7590398681363963e-05, "loss": 1.3098, "step": 11800 }, { "epoch": 1.2195121951219512, "grad_norm": 2.8485448360443115, "learning_rate": 1.7580096837333885e-05, "loss": 1.2541, "step": 11850 }, { "epoch": 1.2246578161984152, "grad_norm": 13.313427925109863, "learning_rate": 1.7569794993303803e-05, "loss": 1.2791, "step": 11900 }, { "epoch": 1.2298034372748792, "grad_norm": 10.920377731323242, "learning_rate": 1.7559493149273722e-05, "loss": 1.2333, "step": 11950 }, { "epoch": 1.234949058351343, "grad_norm": 3.033597946166992, "learning_rate": 1.754919130524364e-05, "loss": 1.3827, "step": 12000 }, { "epoch": 1.234949058351343, "eval_accuracy": 0.9507366418838501, "eval_loss": 0.9942870140075684, "eval_runtime": 417.7023, "eval_samples_per_second": 82.712, "eval_steps_per_second": 2.586, "step": 12000 }, { "epoch": 1.240094679427807, "grad_norm": 1.0334879159927368, "learning_rate": 1.753888946121356e-05, "loss": 1.2732, "step": 12050 }, { "epoch": 1.245240300504271, "grad_norm": 7.173407077789307, "learning_rate": 1.7528587617183477e-05, "loss": 1.2993, "step": 12100 }, { "epoch": 1.250385921580735, "grad_norm": 15.351693153381348, "learning_rate": 1.7518285773153396e-05, "loss": 1.2947, "step": 12150 }, { "epoch": 1.2555315426571987, "grad_norm": 13.320657730102539, "learning_rate": 1.7507983929123314e-05, "loss": 1.3001, "step": 12200 }, { "epoch": 1.2606771637336627, "grad_norm": 4.0671186447143555, "learning_rate": 1.7497682085093232e-05, "loss": 1.2957, "step": 12250 }, { "epoch": 1.2606771637336627, "eval_accuracy": 0.9514023661613464, "eval_loss": 0.9864968657493591, "eval_runtime": 417.8412, "eval_samples_per_second": 82.685, "eval_steps_per_second": 2.585, "step": 12250 }, { "epoch": 1.2658227848101267, "grad_norm": 7.0425519943237305, "learning_rate": 1.748738024106315e-05, "loss": 1.1393, "step": 12300 }, { "epoch": 1.2709684058865904, "grad_norm": 4.306710243225098, "learning_rate": 1.747707839703307e-05, "loss": 1.2996, "step": 12350 }, { "epoch": 1.2761140269630544, "grad_norm": 10.586379051208496, "learning_rate": 1.7466776553002988e-05, "loss": 1.3218, "step": 12400 }, { "epoch": 1.2812596480395184, "grad_norm": 6.002781867980957, "learning_rate": 1.745647470897291e-05, "loss": 1.2138, "step": 12450 }, { "epoch": 1.2864052691159822, "grad_norm": 7.406036853790283, "learning_rate": 1.7446172864942825e-05, "loss": 1.1731, "step": 12500 }, { "epoch": 1.2864052691159822, "eval_accuracy": 0.9509682059288025, "eval_loss": 0.9963937997817993, "eval_runtime": 417.622, "eval_samples_per_second": 82.728, "eval_steps_per_second": 2.586, "step": 12500 }, { "epoch": 1.2915508901924462, "grad_norm": 11.54760456085205, "learning_rate": 1.7436077057793347e-05, "loss": 1.3326, "step": 12550 }, { "epoch": 1.2966965112689102, "grad_norm": 3.4204094409942627, "learning_rate": 1.7425775213763265e-05, "loss": 1.3575, "step": 12600 }, { "epoch": 1.3018421323453742, "grad_norm": 9.140461921691895, "learning_rate": 1.7415473369733184e-05, "loss": 1.2948, "step": 12650 }, { "epoch": 1.3069877534218381, "grad_norm": 6.069116592407227, "learning_rate": 1.7405171525703102e-05, "loss": 1.2921, "step": 12700 }, { "epoch": 1.312133374498302, "grad_norm": 13.66699504852295, "learning_rate": 1.739486968167302e-05, "loss": 1.3052, "step": 12750 }, { "epoch": 1.312133374498302, "eval_accuracy": 0.9509103298187256, "eval_loss": 0.9840078949928284, "eval_runtime": 418.1556, "eval_samples_per_second": 82.622, "eval_steps_per_second": 2.583, "step": 12750 }, { "epoch": 1.317278995574766, "grad_norm": 6.949051380157471, "learning_rate": 1.738456783764294e-05, "loss": 1.3662, "step": 12800 }, { "epoch": 1.32242461665123, "grad_norm": 9.286051750183105, "learning_rate": 1.7374265993612858e-05, "loss": 1.3673, "step": 12850 }, { "epoch": 1.3275702377276937, "grad_norm": 9.19774341583252, "learning_rate": 1.7363964149582776e-05, "loss": 1.3006, "step": 12900 }, { "epoch": 1.3327158588041577, "grad_norm": 5.003039360046387, "learning_rate": 1.7353662305552694e-05, "loss": 1.4217, "step": 12950 }, { "epoch": 1.3378614798806217, "grad_norm": 4.849103927612305, "learning_rate": 1.7343360461522613e-05, "loss": 1.1608, "step": 13000 }, { "epoch": 1.3378614798806217, "eval_accuracy": 0.9520102143287659, "eval_loss": 0.98476642370224, "eval_runtime": 417.8875, "eval_samples_per_second": 82.675, "eval_steps_per_second": 2.584, "step": 13000 }, { "epoch": 1.3430071009570854, "grad_norm": 3.944049596786499, "learning_rate": 1.7333058617492535e-05, "loss": 1.2066, "step": 13050 }, { "epoch": 1.3481527220335494, "grad_norm": 8.767118453979492, "learning_rate": 1.732275677346245e-05, "loss": 1.408, "step": 13100 }, { "epoch": 1.3532983431100134, "grad_norm": 8.175588607788086, "learning_rate": 1.7312454929432368e-05, "loss": 1.3574, "step": 13150 }, { "epoch": 1.3584439641864772, "grad_norm": 5.246455192565918, "learning_rate": 1.730215308540229e-05, "loss": 1.3171, "step": 13200 }, { "epoch": 1.3635895852629412, "grad_norm": 8.986821174621582, "learning_rate": 1.7291851241372205e-05, "loss": 1.3188, "step": 13250 }, { "epoch": 1.3635895852629412, "eval_accuracy": 0.9502446055412292, "eval_loss": 0.9888262152671814, "eval_runtime": 418.0556, "eval_samples_per_second": 82.642, "eval_steps_per_second": 2.583, "step": 13250 }, { "epoch": 1.3687352063394052, "grad_norm": 3.4874706268310547, "learning_rate": 1.7281549397342127e-05, "loss": 1.299, "step": 13300 }, { "epoch": 1.3738808274158691, "grad_norm": 5.339372158050537, "learning_rate": 1.7271247553312045e-05, "loss": 1.3015, "step": 13350 }, { "epoch": 1.3790264484923331, "grad_norm": 0.7593218684196472, "learning_rate": 1.7260945709281964e-05, "loss": 1.3159, "step": 13400 }, { "epoch": 1.384172069568797, "grad_norm": 6.2086896896362305, "learning_rate": 1.7250643865251882e-05, "loss": 1.2139, "step": 13450 }, { "epoch": 1.389317690645261, "grad_norm": 8.667464256286621, "learning_rate": 1.72403420212218e-05, "loss": 1.2855, "step": 13500 }, { "epoch": 1.389317690645261, "eval_accuracy": 0.9513733983039856, "eval_loss": 0.9957149624824524, "eval_runtime": 417.8702, "eval_samples_per_second": 82.679, "eval_steps_per_second": 2.585, "step": 13500 }, { "epoch": 1.3944633117217249, "grad_norm": 11.632777214050293, "learning_rate": 1.723004017719172e-05, "loss": 1.2705, "step": 13550 }, { "epoch": 1.3996089327981887, "grad_norm": 24.493167877197266, "learning_rate": 1.7219738333161637e-05, "loss": 1.3099, "step": 13600 }, { "epoch": 1.4047545538746526, "grad_norm": 6.20335054397583, "learning_rate": 1.7209436489131556e-05, "loss": 1.3144, "step": 13650 }, { "epoch": 1.4099001749511166, "grad_norm": 9.66215991973877, "learning_rate": 1.7199134645101474e-05, "loss": 1.2948, "step": 13700 }, { "epoch": 1.4150457960275804, "grad_norm": 3.034616470336914, "learning_rate": 1.7188832801071393e-05, "loss": 1.3313, "step": 13750 }, { "epoch": 1.4150457960275804, "eval_accuracy": 0.9511997699737549, "eval_loss": 0.9909895658493042, "eval_runtime": 417.5787, "eval_samples_per_second": 82.737, "eval_steps_per_second": 2.586, "step": 13750 }, { "epoch": 1.4201914171040444, "grad_norm": 5.056656360626221, "learning_rate": 1.7178530957041314e-05, "loss": 1.3473, "step": 13800 }, { "epoch": 1.4253370381805084, "grad_norm": 2.1632890701293945, "learning_rate": 1.716822911301123e-05, "loss": 1.2037, "step": 13850 }, { "epoch": 1.4304826592569724, "grad_norm": 8.617193222045898, "learning_rate": 1.7157927268981148e-05, "loss": 1.3059, "step": 13900 }, { "epoch": 1.4356282803334364, "grad_norm": 4.062990188598633, "learning_rate": 1.714762542495107e-05, "loss": 1.3763, "step": 13950 }, { "epoch": 1.4407739014099001, "grad_norm": 8.483048439025879, "learning_rate": 1.7137323580920985e-05, "loss": 1.2606, "step": 14000 }, { "epoch": 1.4407739014099001, "eval_accuracy": 0.9522996544837952, "eval_loss": 0.9875785708427429, "eval_runtime": 417.781, "eval_samples_per_second": 82.696, "eval_steps_per_second": 2.585, "step": 14000 }, { "epoch": 1.4459195224863641, "grad_norm": 8.226116180419922, "learning_rate": 1.7127021736890906e-05, "loss": 1.2394, "step": 14050 }, { "epoch": 1.4510651435628281, "grad_norm": 0.5191435813903809, "learning_rate": 1.711671989286082e-05, "loss": 1.219, "step": 14100 }, { "epoch": 1.4562107646392919, "grad_norm": 8.271252632141113, "learning_rate": 1.7106418048830743e-05, "loss": 1.3501, "step": 14150 }, { "epoch": 1.4613563857157559, "grad_norm": 6.9849066734313965, "learning_rate": 1.7096116204800662e-05, "loss": 1.2664, "step": 14200 }, { "epoch": 1.4665020067922199, "grad_norm": 3.286569595336914, "learning_rate": 1.7085814360770577e-05, "loss": 1.2704, "step": 14250 }, { "epoch": 1.4665020067922199, "eval_accuracy": 0.9512576460838318, "eval_loss": 0.994490385055542, "eval_runtime": 418.1263, "eval_samples_per_second": 82.628, "eval_steps_per_second": 2.583, "step": 14250 }, { "epoch": 1.4716476278686836, "grad_norm": 6.6526618003845215, "learning_rate": 1.70755125167405e-05, "loss": 1.2332, "step": 14300 }, { "epoch": 1.4767932489451476, "grad_norm": 7.4313578605651855, "learning_rate": 1.7065210672710417e-05, "loss": 1.2286, "step": 14350 }, { "epoch": 1.4819388700216116, "grad_norm": 6.093780517578125, "learning_rate": 1.7054908828680335e-05, "loss": 1.2123, "step": 14400 }, { "epoch": 1.4870844910980756, "grad_norm": 6.429713726043701, "learning_rate": 1.7044606984650254e-05, "loss": 1.2437, "step": 14450 }, { "epoch": 1.4922301121745396, "grad_norm": 8.225885391235352, "learning_rate": 1.7034305140620172e-05, "loss": 1.2292, "step": 14500 }, { "epoch": 1.4922301121745396, "eval_accuracy": 0.9502446055412292, "eval_loss": 0.9886476993560791, "eval_runtime": 417.1781, "eval_samples_per_second": 82.816, "eval_steps_per_second": 2.589, "step": 14500 }, { "epoch": 1.4973757332510034, "grad_norm": 6.095223903656006, "learning_rate": 1.7024209333470695e-05, "loss": 1.3007, "step": 14550 }, { "epoch": 1.5025213543274674, "grad_norm": 12.490996360778809, "learning_rate": 1.701390748944061e-05, "loss": 1.308, "step": 14600 }, { "epoch": 1.5076669754039314, "grad_norm": 9.118165016174316, "learning_rate": 1.700360564541053e-05, "loss": 1.174, "step": 14650 }, { "epoch": 1.5128125964803951, "grad_norm": 6.5648722648620605, "learning_rate": 1.6993303801380447e-05, "loss": 1.2648, "step": 14700 }, { "epoch": 1.5179582175568591, "grad_norm": 8.813359260559082, "learning_rate": 1.698300195735037e-05, "loss": 1.2533, "step": 14750 }, { "epoch": 1.5179582175568591, "eval_accuracy": 0.9517496824264526, "eval_loss": 0.9885143041610718, "eval_runtime": 417.9702, "eval_samples_per_second": 82.659, "eval_steps_per_second": 2.584, "step": 14750 }, { "epoch": 1.523103838633323, "grad_norm": 1.7033747434616089, "learning_rate": 1.6972700113320287e-05, "loss": 1.2576, "step": 14800 }, { "epoch": 1.5282494597097869, "grad_norm": 5.316808700561523, "learning_rate": 1.6962398269290202e-05, "loss": 1.3659, "step": 14850 }, { "epoch": 1.5333950807862509, "grad_norm": 3.3904647827148438, "learning_rate": 1.6952096425260124e-05, "loss": 1.298, "step": 14900 }, { "epoch": 1.5385407018627149, "grad_norm": 0.8259275555610657, "learning_rate": 1.6941794581230042e-05, "loss": 1.2723, "step": 14950 }, { "epoch": 1.5436863229391786, "grad_norm": 7.67642068862915, "learning_rate": 1.693149273719996e-05, "loss": 1.3099, "step": 15000 }, { "epoch": 1.5436863229391786, "eval_accuracy": 0.9517786502838135, "eval_loss": 0.9875179529190063, "eval_runtime": 416.9745, "eval_samples_per_second": 82.856, "eval_steps_per_second": 2.59, "step": 15000 }, { "epoch": 1.5488319440156428, "grad_norm": 2.3492562770843506, "learning_rate": 1.692119089316988e-05, "loss": 1.2984, "step": 15050 }, { "epoch": 1.5539775650921066, "grad_norm": 5.415560722351074, "learning_rate": 1.6910889049139797e-05, "loss": 1.2128, "step": 15100 }, { "epoch": 1.5591231861685706, "grad_norm": 12.2908935546875, "learning_rate": 1.6900587205109716e-05, "loss": 1.2689, "step": 15150 }, { "epoch": 1.5642688072450346, "grad_norm": 8.375056266784668, "learning_rate": 1.6890285361079634e-05, "loss": 1.2516, "step": 15200 }, { "epoch": 1.5694144283214984, "grad_norm": 9.067890167236328, "learning_rate": 1.6879983517049553e-05, "loss": 1.3028, "step": 15250 }, { "epoch": 1.5694144283214984, "eval_accuracy": 0.9523285627365112, "eval_loss": 0.9856404066085815, "eval_runtime": 417.1329, "eval_samples_per_second": 82.825, "eval_steps_per_second": 2.589, "step": 15250 }, { "epoch": 1.5745600493979623, "grad_norm": 10.568164825439453, "learning_rate": 1.686968167301947e-05, "loss": 1.3619, "step": 15300 }, { "epoch": 1.5797056704744263, "grad_norm": 15.765814781188965, "learning_rate": 1.685937982898939e-05, "loss": 1.3524, "step": 15350 }, { "epoch": 1.58485129155089, "grad_norm": 11.065564155578613, "learning_rate": 1.684907798495931e-05, "loss": 1.1749, "step": 15400 }, { "epoch": 1.589996912627354, "grad_norm": 7.860668659210205, "learning_rate": 1.6838776140929226e-05, "loss": 1.205, "step": 15450 }, { "epoch": 1.595142533703818, "grad_norm": 2.4386684894561768, "learning_rate": 1.6828474296899148e-05, "loss": 1.297, "step": 15500 }, { "epoch": 1.595142533703818, "eval_accuracy": 0.9513155221939087, "eval_loss": 0.9780011177062988, "eval_runtime": 418.0332, "eval_samples_per_second": 82.647, "eval_steps_per_second": 2.584, "step": 15500 }, { "epoch": 1.6002881547802819, "grad_norm": 7.5391316413879395, "learning_rate": 1.6818172452869067e-05, "loss": 1.2469, "step": 15550 }, { "epoch": 1.605433775856746, "grad_norm": 9.402176856994629, "learning_rate": 1.680787060883898e-05, "loss": 1.2285, "step": 15600 }, { "epoch": 1.6105793969332098, "grad_norm": 5.171482563018799, "learning_rate": 1.6797568764808903e-05, "loss": 1.2963, "step": 15650 }, { "epoch": 1.6157250180096736, "grad_norm": 7.366409778594971, "learning_rate": 1.678726692077882e-05, "loss": 1.2406, "step": 15700 }, { "epoch": 1.6208706390861378, "grad_norm": 10.613348007202148, "learning_rate": 1.677696507674874e-05, "loss": 1.3049, "step": 15750 }, { "epoch": 1.6208706390861378, "eval_accuracy": 0.9511997699737549, "eval_loss": 0.9873180389404297, "eval_runtime": 417.7747, "eval_samples_per_second": 82.698, "eval_steps_per_second": 2.585, "step": 15750 }, { "epoch": 1.6260162601626016, "grad_norm": 3.9607322216033936, "learning_rate": 1.676666323271866e-05, "loss": 1.2174, "step": 15800 }, { "epoch": 1.6311618812390656, "grad_norm": 8.552703857421875, "learning_rate": 1.6756361388688577e-05, "loss": 1.2789, "step": 15850 }, { "epoch": 1.6363075023155296, "grad_norm": 5.216203689575195, "learning_rate": 1.6746059544658496e-05, "loss": 1.289, "step": 15900 }, { "epoch": 1.6414531233919933, "grad_norm": 7.981589317321777, "learning_rate": 1.6735757700628414e-05, "loss": 1.3242, "step": 15950 }, { "epoch": 1.6465987444684573, "grad_norm": 9.827128410339355, "learning_rate": 1.6725455856598332e-05, "loss": 1.2974, "step": 16000 }, { "epoch": 1.6465987444684573, "eval_accuracy": 0.9522417187690735, "eval_loss": 0.9755061268806458, "eval_runtime": 418.3094, "eval_samples_per_second": 82.592, "eval_steps_per_second": 2.582, "step": 16000 }, { "epoch": 1.6517443655449213, "grad_norm": 8.793742179870605, "learning_rate": 1.671515401256825e-05, "loss": 1.2741, "step": 16050 }, { "epoch": 1.656889986621385, "grad_norm": 4.681251049041748, "learning_rate": 1.670485216853817e-05, "loss": 1.1625, "step": 16100 }, { "epoch": 1.662035607697849, "grad_norm": 9.398008346557617, "learning_rate": 1.6694550324508088e-05, "loss": 1.2795, "step": 16150 }, { "epoch": 1.667181228774313, "grad_norm": 7.628296852111816, "learning_rate": 1.6684248480478006e-05, "loss": 1.2301, "step": 16200 }, { "epoch": 1.6723268498507768, "grad_norm": 7.104902267456055, "learning_rate": 1.6673946636447928e-05, "loss": 1.2348, "step": 16250 }, { "epoch": 1.6723268498507768, "eval_accuracy": 0.952791690826416, "eval_loss": 0.980122447013855, "eval_runtime": 418.027, "eval_samples_per_second": 82.648, "eval_steps_per_second": 2.584, "step": 16250 }, { "epoch": 1.677472470927241, "grad_norm": 6.678224563598633, "learning_rate": 1.6663644792417843e-05, "loss": 1.2408, "step": 16300 }, { "epoch": 1.6826180920037048, "grad_norm": 13.851053237915039, "learning_rate": 1.665334294838776e-05, "loss": 1.2477, "step": 16350 }, { "epoch": 1.6877637130801688, "grad_norm": 3.6658806800842285, "learning_rate": 1.6643041104357683e-05, "loss": 1.3386, "step": 16400 }, { "epoch": 1.6929093341566328, "grad_norm": 5.4644927978515625, "learning_rate": 1.6632739260327598e-05, "loss": 1.2346, "step": 16450 }, { "epoch": 1.6980549552330966, "grad_norm": 1.6028341054916382, "learning_rate": 1.662243741629752e-05, "loss": 1.2904, "step": 16500 }, { "epoch": 1.6980549552330966, "eval_accuracy": 0.9520391225814819, "eval_loss": 0.9905561208724976, "eval_runtime": 417.9366, "eval_samples_per_second": 82.666, "eval_steps_per_second": 2.584, "step": 16500 }, { "epoch": 1.7032005763095606, "grad_norm": 0.9734807014465332, "learning_rate": 1.661234160914804e-05, "loss": 1.2947, "step": 16550 }, { "epoch": 1.7083461973860246, "grad_norm": 3.4319236278533936, "learning_rate": 1.6602039765117958e-05, "loss": 1.2572, "step": 16600 }, { "epoch": 1.7134918184624883, "grad_norm": 3.019766092300415, "learning_rate": 1.6591737921087876e-05, "loss": 1.2738, "step": 16650 }, { "epoch": 1.7186374395389523, "grad_norm": 9.71827507019043, "learning_rate": 1.6581436077057794e-05, "loss": 1.2686, "step": 16700 }, { "epoch": 1.7237830606154163, "grad_norm": 5.171957969665527, "learning_rate": 1.6571134233027713e-05, "loss": 1.4041, "step": 16750 }, { "epoch": 1.7237830606154163, "eval_accuracy": 0.952791690826416, "eval_loss": 0.9791179895401001, "eval_runtime": 418.3742, "eval_samples_per_second": 82.579, "eval_steps_per_second": 2.581, "step": 16750 }, { "epoch": 1.72892868169188, "grad_norm": 5.277884006500244, "learning_rate": 1.656083238899763e-05, "loss": 1.2935, "step": 16800 }, { "epoch": 1.7340743027683443, "grad_norm": 10.89902400970459, "learning_rate": 1.655053054496755e-05, "loss": 1.2501, "step": 16850 }, { "epoch": 1.739219923844808, "grad_norm": 2.373206377029419, "learning_rate": 1.6540434737818072e-05, "loss": 1.3208, "step": 16900 }, { "epoch": 1.744365544921272, "grad_norm": 1.7645074129104614, "learning_rate": 1.653013289378799e-05, "loss": 1.2486, "step": 16950 }, { "epoch": 1.749511165997736, "grad_norm": 3.979423999786377, "learning_rate": 1.651983104975791e-05, "loss": 1.2587, "step": 17000 }, { "epoch": 1.749511165997736, "eval_accuracy": 0.9519522786140442, "eval_loss": 0.9862294793128967, "eval_runtime": 417.504, "eval_samples_per_second": 82.751, "eval_steps_per_second": 2.587, "step": 17000 }, { "epoch": 1.7546567870741998, "grad_norm": 3.5347177982330322, "learning_rate": 1.6509529205727824e-05, "loss": 1.3325, "step": 17050 }, { "epoch": 1.7598024081506638, "grad_norm": 5.752897262573242, "learning_rate": 1.6499227361697746e-05, "loss": 1.3104, "step": 17100 }, { "epoch": 1.7649480292271278, "grad_norm": 8.936431884765625, "learning_rate": 1.6488925517667664e-05, "loss": 1.2504, "step": 17150 }, { "epoch": 1.7700936503035916, "grad_norm": 11.348810195922852, "learning_rate": 1.6478623673637583e-05, "loss": 1.3153, "step": 17200 }, { "epoch": 1.7752392713800556, "grad_norm": 8.096456527709961, "learning_rate": 1.64683218296075e-05, "loss": 1.328, "step": 17250 }, { "epoch": 1.7752392713800556, "eval_accuracy": 0.9529942870140076, "eval_loss": 0.9803459644317627, "eval_runtime": 417.43, "eval_samples_per_second": 82.766, "eval_steps_per_second": 2.587, "step": 17250 }, { "epoch": 1.7803848924565195, "grad_norm": 4.984877586364746, "learning_rate": 1.645801998557742e-05, "loss": 1.3417, "step": 17300 }, { "epoch": 1.7855305135329833, "grad_norm": 8.615971565246582, "learning_rate": 1.6447718141547338e-05, "loss": 1.2486, "step": 17350 }, { "epoch": 1.7906761346094475, "grad_norm": 6.480031490325928, "learning_rate": 1.6437416297517256e-05, "loss": 1.2869, "step": 17400 }, { "epoch": 1.7958217556859113, "grad_norm": 3.220890522003174, "learning_rate": 1.6427114453487175e-05, "loss": 1.3599, "step": 17450 }, { "epoch": 1.800967376762375, "grad_norm": 6.310009956359863, "learning_rate": 1.6416812609457093e-05, "loss": 1.2822, "step": 17500 }, { "epoch": 1.800967376762375, "eval_accuracy": 0.9526180028915405, "eval_loss": 0.9846508502960205, "eval_runtime": 417.34, "eval_samples_per_second": 82.784, "eval_steps_per_second": 2.588, "step": 17500 }, { "epoch": 1.8061129978388393, "grad_norm": 9.1428804397583, "learning_rate": 1.640651076542701e-05, "loss": 1.3001, "step": 17550 }, { "epoch": 1.811258618915303, "grad_norm": 11.503830909729004, "learning_rate": 1.6396208921396934e-05, "loss": 1.0848, "step": 17600 }, { "epoch": 1.816404239991767, "grad_norm": 6.229241847991943, "learning_rate": 1.638590707736685e-05, "loss": 1.3171, "step": 17650 }, { "epoch": 1.821549861068231, "grad_norm": 6.697323799133301, "learning_rate": 1.637560523333677e-05, "loss": 1.3387, "step": 17700 }, { "epoch": 1.8266954821446948, "grad_norm": 12.814096450805664, "learning_rate": 1.636530338930669e-05, "loss": 1.2401, "step": 17750 }, { "epoch": 1.8266954821446948, "eval_accuracy": 0.9528206586837769, "eval_loss": 0.9803994297981262, "eval_runtime": 417.6448, "eval_samples_per_second": 82.723, "eval_steps_per_second": 2.586, "step": 17750 }, { "epoch": 1.8318411032211588, "grad_norm": 5.222059726715088, "learning_rate": 1.6355001545276604e-05, "loss": 1.2979, "step": 17800 }, { "epoch": 1.8369867242976228, "grad_norm": 3.9017868041992188, "learning_rate": 1.6344699701246526e-05, "loss": 1.2222, "step": 17850 }, { "epoch": 1.8421323453740865, "grad_norm": 7.524175643920898, "learning_rate": 1.6334397857216444e-05, "loss": 1.27, "step": 17900 }, { "epoch": 1.8472779664505505, "grad_norm": 4.3478593826293945, "learning_rate": 1.6324096013186362e-05, "loss": 1.3109, "step": 17950 }, { "epoch": 1.8524235875270145, "grad_norm": 8.614230155944824, "learning_rate": 1.631379416915628e-05, "loss": 1.2306, "step": 18000 }, { "epoch": 1.8524235875270145, "eval_accuracy": 0.9536600112915039, "eval_loss": 0.987566351890564, "eval_runtime": 417.5552, "eval_samples_per_second": 82.741, "eval_steps_per_second": 2.586, "step": 18000 }, { "epoch": 1.8575692086034783, "grad_norm": 7.108985900878906, "learning_rate": 1.63034923251262e-05, "loss": 1.1878, "step": 18050 }, { "epoch": 1.8627148296799425, "grad_norm": 10.433032989501953, "learning_rate": 1.6293190481096118e-05, "loss": 1.2398, "step": 18100 }, { "epoch": 1.8678604507564063, "grad_norm": 10.102560043334961, "learning_rate": 1.6282888637066036e-05, "loss": 1.2576, "step": 18150 }, { "epoch": 1.8730060718328703, "grad_norm": 4.380664348602295, "learning_rate": 1.6272586793035955e-05, "loss": 1.1579, "step": 18200 }, { "epoch": 1.8781516929093343, "grad_norm": 2.1999149322509766, "learning_rate": 1.6262284949005873e-05, "loss": 1.2889, "step": 18250 }, { "epoch": 1.8781516929093343, "eval_accuracy": 0.9519233703613281, "eval_loss": 0.9859423041343689, "eval_runtime": 417.7845, "eval_samples_per_second": 82.696, "eval_steps_per_second": 2.585, "step": 18250 }, { "epoch": 1.883297313985798, "grad_norm": 5.329191207885742, "learning_rate": 1.625198310497579e-05, "loss": 1.3331, "step": 18300 }, { "epoch": 1.888442935062262, "grad_norm": 17.370649337768555, "learning_rate": 1.624168126094571e-05, "loss": 1.2957, "step": 18350 }, { "epoch": 1.893588556138726, "grad_norm": 10.373506546020508, "learning_rate": 1.6231379416915628e-05, "loss": 1.2286, "step": 18400 }, { "epoch": 1.8987341772151898, "grad_norm": 13.445988655090332, "learning_rate": 1.622107757288555e-05, "loss": 1.2513, "step": 18450 }, { "epoch": 1.9038797982916538, "grad_norm": 11.915916442871094, "learning_rate": 1.6210775728855465e-05, "loss": 1.1702, "step": 18500 }, { "epoch": 1.9038797982916538, "eval_accuracy": 0.9540941715240479, "eval_loss": 0.9839755296707153, "eval_runtime": 417.3645, "eval_samples_per_second": 82.779, "eval_steps_per_second": 2.588, "step": 18500 }, { "epoch": 1.9090254193681178, "grad_norm": 13.058712005615234, "learning_rate": 1.6200473884825383e-05, "loss": 1.3181, "step": 18550 }, { "epoch": 1.9141710404445815, "grad_norm": 8.620599746704102, "learning_rate": 1.6190172040795305e-05, "loss": 1.1976, "step": 18600 }, { "epoch": 1.9193166615210457, "grad_norm": 7.074895858764648, "learning_rate": 1.617987019676522e-05, "loss": 1.3623, "step": 18650 }, { "epoch": 1.9244622825975095, "grad_norm": 10.293702125549316, "learning_rate": 1.6169568352735142e-05, "loss": 1.2594, "step": 18700 }, { "epoch": 1.9296079036739735, "grad_norm": 7.491464138031006, "learning_rate": 1.615926650870506e-05, "loss": 1.2902, "step": 18750 }, { "epoch": 1.9296079036739735, "eval_accuracy": 0.9522128105163574, "eval_loss": 0.9844051003456116, "eval_runtime": 418.2592, "eval_samples_per_second": 82.602, "eval_steps_per_second": 2.582, "step": 18750 }, { "epoch": 1.9347535247504375, "grad_norm": 2.0575156211853027, "learning_rate": 1.614896466467498e-05, "loss": 1.3283, "step": 18800 }, { "epoch": 1.9398991458269013, "grad_norm": 10.51094913482666, "learning_rate": 1.6138662820644897e-05, "loss": 1.2987, "step": 18850 }, { "epoch": 1.9450447669033653, "grad_norm": 6.296252727508545, "learning_rate": 1.6128567013495417e-05, "loss": 1.1987, "step": 18900 }, { "epoch": 1.9501903879798292, "grad_norm": 2.280179738998413, "learning_rate": 1.6118265169465335e-05, "loss": 1.2385, "step": 18950 }, { "epoch": 1.955336009056293, "grad_norm": 5.830591678619385, "learning_rate": 1.6107963325435253e-05, "loss": 1.2772, "step": 19000 }, { "epoch": 1.955336009056293, "eval_accuracy": 0.9533126950263977, "eval_loss": 0.9861400723457336, "eval_runtime": 418.0997, "eval_samples_per_second": 82.633, "eval_steps_per_second": 2.583, "step": 19000 }, { "epoch": 1.960481630132757, "grad_norm": 6.468533515930176, "learning_rate": 1.6097661481405172e-05, "loss": 1.1906, "step": 19050 }, { "epoch": 1.965627251209221, "grad_norm": 7.839109420776367, "learning_rate": 1.608735963737509e-05, "loss": 1.3041, "step": 19100 }, { "epoch": 1.9707728722856848, "grad_norm": 12.740795135498047, "learning_rate": 1.607705779334501e-05, "loss": 1.2345, "step": 19150 }, { "epoch": 1.975918493362149, "grad_norm": 7.1893134117126465, "learning_rate": 1.606675594931493e-05, "loss": 1.2586, "step": 19200 }, { "epoch": 1.9810641144386127, "grad_norm": 14.163928031921387, "learning_rate": 1.6056454105284846e-05, "loss": 1.196, "step": 19250 }, { "epoch": 1.9810641144386127, "eval_accuracy": 0.9521838426589966, "eval_loss": 0.9835113286972046, "eval_runtime": 417.4557, "eval_samples_per_second": 82.761, "eval_steps_per_second": 2.587, "step": 19250 }, { "epoch": 1.9862097355150765, "grad_norm": 5.9427618980407715, "learning_rate": 1.6046152261254767e-05, "loss": 1.2872, "step": 19300 }, { "epoch": 1.9913553565915407, "grad_norm": 14.67308235168457, "learning_rate": 1.6035850417224686e-05, "loss": 1.2449, "step": 19350 }, { "epoch": 1.9965009776680045, "grad_norm": 3.581702947616577, "learning_rate": 1.6025548573194604e-05, "loss": 1.2435, "step": 19400 }, { "epoch": 2.0016465987444683, "grad_norm": 13.742449760437012, "learning_rate": 1.6015246729164523e-05, "loss": 1.3096, "step": 19450 }, { "epoch": 2.0067922198209325, "grad_norm": 10.677633285522461, "learning_rate": 1.600494488513444e-05, "loss": 1.1697, "step": 19500 }, { "epoch": 2.0067922198209325, "eval_accuracy": 0.9514312744140625, "eval_loss": 1.0035802125930786, "eval_runtime": 418.5248, "eval_samples_per_second": 82.549, "eval_steps_per_second": 2.58, "step": 19500 }, { "epoch": 2.0119378408973962, "grad_norm": 6.8798418045043945, "learning_rate": 1.599464304110436e-05, "loss": 1.0556, "step": 19550 }, { "epoch": 2.0170834619738605, "grad_norm": 8.785711288452148, "learning_rate": 1.5984341197074278e-05, "loss": 1.1592, "step": 19600 }, { "epoch": 2.0222290830503242, "grad_norm": 11.857321739196777, "learning_rate": 1.5974039353044196e-05, "loss": 1.1808, "step": 19650 }, { "epoch": 2.027374704126788, "grad_norm": 3.2849769592285156, "learning_rate": 1.5963737509014115e-05, "loss": 1.141, "step": 19700 }, { "epoch": 2.032520325203252, "grad_norm": 7.293135166168213, "learning_rate": 1.5953435664984033e-05, "loss": 1.1139, "step": 19750 }, { "epoch": 2.032520325203252, "eval_accuracy": 0.9516628384590149, "eval_loss": 1.0205085277557373, "eval_runtime": 417.5267, "eval_samples_per_second": 82.747, "eval_steps_per_second": 2.587, "step": 19750 }, { "epoch": 2.037665946279716, "grad_norm": 10.044347763061523, "learning_rate": 1.594313382095395e-05, "loss": 1.1959, "step": 19800 }, { "epoch": 2.0428115673561797, "grad_norm": 0.14353907108306885, "learning_rate": 1.593283197692387e-05, "loss": 1.0762, "step": 19850 }, { "epoch": 2.047957188432644, "grad_norm": 9.524683952331543, "learning_rate": 1.592253013289379e-05, "loss": 1.3522, "step": 19900 }, { "epoch": 2.0531028095091077, "grad_norm": 12.576092720031738, "learning_rate": 1.5912228288863707e-05, "loss": 1.1175, "step": 19950 }, { "epoch": 2.0582484305855715, "grad_norm": 0.6325793862342834, "learning_rate": 1.5901926444833625e-05, "loss": 1.178, "step": 20000 }, { "epoch": 2.0582484305855715, "eval_accuracy": 0.951228678226471, "eval_loss": 1.018436074256897, "eval_runtime": 417.7277, "eval_samples_per_second": 82.707, "eval_steps_per_second": 2.585, "step": 20000 }, { "epoch": 2.0633940516620357, "grad_norm": 12.589435577392578, "learning_rate": 1.5891624600803547e-05, "loss": 1.1416, "step": 20050 }, { "epoch": 2.0685396727384995, "grad_norm": 4.007430076599121, "learning_rate": 1.5881322756773462e-05, "loss": 1.1523, "step": 20100 }, { "epoch": 2.0736852938149637, "grad_norm": 4.076907157897949, "learning_rate": 1.5871020912743384e-05, "loss": 1.2561, "step": 20150 }, { "epoch": 2.0788309148914275, "grad_norm": 1.5451477766036987, "learning_rate": 1.5860719068713302e-05, "loss": 1.119, "step": 20200 }, { "epoch": 2.0839765359678912, "grad_norm": 5.0513691902160645, "learning_rate": 1.5850417224683217e-05, "loss": 1.095, "step": 20250 }, { "epoch": 2.0839765359678912, "eval_accuracy": 0.95041823387146, "eval_loss": 1.0154516696929932, "eval_runtime": 418.0138, "eval_samples_per_second": 82.65, "eval_steps_per_second": 2.584, "step": 20250 }, { "epoch": 2.0891221570443554, "grad_norm": 9.095207214355469, "learning_rate": 1.584011538065314e-05, "loss": 1.1432, "step": 20300 }, { "epoch": 2.094267778120819, "grad_norm": 9.274755477905273, "learning_rate": 1.5829813536623058e-05, "loss": 1.1455, "step": 20350 }, { "epoch": 2.099413399197283, "grad_norm": 14.58219051361084, "learning_rate": 1.5819511692592976e-05, "loss": 1.0913, "step": 20400 }, { "epoch": 2.104559020273747, "grad_norm": 12.373740196228027, "learning_rate": 1.5809209848562894e-05, "loss": 1.1671, "step": 20450 }, { "epoch": 2.109704641350211, "grad_norm": 7.772844314575195, "learning_rate": 1.5798908004532813e-05, "loss": 1.2776, "step": 20500 }, { "epoch": 2.109704641350211, "eval_accuracy": 0.9514023661613464, "eval_loss": 1.0333930253982544, "eval_runtime": 418.0428, "eval_samples_per_second": 82.645, "eval_steps_per_second": 2.583, "step": 20500 }, { "epoch": 2.1148502624266747, "grad_norm": 7.367280006408691, "learning_rate": 1.578860616050273e-05, "loss": 1.3092, "step": 20550 }, { "epoch": 2.119995883503139, "grad_norm": 10.191935539245605, "learning_rate": 1.577830431647265e-05, "loss": 1.1981, "step": 20600 }, { "epoch": 2.1251415045796027, "grad_norm": 7.1885199546813965, "learning_rate": 1.5768002472442568e-05, "loss": 1.1399, "step": 20650 }, { "epoch": 2.1302871256560665, "grad_norm": 1.4416226148605347, "learning_rate": 1.5757700628412486e-05, "loss": 1.0976, "step": 20700 }, { "epoch": 2.1354327467325307, "grad_norm": 13.531189918518066, "learning_rate": 1.5747398784382405e-05, "loss": 1.1335, "step": 20750 }, { "epoch": 2.1354327467325307, "eval_accuracy": 0.9518365263938904, "eval_loss": 1.0136040449142456, "eval_runtime": 418.6163, "eval_samples_per_second": 82.531, "eval_steps_per_second": 2.58, "step": 20750 }, { "epoch": 2.1405783678089945, "grad_norm": 0.48753559589385986, "learning_rate": 1.5737096940352323e-05, "loss": 1.1567, "step": 20800 }, { "epoch": 2.1457239888854587, "grad_norm": 1.1756892204284668, "learning_rate": 1.5726795096322242e-05, "loss": 1.2536, "step": 20850 }, { "epoch": 2.1508696099619224, "grad_norm": 13.005239486694336, "learning_rate": 1.5716493252292164e-05, "loss": 1.1717, "step": 20900 }, { "epoch": 2.156015231038386, "grad_norm": 13.348917961120605, "learning_rate": 1.570619140826208e-05, "loss": 1.1433, "step": 20950 }, { "epoch": 2.1611608521148504, "grad_norm": 4.952757835388184, "learning_rate": 1.5695889564231997e-05, "loss": 1.1885, "step": 21000 }, { "epoch": 2.1611608521148504, "eval_accuracy": 0.951170802116394, "eval_loss": 1.0185319185256958, "eval_runtime": 418.1975, "eval_samples_per_second": 82.614, "eval_steps_per_second": 2.583, "step": 21000 }, { "epoch": 2.166306473191314, "grad_norm": 10.125651359558105, "learning_rate": 1.568558772020192e-05, "loss": 1.0543, "step": 21050 }, { "epoch": 2.171452094267778, "grad_norm": 5.2072062492370605, "learning_rate": 1.5675285876171834e-05, "loss": 1.1122, "step": 21100 }, { "epoch": 2.176597715344242, "grad_norm": 3.4542808532714844, "learning_rate": 1.5664984032141756e-05, "loss": 1.17, "step": 21150 }, { "epoch": 2.181743336420706, "grad_norm": 1.4935418367385864, "learning_rate": 1.5654682188111674e-05, "loss": 1.0757, "step": 21200 }, { "epoch": 2.1868889574971697, "grad_norm": 2.735926389694214, "learning_rate": 1.5644380344081593e-05, "loss": 1.3008, "step": 21250 }, { "epoch": 2.1868889574971697, "eval_accuracy": 0.9506497979164124, "eval_loss": 1.016100287437439, "eval_runtime": 417.7311, "eval_samples_per_second": 82.706, "eval_steps_per_second": 2.585, "step": 21250 }, { "epoch": 2.192034578573634, "grad_norm": 11.821269989013672, "learning_rate": 1.563407850005151e-05, "loss": 1.1723, "step": 21300 }, { "epoch": 2.1971801996500977, "grad_norm": 1.3524460792541504, "learning_rate": 1.562377665602143e-05, "loss": 1.2517, "step": 21350 }, { "epoch": 2.2023258207265615, "grad_norm": 6.308670520782471, "learning_rate": 1.5613474811991348e-05, "loss": 1.1834, "step": 21400 }, { "epoch": 2.2074714418030257, "grad_norm": 10.960680961608887, "learning_rate": 1.5603172967961266e-05, "loss": 1.1284, "step": 21450 }, { "epoch": 2.2126170628794894, "grad_norm": 8.426880836486816, "learning_rate": 1.5592871123931185e-05, "loss": 1.28, "step": 21500 }, { "epoch": 2.2126170628794894, "eval_accuracy": 0.9507076740264893, "eval_loss": 1.0217114686965942, "eval_runtime": 418.3067, "eval_samples_per_second": 82.593, "eval_steps_per_second": 2.582, "step": 21500 }, { "epoch": 2.2177626839559537, "grad_norm": 3.2604434490203857, "learning_rate": 1.5582569279901103e-05, "loss": 1.2478, "step": 21550 }, { "epoch": 2.2229083050324174, "grad_norm": 1.98189115524292, "learning_rate": 1.557226743587102e-05, "loss": 1.1798, "step": 21600 }, { "epoch": 2.228053926108881, "grad_norm": 4.054050445556641, "learning_rate": 1.5562171628721544e-05, "loss": 1.1218, "step": 21650 }, { "epoch": 2.2331995471853454, "grad_norm": 10.367090225219727, "learning_rate": 1.555186978469146e-05, "loss": 1.2787, "step": 21700 }, { "epoch": 2.238345168261809, "grad_norm": 12.966401100158691, "learning_rate": 1.554156794066138e-05, "loss": 1.1254, "step": 21750 }, { "epoch": 2.238345168261809, "eval_accuracy": 0.9507656097412109, "eval_loss": 1.0311578512191772, "eval_runtime": 417.8078, "eval_samples_per_second": 82.691, "eval_steps_per_second": 2.585, "step": 21750 }, { "epoch": 2.243490789338273, "grad_norm": 10.402215957641602, "learning_rate": 1.55312660966313e-05, "loss": 1.2375, "step": 21800 }, { "epoch": 2.248636410414737, "grad_norm": 15.226551055908203, "learning_rate": 1.5520964252601218e-05, "loss": 1.1074, "step": 21850 }, { "epoch": 2.253782031491201, "grad_norm": 7.523915767669678, "learning_rate": 1.5510662408571136e-05, "loss": 1.0927, "step": 21900 }, { "epoch": 2.2589276525676647, "grad_norm": 8.177473068237305, "learning_rate": 1.5500360564541055e-05, "loss": 1.1691, "step": 21950 }, { "epoch": 2.264073273644129, "grad_norm": 4.812458038330078, "learning_rate": 1.5490058720510973e-05, "loss": 1.1703, "step": 22000 }, { "epoch": 2.264073273644129, "eval_accuracy": 0.9499261975288391, "eval_loss": 1.0275415182113647, "eval_runtime": 417.8493, "eval_samples_per_second": 82.683, "eval_steps_per_second": 2.585, "step": 22000 }, { "epoch": 2.2692188947205927, "grad_norm": 5.703485012054443, "learning_rate": 1.547975687648089e-05, "loss": 1.2158, "step": 22050 }, { "epoch": 2.274364515797057, "grad_norm": 10.64054012298584, "learning_rate": 1.546945503245081e-05, "loss": 1.1026, "step": 22100 }, { "epoch": 2.2795101368735207, "grad_norm": 0.8261292576789856, "learning_rate": 1.5459153188420728e-05, "loss": 1.0644, "step": 22150 }, { "epoch": 2.2846557579499844, "grad_norm": 5.98064661026001, "learning_rate": 1.5448851344390647e-05, "loss": 1.1092, "step": 22200 }, { "epoch": 2.2898013790264486, "grad_norm": 10.404533386230469, "learning_rate": 1.543854950036057e-05, "loss": 1.1686, "step": 22250 }, { "epoch": 2.2898013790264486, "eval_accuracy": 0.9511997699737549, "eval_loss": 1.0342940092086792, "eval_runtime": 417.6196, "eval_samples_per_second": 82.728, "eval_steps_per_second": 2.586, "step": 22250 }, { "epoch": 2.2949470001029124, "grad_norm": 3.5781595706939697, "learning_rate": 1.5428247656330483e-05, "loss": 1.2711, "step": 22300 }, { "epoch": 2.300092621179376, "grad_norm": 5.956209182739258, "learning_rate": 1.5417945812300402e-05, "loss": 1.2942, "step": 22350 }, { "epoch": 2.3052382422558404, "grad_norm": 0.08046738803386688, "learning_rate": 1.5407643968270324e-05, "loss": 1.2073, "step": 22400 }, { "epoch": 2.310383863332304, "grad_norm": 6.365548610687256, "learning_rate": 1.539734212424024e-05, "loss": 1.2131, "step": 22450 }, { "epoch": 2.315529484408768, "grad_norm": 6.6707258224487305, "learning_rate": 1.538704028021016e-05, "loss": 1.1445, "step": 22500 }, { "epoch": 2.315529484408768, "eval_accuracy": 0.9516628384590149, "eval_loss": 1.0127946138381958, "eval_runtime": 417.7487, "eval_samples_per_second": 82.703, "eval_steps_per_second": 2.585, "step": 22500 }, { "epoch": 2.320675105485232, "grad_norm": 0.25063377618789673, "learning_rate": 1.5376738436180076e-05, "loss": 1.1553, "step": 22550 }, { "epoch": 2.325820726561696, "grad_norm": 8.28696060180664, "learning_rate": 1.5366436592149997e-05, "loss": 1.1512, "step": 22600 }, { "epoch": 2.33096634763816, "grad_norm": 1.361279845237732, "learning_rate": 1.5356134748119916e-05, "loss": 1.2069, "step": 22650 }, { "epoch": 2.336111968714624, "grad_norm": 1.9882014989852905, "learning_rate": 1.534583290408983e-05, "loss": 1.1345, "step": 22700 }, { "epoch": 2.3412575897910877, "grad_norm": 4.8411865234375, "learning_rate": 1.5335531060059753e-05, "loss": 1.1681, "step": 22750 }, { "epoch": 2.3412575897910877, "eval_accuracy": 0.9508813619613647, "eval_loss": 1.0100795030593872, "eval_runtime": 417.4967, "eval_samples_per_second": 82.753, "eval_steps_per_second": 2.587, "step": 22750 }, { "epoch": 2.346403210867552, "grad_norm": 6.883627414703369, "learning_rate": 1.532522921602967e-05, "loss": 1.1372, "step": 22800 }, { "epoch": 2.3515488319440156, "grad_norm": 9.81013298034668, "learning_rate": 1.531492737199959e-05, "loss": 1.1393, "step": 22850 }, { "epoch": 2.3566944530204794, "grad_norm": 7.514392852783203, "learning_rate": 1.5304625527969508e-05, "loss": 1.1327, "step": 22900 }, { "epoch": 2.3618400740969436, "grad_norm": 2.8904621601104736, "learning_rate": 1.5294323683939426e-05, "loss": 1.0903, "step": 22950 }, { "epoch": 2.3669856951734074, "grad_norm": 7.99860954284668, "learning_rate": 1.5284021839909345e-05, "loss": 1.1354, "step": 23000 }, { "epoch": 2.3669856951734074, "eval_accuracy": 0.9513444900512695, "eval_loss": 1.0172919034957886, "eval_runtime": 418.1914, "eval_samples_per_second": 82.615, "eval_steps_per_second": 2.583, "step": 23000 }, { "epoch": 2.372131316249871, "grad_norm": 3.2717432975769043, "learning_rate": 1.5273719995879263e-05, "loss": 1.2517, "step": 23050 }, { "epoch": 2.3772769373263354, "grad_norm": 4.3879594802856445, "learning_rate": 1.526341815184918e-05, "loss": 1.0634, "step": 23100 }, { "epoch": 2.382422558402799, "grad_norm": 8.286825180053711, "learning_rate": 1.52531163078191e-05, "loss": 1.2095, "step": 23150 }, { "epoch": 2.3875681794792634, "grad_norm": 2.0084967613220215, "learning_rate": 1.524281446378902e-05, "loss": 1.1686, "step": 23200 }, { "epoch": 2.392713800555727, "grad_norm": 5.974940776824951, "learning_rate": 1.5232512619758939e-05, "loss": 1.1063, "step": 23250 }, { "epoch": 2.392713800555727, "eval_accuracy": 0.9516628384590149, "eval_loss": 1.0242797136306763, "eval_runtime": 417.7754, "eval_samples_per_second": 82.698, "eval_steps_per_second": 2.585, "step": 23250 }, { "epoch": 2.397859421632191, "grad_norm": 7.380057334899902, "learning_rate": 1.5222210775728857e-05, "loss": 1.1309, "step": 23300 }, { "epoch": 2.403005042708655, "grad_norm": 1.6446843147277832, "learning_rate": 1.5211908931698775e-05, "loss": 1.1869, "step": 23350 }, { "epoch": 2.408150663785119, "grad_norm": 5.716843605041504, "learning_rate": 1.5201607087668696e-05, "loss": 1.1743, "step": 23400 }, { "epoch": 2.4132962848615827, "grad_norm": 2.141338586807251, "learning_rate": 1.5191305243638612e-05, "loss": 1.1001, "step": 23450 }, { "epoch": 2.418441905938047, "grad_norm": 1.5462799072265625, "learning_rate": 1.5181003399608532e-05, "loss": 1.1696, "step": 23500 }, { "epoch": 2.418441905938047, "eval_accuracy": 0.9524732828140259, "eval_loss": 1.0314745903015137, "eval_runtime": 418.5814, "eval_samples_per_second": 82.538, "eval_steps_per_second": 2.58, "step": 23500 }, { "epoch": 2.4235875270145106, "grad_norm": 10.894279479980469, "learning_rate": 1.5170701555578449e-05, "loss": 1.1493, "step": 23550 }, { "epoch": 2.4287331480909744, "grad_norm": 10.256954193115234, "learning_rate": 1.5160399711548367e-05, "loss": 1.1486, "step": 23600 }, { "epoch": 2.4338787691674386, "grad_norm": 14.089293479919434, "learning_rate": 1.5150097867518288e-05, "loss": 1.2302, "step": 23650 }, { "epoch": 2.4390243902439024, "grad_norm": 7.1174492835998535, "learning_rate": 1.5139796023488204e-05, "loss": 1.1427, "step": 23700 }, { "epoch": 2.4441700113203666, "grad_norm": 1.6686251163482666, "learning_rate": 1.5129494179458124e-05, "loss": 1.2123, "step": 23750 }, { "epoch": 2.4441700113203666, "eval_accuracy": 0.9509971141815186, "eval_loss": 1.0296884775161743, "eval_runtime": 418.2348, "eval_samples_per_second": 82.607, "eval_steps_per_second": 2.582, "step": 23750 }, { "epoch": 2.4493156323968304, "grad_norm": 0.8798663020133972, "learning_rate": 1.5119192335428043e-05, "loss": 1.1169, "step": 23800 }, { "epoch": 2.454461253473294, "grad_norm": 2.511453151702881, "learning_rate": 1.5108890491397961e-05, "loss": 1.1688, "step": 23850 }, { "epoch": 2.4596068745497583, "grad_norm": 8.896649360656738, "learning_rate": 1.509858864736788e-05, "loss": 1.0506, "step": 23900 }, { "epoch": 2.464752495626222, "grad_norm": 12.617236137390137, "learning_rate": 1.50882868033378e-05, "loss": 1.1965, "step": 23950 }, { "epoch": 2.469898116702686, "grad_norm": 14.843036651611328, "learning_rate": 1.5077984959307717e-05, "loss": 1.1253, "step": 24000 }, { "epoch": 2.469898116702686, "eval_accuracy": 0.9508234858512878, "eval_loss": 1.0238152742385864, "eval_runtime": 418.2418, "eval_samples_per_second": 82.605, "eval_steps_per_second": 2.582, "step": 24000 }, { "epoch": 2.47504373777915, "grad_norm": 13.253664016723633, "learning_rate": 1.5067683115277637e-05, "loss": 1.1957, "step": 24050 }, { "epoch": 2.480189358855614, "grad_norm": 4.080730438232422, "learning_rate": 1.5057381271247555e-05, "loss": 1.1395, "step": 24100 }, { "epoch": 2.4853349799320776, "grad_norm": 1.9019577503204346, "learning_rate": 1.5047079427217472e-05, "loss": 1.1238, "step": 24150 }, { "epoch": 2.490480601008542, "grad_norm": 10.57223129272461, "learning_rate": 1.5036983620067993e-05, "loss": 1.1342, "step": 24200 }, { "epoch": 2.4956262220850056, "grad_norm": 11.598908424377441, "learning_rate": 1.5026681776037913e-05, "loss": 1.1703, "step": 24250 }, { "epoch": 2.4956262220850056, "eval_accuracy": 0.9505629539489746, "eval_loss": 1.0218814611434937, "eval_runtime": 418.3594, "eval_samples_per_second": 82.582, "eval_steps_per_second": 2.582, "step": 24250 }, { "epoch": 2.50077184316147, "grad_norm": 9.800350189208984, "learning_rate": 1.501637993200783e-05, "loss": 1.0947, "step": 24300 }, { "epoch": 2.5059174642379336, "grad_norm": 3.784536361694336, "learning_rate": 1.500607808797775e-05, "loss": 1.1281, "step": 24350 }, { "epoch": 2.5110630853143974, "grad_norm": 2.499333620071411, "learning_rate": 1.4995776243947668e-05, "loss": 1.1029, "step": 24400 }, { "epoch": 2.516208706390861, "grad_norm": 9.453124046325684, "learning_rate": 1.4985474399917585e-05, "loss": 1.1784, "step": 24450 }, { "epoch": 2.5213543274673254, "grad_norm": 2.7490689754486084, "learning_rate": 1.4975172555887505e-05, "loss": 1.101, "step": 24500 }, { "epoch": 2.5213543274673254, "eval_accuracy": 0.9527627229690552, "eval_loss": 1.0266767740249634, "eval_runtime": 417.6079, "eval_samples_per_second": 82.731, "eval_steps_per_second": 2.586, "step": 24500 }, { "epoch": 2.526499948543789, "grad_norm": 8.228843688964844, "learning_rate": 1.4964870711857425e-05, "loss": 1.1231, "step": 24550 }, { "epoch": 2.5316455696202533, "grad_norm": 8.344508171081543, "learning_rate": 1.4954568867827342e-05, "loss": 1.1364, "step": 24600 }, { "epoch": 2.536791190696717, "grad_norm": 2.6875457763671875, "learning_rate": 1.494426702379726e-05, "loss": 1.1778, "step": 24650 }, { "epoch": 2.541936811773181, "grad_norm": 6.898427486419678, "learning_rate": 1.493396517976718e-05, "loss": 1.1089, "step": 24700 }, { "epoch": 2.547082432849645, "grad_norm": 3.228970766067505, "learning_rate": 1.4923663335737097e-05, "loss": 1.1626, "step": 24750 }, { "epoch": 2.547082432849645, "eval_accuracy": 0.9508234858512878, "eval_loss": 1.0254093408584595, "eval_runtime": 417.4897, "eval_samples_per_second": 82.754, "eval_steps_per_second": 2.587, "step": 24750 }, { "epoch": 2.552228053926109, "grad_norm": 4.928084850311279, "learning_rate": 1.4913361491707017e-05, "loss": 1.2019, "step": 24800 }, { "epoch": 2.557373675002573, "grad_norm": 15.422240257263184, "learning_rate": 1.4903059647676936e-05, "loss": 1.1503, "step": 24850 }, { "epoch": 2.562519296079037, "grad_norm": 11.451377868652344, "learning_rate": 1.4892757803646854e-05, "loss": 1.1697, "step": 24900 }, { "epoch": 2.5676649171555006, "grad_norm": 7.738549709320068, "learning_rate": 1.4882455959616772e-05, "loss": 1.0921, "step": 24950 }, { "epoch": 2.5728105382319644, "grad_norm": 2.7136483192443848, "learning_rate": 1.4872154115586692e-05, "loss": 1.3136, "step": 25000 }, { "epoch": 2.5728105382319644, "eval_accuracy": 0.9512865543365479, "eval_loss": 1.0222209692001343, "eval_runtime": 419.1601, "eval_samples_per_second": 82.424, "eval_steps_per_second": 2.577, "step": 25000 } ], "logging_steps": 50, "max_steps": 97170, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }