{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 250, "global_step": 22737, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006597176408497163, "grad_norm": 25.778600692749023, "learning_rate": 9.600000000000001e-06, "loss": 3.9255, "step": 50 }, { "epoch": 0.013194352816994326, "grad_norm": 15.797815322875977, "learning_rate": 1.9600000000000002e-05, "loss": 1.7945, "step": 100 }, { "epoch": 0.01979152922549149, "grad_norm": 16.71779441833496, "learning_rate": 1.9957591553651104e-05, "loss": 1.5522, "step": 150 }, { "epoch": 0.02638870563398865, "grad_norm": 14.723907470703125, "learning_rate": 1.9913416088704335e-05, "loss": 1.6267, "step": 200 }, { "epoch": 0.032985882042485815, "grad_norm": 14.432233810424805, "learning_rate": 1.9869240623757565e-05, "loss": 1.5434, "step": 250 }, { "epoch": 0.032985882042485815, "eval_accuracy": 0.8714285492897034, "eval_loss": 0.8546671867370605, "eval_runtime": 14.7911, "eval_samples_per_second": 165.64, "eval_steps_per_second": 5.206, "step": 250 }, { "epoch": 0.03958305845098298, "grad_norm": 12.044981002807617, "learning_rate": 1.98250651588108e-05, "loss": 1.3113, "step": 300 }, { "epoch": 0.04618023485948014, "grad_norm": 15.790678024291992, "learning_rate": 1.978088969386403e-05, "loss": 1.3674, "step": 350 }, { "epoch": 0.0527774112679773, "grad_norm": 14.051512718200684, "learning_rate": 1.9736714228917263e-05, "loss": 1.3417, "step": 400 }, { "epoch": 0.059374587676474466, "grad_norm": 16.866775512695312, "learning_rate": 1.9692538763970493e-05, "loss": 1.2831, "step": 450 }, { "epoch": 0.06597176408497163, "grad_norm": 15.336055755615234, "learning_rate": 1.9648363299023723e-05, "loss": 1.2243, "step": 500 }, { "epoch": 0.06597176408497163, "eval_accuracy": 0.882040798664093, "eval_loss": 0.7872514724731445, "eval_runtime": 14.0019, "eval_samples_per_second": 174.976, "eval_steps_per_second": 5.499, "step": 500 }, { "epoch": 0.07256894049346879, "grad_norm": 21.990840911865234, "learning_rate": 1.9604187834076954e-05, "loss": 1.2276, "step": 550 }, { "epoch": 0.07916611690196595, "grad_norm": 14.601304054260254, "learning_rate": 1.9560012369130184e-05, "loss": 1.2502, "step": 600 }, { "epoch": 0.08576329331046312, "grad_norm": 10.78171157836914, "learning_rate": 1.9516720413482352e-05, "loss": 1.2247, "step": 650 }, { "epoch": 0.09236046971896028, "grad_norm": 14.99619197845459, "learning_rate": 1.9472544948535586e-05, "loss": 1.178, "step": 700 }, { "epoch": 0.09895764612745744, "grad_norm": 11.09481430053711, "learning_rate": 1.9428369483588816e-05, "loss": 1.2379, "step": 750 }, { "epoch": 0.09895764612745744, "eval_accuracy": 0.8930612206459045, "eval_loss": 0.7503395080566406, "eval_runtime": 14.018, "eval_samples_per_second": 174.775, "eval_steps_per_second": 5.493, "step": 750 }, { "epoch": 0.1055548225359546, "grad_norm": 12.924555778503418, "learning_rate": 1.938419401864205e-05, "loss": 1.3893, "step": 800 }, { "epoch": 0.11215199894445177, "grad_norm": 16.87848663330078, "learning_rate": 1.934001855369528e-05, "loss": 1.1852, "step": 850 }, { "epoch": 0.11874917535294893, "grad_norm": 17.876659393310547, "learning_rate": 1.929584308874851e-05, "loss": 1.1082, "step": 900 }, { "epoch": 0.1253463517614461, "grad_norm": 14.923641204833984, "learning_rate": 1.925166762380174e-05, "loss": 0.9946, "step": 950 }, { "epoch": 0.13194352816994326, "grad_norm": 20.28868865966797, "learning_rate": 1.9207492158854975e-05, "loss": 1.1834, "step": 1000 }, { "epoch": 0.13194352816994326, "eval_accuracy": 0.899591863155365, "eval_loss": 0.7308884859085083, "eval_runtime": 14.203, "eval_samples_per_second": 172.499, "eval_steps_per_second": 5.421, "step": 1000 }, { "epoch": 0.13854070457844042, "grad_norm": 13.742298126220703, "learning_rate": 1.9163316693908205e-05, "loss": 1.1556, "step": 1050 }, { "epoch": 0.14513788098693758, "grad_norm": 14.966007232666016, "learning_rate": 1.9119141228961435e-05, "loss": 1.0251, "step": 1100 }, { "epoch": 0.15173505739543475, "grad_norm": 16.00642967224121, "learning_rate": 1.907496576401467e-05, "loss": 1.1943, "step": 1150 }, { "epoch": 0.1583322338039319, "grad_norm": 13.92847728729248, "learning_rate": 1.90307902990679e-05, "loss": 1.086, "step": 1200 }, { "epoch": 0.16492941021242907, "grad_norm": 16.767595291137695, "learning_rate": 1.898661483412113e-05, "loss": 1.1236, "step": 1250 }, { "epoch": 0.16492941021242907, "eval_accuracy": 0.9008163213729858, "eval_loss": 0.6945549845695496, "eval_runtime": 14.235, "eval_samples_per_second": 172.11, "eval_steps_per_second": 5.409, "step": 1250 }, { "epoch": 0.17152658662092624, "grad_norm": 13.734739303588867, "learning_rate": 1.894243936917436e-05, "loss": 1.0485, "step": 1300 }, { "epoch": 0.1781237630294234, "grad_norm": 8.18282699584961, "learning_rate": 1.8898263904227594e-05, "loss": 0.9481, "step": 1350 }, { "epoch": 0.18472093943792056, "grad_norm": 13.874724388122559, "learning_rate": 1.8854088439280824e-05, "loss": 1.0898, "step": 1400 }, { "epoch": 0.19131811584641772, "grad_norm": 15.291620254516602, "learning_rate": 1.8809912974334058e-05, "loss": 1.0863, "step": 1450 }, { "epoch": 0.1979152922549149, "grad_norm": 21.6629581451416, "learning_rate": 1.8765737509387288e-05, "loss": 1.0756, "step": 1500 }, { "epoch": 0.1979152922549149, "eval_accuracy": 0.9036734700202942, "eval_loss": 0.6746897101402283, "eval_runtime": 14.4342, "eval_samples_per_second": 169.736, "eval_steps_per_second": 5.335, "step": 1500 }, { "epoch": 0.20451246866341205, "grad_norm": 14.420069694519043, "learning_rate": 1.872156204444052e-05, "loss": 0.9973, "step": 1550 }, { "epoch": 0.2111096450719092, "grad_norm": 7.541851043701172, "learning_rate": 1.867738657949375e-05, "loss": 1.1098, "step": 1600 }, { "epoch": 0.21770682148040638, "grad_norm": 15.377376556396484, "learning_rate": 1.8633211114546983e-05, "loss": 1.1745, "step": 1650 }, { "epoch": 0.22430399788890354, "grad_norm": 10.262870788574219, "learning_rate": 1.8589035649600213e-05, "loss": 0.9654, "step": 1700 }, { "epoch": 0.2309011742974007, "grad_norm": 11.869269371032715, "learning_rate": 1.8544860184653447e-05, "loss": 1.0919, "step": 1750 }, { "epoch": 0.2309011742974007, "eval_accuracy": 0.9093877673149109, "eval_loss": 0.649857223033905, "eval_runtime": 14.4377, "eval_samples_per_second": 169.695, "eval_steps_per_second": 5.333, "step": 1750 }, { "epoch": 0.23749835070589786, "grad_norm": 12.47613525390625, "learning_rate": 1.8500684719706677e-05, "loss": 1.0249, "step": 1800 }, { "epoch": 0.24409552711439503, "grad_norm": 11.694280624389648, "learning_rate": 1.8456509254759907e-05, "loss": 0.9863, "step": 1850 }, { "epoch": 0.2506927035228922, "grad_norm": 6.96587610244751, "learning_rate": 1.841233378981314e-05, "loss": 1.1091, "step": 1900 }, { "epoch": 0.25728987993138935, "grad_norm": 16.962194442749023, "learning_rate": 1.836815832486637e-05, "loss": 1.0989, "step": 1950 }, { "epoch": 0.2638870563398865, "grad_norm": 16.43683433532715, "learning_rate": 1.83239828599196e-05, "loss": 1.0662, "step": 2000 }, { "epoch": 0.2638870563398865, "eval_accuracy": 0.9065306186676025, "eval_loss": 0.6661304235458374, "eval_runtime": 14.2056, "eval_samples_per_second": 172.467, "eval_steps_per_second": 5.42, "step": 2000 }, { "epoch": 0.2704842327483837, "grad_norm": 12.090469360351562, "learning_rate": 1.8279807394972832e-05, "loss": 1.0456, "step": 2050 }, { "epoch": 0.27708140915688084, "grad_norm": 14.27798843383789, "learning_rate": 1.8235631930026066e-05, "loss": 1.1349, "step": 2100 }, { "epoch": 0.283678585565378, "grad_norm": 14.521726608276367, "learning_rate": 1.8191456465079296e-05, "loss": 1.0111, "step": 2150 }, { "epoch": 0.29027576197387517, "grad_norm": 9.772090911865234, "learning_rate": 1.814728100013253e-05, "loss": 1.026, "step": 2200 }, { "epoch": 0.29687293838237233, "grad_norm": 15.107865333557129, "learning_rate": 1.810310553518576e-05, "loss": 0.9415, "step": 2250 }, { "epoch": 0.29687293838237233, "eval_accuracy": 0.9073469638824463, "eval_loss": 0.6389794945716858, "eval_runtime": 14.5287, "eval_samples_per_second": 168.631, "eval_steps_per_second": 5.3, "step": 2250 }, { "epoch": 0.3034701147908695, "grad_norm": 15.88947582244873, "learning_rate": 1.805893007023899e-05, "loss": 0.9761, "step": 2300 }, { "epoch": 0.31006729119936666, "grad_norm": 13.472917556762695, "learning_rate": 1.801475460529222e-05, "loss": 0.9748, "step": 2350 }, { "epoch": 0.3166644676078638, "grad_norm": 14.00285530090332, "learning_rate": 1.7970579140345454e-05, "loss": 1.0238, "step": 2400 }, { "epoch": 0.323261644016361, "grad_norm": 15.622306823730469, "learning_rate": 1.7926403675398685e-05, "loss": 1.0456, "step": 2450 }, { "epoch": 0.32985882042485815, "grad_norm": 12.947722434997559, "learning_rate": 1.788222821045192e-05, "loss": 0.9895, "step": 2500 }, { "epoch": 0.32985882042485815, "eval_accuracy": 0.9110203981399536, "eval_loss": 0.6434822678565979, "eval_runtime": 14.6459, "eval_samples_per_second": 167.283, "eval_steps_per_second": 5.257, "step": 2500 }, { "epoch": 0.3364559968333553, "grad_norm": 16.64254379272461, "learning_rate": 1.783805274550515e-05, "loss": 0.8796, "step": 2550 }, { "epoch": 0.34305317324185247, "grad_norm": 12.790375709533691, "learning_rate": 1.779387728055838e-05, "loss": 1.0172, "step": 2600 }, { "epoch": 0.34965034965034963, "grad_norm": 13.025754928588867, "learning_rate": 1.774970181561161e-05, "loss": 1.014, "step": 2650 }, { "epoch": 0.3562475260588468, "grad_norm": 13.217813491821289, "learning_rate": 1.770552635066484e-05, "loss": 0.9748, "step": 2700 }, { "epoch": 0.36284470246734396, "grad_norm": 13.908524513244629, "learning_rate": 1.7661350885718073e-05, "loss": 0.9273, "step": 2750 }, { "epoch": 0.36284470246734396, "eval_accuracy": 0.9081632494926453, "eval_loss": 0.6303015947341919, "eval_runtime": 14.1305, "eval_samples_per_second": 173.384, "eval_steps_per_second": 5.449, "step": 2750 }, { "epoch": 0.3694418788758411, "grad_norm": 12.053607940673828, "learning_rate": 1.7617175420771304e-05, "loss": 1.0122, "step": 2800 }, { "epoch": 0.3760390552843383, "grad_norm": 13.809615135192871, "learning_rate": 1.7572999955824538e-05, "loss": 1.0054, "step": 2850 }, { "epoch": 0.38263623169283545, "grad_norm": 14.718282699584961, "learning_rate": 1.7528824490877768e-05, "loss": 0.8974, "step": 2900 }, { "epoch": 0.3892334081013326, "grad_norm": 16.11876678466797, "learning_rate": 1.7484649025930998e-05, "loss": 0.9396, "step": 2950 }, { "epoch": 0.3958305845098298, "grad_norm": 9.439668655395508, "learning_rate": 1.744047356098423e-05, "loss": 0.8734, "step": 3000 }, { "epoch": 0.3958305845098298, "eval_accuracy": 0.9048979878425598, "eval_loss": 0.6237688064575195, "eval_runtime": 14.1809, "eval_samples_per_second": 172.767, "eval_steps_per_second": 5.43, "step": 3000 }, { "epoch": 0.40242776091832694, "grad_norm": 10.298930168151855, "learning_rate": 1.7396298096037462e-05, "loss": 1.0048, "step": 3050 }, { "epoch": 0.4090249373268241, "grad_norm": 7.693696975708008, "learning_rate": 1.7352122631090693e-05, "loss": 0.9701, "step": 3100 }, { "epoch": 0.41562211373532126, "grad_norm": 16.300338745117188, "learning_rate": 1.7307947166143926e-05, "loss": 0.9924, "step": 3150 }, { "epoch": 0.4222192901438184, "grad_norm": 10.341270446777344, "learning_rate": 1.7263771701197157e-05, "loss": 0.9349, "step": 3200 }, { "epoch": 0.4288164665523156, "grad_norm": 14.08645248413086, "learning_rate": 1.721959623625039e-05, "loss": 0.974, "step": 3250 }, { "epoch": 0.4288164665523156, "eval_accuracy": 0.9118367433547974, "eval_loss": 0.6216471791267395, "eval_runtime": 14.3141, "eval_samples_per_second": 171.16, "eval_steps_per_second": 5.379, "step": 3250 }, { "epoch": 0.43541364296081275, "grad_norm": 20.50489044189453, "learning_rate": 1.717542077130362e-05, "loss": 1.0539, "step": 3300 }, { "epoch": 0.4420108193693099, "grad_norm": 17.268712997436523, "learning_rate": 1.713124530635685e-05, "loss": 0.9389, "step": 3350 }, { "epoch": 0.4486079957778071, "grad_norm": 12.712272644042969, "learning_rate": 1.708706984141008e-05, "loss": 0.9171, "step": 3400 }, { "epoch": 0.45520517218630424, "grad_norm": 12.377297401428223, "learning_rate": 1.704289437646331e-05, "loss": 0.9706, "step": 3450 }, { "epoch": 0.4618023485948014, "grad_norm": 18.502830505371094, "learning_rate": 1.6998718911516545e-05, "loss": 1.0124, "step": 3500 }, { "epoch": 0.4618023485948014, "eval_accuracy": 0.9065306186676025, "eval_loss": 0.6126046180725098, "eval_runtime": 14.1583, "eval_samples_per_second": 173.043, "eval_steps_per_second": 5.438, "step": 3500 }, { "epoch": 0.46839952500329857, "grad_norm": 17.543399810791016, "learning_rate": 1.6954543446569776e-05, "loss": 0.9215, "step": 3550 }, { "epoch": 0.47499670141179573, "grad_norm": 15.049899101257324, "learning_rate": 1.691036798162301e-05, "loss": 0.8563, "step": 3600 }, { "epoch": 0.4815938778202929, "grad_norm": 14.00575065612793, "learning_rate": 1.686619251667624e-05, "loss": 0.8249, "step": 3650 }, { "epoch": 0.48819105422879006, "grad_norm": 19.295759201049805, "learning_rate": 1.682201705172947e-05, "loss": 0.8794, "step": 3700 }, { "epoch": 0.4947882306372872, "grad_norm": 14.837241172790527, "learning_rate": 1.67778415867827e-05, "loss": 1.0013, "step": 3750 }, { "epoch": 0.4947882306372872, "eval_accuracy": 0.9077550768852234, "eval_loss": 0.6021705865859985, "eval_runtime": 14.1781, "eval_samples_per_second": 172.802, "eval_steps_per_second": 5.431, "step": 3750 }, { "epoch": 0.5013854070457844, "grad_norm": 12.423500061035156, "learning_rate": 1.6733666121835934e-05, "loss": 0.922, "step": 3800 }, { "epoch": 0.5079825834542816, "grad_norm": 13.505254745483398, "learning_rate": 1.6689490656889164e-05, "loss": 0.9168, "step": 3850 }, { "epoch": 0.5145797598627787, "grad_norm": 12.56449031829834, "learning_rate": 1.6645315191942398e-05, "loss": 0.9315, "step": 3900 }, { "epoch": 0.5211769362712759, "grad_norm": 11.239628791809082, "learning_rate": 1.660113972699563e-05, "loss": 0.9265, "step": 3950 }, { "epoch": 0.527774112679773, "grad_norm": 9.262091636657715, "learning_rate": 1.655696426204886e-05, "loss": 0.9453, "step": 4000 }, { "epoch": 0.527774112679773, "eval_accuracy": 0.9077550768852234, "eval_loss": 0.6083095669746399, "eval_runtime": 14.2575, "eval_samples_per_second": 171.839, "eval_steps_per_second": 5.401, "step": 4000 }, { "epoch": 0.5343712890882703, "grad_norm": 11.317748069763184, "learning_rate": 1.651278879710209e-05, "loss": 0.9585, "step": 4050 }, { "epoch": 0.5409684654967674, "grad_norm": 13.768712997436523, "learning_rate": 1.6468613332155323e-05, "loss": 0.9886, "step": 4100 }, { "epoch": 0.5475656419052646, "grad_norm": 11.504364967346191, "learning_rate": 1.6424437867208553e-05, "loss": 0.9081, "step": 4150 }, { "epoch": 0.5541628183137617, "grad_norm": 16.876300811767578, "learning_rate": 1.6380262402261787e-05, "loss": 0.8181, "step": 4200 }, { "epoch": 0.5607599947222589, "grad_norm": 15.651288986206055, "learning_rate": 1.6336086937315017e-05, "loss": 0.8806, "step": 4250 }, { "epoch": 0.5607599947222589, "eval_accuracy": 0.9118367433547974, "eval_loss": 0.5917608141899109, "eval_runtime": 14.7158, "eval_samples_per_second": 166.487, "eval_steps_per_second": 5.232, "step": 4250 }, { "epoch": 0.567357171130756, "grad_norm": 11.500801086425781, "learning_rate": 1.6291911472368248e-05, "loss": 0.858, "step": 4300 }, { "epoch": 0.5739543475392532, "grad_norm": 10.485420227050781, "learning_rate": 1.624773600742148e-05, "loss": 0.8781, "step": 4350 }, { "epoch": 0.5805515239477503, "grad_norm": 8.773555755615234, "learning_rate": 1.620356054247471e-05, "loss": 0.9059, "step": 4400 }, { "epoch": 0.5871487003562476, "grad_norm": 12.097881317138672, "learning_rate": 1.6159385077527942e-05, "loss": 0.8475, "step": 4450 }, { "epoch": 0.5937458767647447, "grad_norm": 9.051371574401855, "learning_rate": 1.6115209612581172e-05, "loss": 0.9649, "step": 4500 }, { "epoch": 0.5937458767647447, "eval_accuracy": 0.9057142734527588, "eval_loss": 0.5950626730918884, "eval_runtime": 14.9206, "eval_samples_per_second": 164.202, "eval_steps_per_second": 5.161, "step": 4500 }, { "epoch": 0.6003430531732419, "grad_norm": 15.799858093261719, "learning_rate": 1.6071034147634406e-05, "loss": 0.969, "step": 4550 }, { "epoch": 0.606940229581739, "grad_norm": 10.038565635681152, "learning_rate": 1.6026858682687636e-05, "loss": 0.8685, "step": 4600 }, { "epoch": 0.6135374059902362, "grad_norm": 14.452479362487793, "learning_rate": 1.598268321774087e-05, "loss": 0.9555, "step": 4650 }, { "epoch": 0.6201345823987333, "grad_norm": 14.48049259185791, "learning_rate": 1.59385077527941e-05, "loss": 0.9166, "step": 4700 }, { "epoch": 0.6267317588072305, "grad_norm": 10.772700309753418, "learning_rate": 1.589433228784733e-05, "loss": 0.877, "step": 4750 }, { "epoch": 0.6267317588072305, "eval_accuracy": 0.9073469638824463, "eval_loss": 0.5858258605003357, "eval_runtime": 14.792, "eval_samples_per_second": 165.63, "eval_steps_per_second": 5.206, "step": 4750 }, { "epoch": 0.6333289352157276, "grad_norm": 12.199923515319824, "learning_rate": 1.585015682290056e-05, "loss": 0.938, "step": 4800 }, { "epoch": 0.6399261116242249, "grad_norm": 11.47739315032959, "learning_rate": 1.5805981357953795e-05, "loss": 0.9211, "step": 4850 }, { "epoch": 0.646523288032722, "grad_norm": 12.546594619750977, "learning_rate": 1.5761805893007025e-05, "loss": 0.9699, "step": 4900 }, { "epoch": 0.6531204644412192, "grad_norm": 15.941895484924316, "learning_rate": 1.571763042806026e-05, "loss": 0.8818, "step": 4950 }, { "epoch": 0.6597176408497163, "grad_norm": 12.06876277923584, "learning_rate": 1.567345496311349e-05, "loss": 0.9814, "step": 5000 }, { "epoch": 0.6597176408497163, "eval_accuracy": 0.9175510406494141, "eval_loss": 0.5705481767654419, "eval_runtime": 14.3212, "eval_samples_per_second": 171.075, "eval_steps_per_second": 5.377, "step": 5000 }, { "epoch": 0.6663148172582135, "grad_norm": 11.047979354858398, "learning_rate": 1.562927949816672e-05, "loss": 0.8588, "step": 5050 }, { "epoch": 0.6729119936667106, "grad_norm": 13.39299488067627, "learning_rate": 1.558510403321995e-05, "loss": 0.8922, "step": 5100 }, { "epoch": 0.6795091700752078, "grad_norm": 11.451362609863281, "learning_rate": 1.554092856827318e-05, "loss": 1.0096, "step": 5150 }, { "epoch": 0.6861063464837049, "grad_norm": 3.436371326446533, "learning_rate": 1.5496753103326414e-05, "loss": 0.9217, "step": 5200 }, { "epoch": 0.6927035228922022, "grad_norm": 9.360651016235352, "learning_rate": 1.5452577638379644e-05, "loss": 0.9446, "step": 5250 }, { "epoch": 0.6927035228922022, "eval_accuracy": 0.9146938920021057, "eval_loss": 0.5739869475364685, "eval_runtime": 14.2053, "eval_samples_per_second": 172.471, "eval_steps_per_second": 5.421, "step": 5250 }, { "epoch": 0.6993006993006993, "grad_norm": 13.184717178344727, "learning_rate": 1.5408402173432878e-05, "loss": 0.9301, "step": 5300 }, { "epoch": 0.7058978757091965, "grad_norm": 9.54310417175293, "learning_rate": 1.5364226708486108e-05, "loss": 0.8436, "step": 5350 }, { "epoch": 0.7124950521176936, "grad_norm": 12.212594032287598, "learning_rate": 1.532005124353934e-05, "loss": 0.8547, "step": 5400 }, { "epoch": 0.7190922285261908, "grad_norm": 13.941079139709473, "learning_rate": 1.527587577859257e-05, "loss": 0.9552, "step": 5450 }, { "epoch": 0.7256894049346879, "grad_norm": 9.494156837463379, "learning_rate": 1.5232583822944737e-05, "loss": 0.9227, "step": 5500 }, { "epoch": 0.7256894049346879, "eval_accuracy": 0.9134693741798401, "eval_loss": 0.5912680625915527, "eval_runtime": 14.37, "eval_samples_per_second": 170.494, "eval_steps_per_second": 5.358, "step": 5500 }, { "epoch": 0.7322865813431851, "grad_norm": 15.922163963317871, "learning_rate": 1.518840835799797e-05, "loss": 0.8813, "step": 5550 }, { "epoch": 0.7388837577516822, "grad_norm": 5.2287068367004395, "learning_rate": 1.5144232893051201e-05, "loss": 0.8519, "step": 5600 }, { "epoch": 0.7454809341601795, "grad_norm": 13.272147178649902, "learning_rate": 1.5100057428104431e-05, "loss": 0.8223, "step": 5650 }, { "epoch": 0.7520781105686766, "grad_norm": 10.859210968017578, "learning_rate": 1.5055881963157663e-05, "loss": 0.8603, "step": 5700 }, { "epoch": 0.7586752869771738, "grad_norm": 13.087916374206543, "learning_rate": 1.5011706498210894e-05, "loss": 0.8208, "step": 5750 }, { "epoch": 0.7586752869771738, "eval_accuracy": 0.9151020646095276, "eval_loss": 0.5698295831680298, "eval_runtime": 14.3053, "eval_samples_per_second": 171.265, "eval_steps_per_second": 5.383, "step": 5750 }, { "epoch": 0.7652724633856709, "grad_norm": 11.395907402038574, "learning_rate": 1.4967531033264127e-05, "loss": 0.8542, "step": 5800 }, { "epoch": 0.7718696397941681, "grad_norm": 12.993699073791504, "learning_rate": 1.4923355568317358e-05, "loss": 0.7924, "step": 5850 }, { "epoch": 0.7784668162026652, "grad_norm": 12.30950927734375, "learning_rate": 1.487918010337059e-05, "loss": 0.9238, "step": 5900 }, { "epoch": 0.7850639926111624, "grad_norm": 14.112768173217773, "learning_rate": 1.483500463842382e-05, "loss": 0.8303, "step": 5950 }, { "epoch": 0.7916611690196595, "grad_norm": 12.148374557495117, "learning_rate": 1.4790829173477052e-05, "loss": 0.8254, "step": 6000 }, { "epoch": 0.7916611690196595, "eval_accuracy": 0.9159183502197266, "eval_loss": 0.5643152594566345, "eval_runtime": 14.2837, "eval_samples_per_second": 171.524, "eval_steps_per_second": 5.391, "step": 6000 }, { "epoch": 0.7982583454281568, "grad_norm": 22.816574096679688, "learning_rate": 1.4746653708530282e-05, "loss": 0.8556, "step": 6050 }, { "epoch": 0.8048555218366539, "grad_norm": 20.84126853942871, "learning_rate": 1.4702478243583516e-05, "loss": 0.9286, "step": 6100 }, { "epoch": 0.8114526982451511, "grad_norm": 10.403019905090332, "learning_rate": 1.4658302778636746e-05, "loss": 0.8776, "step": 6150 }, { "epoch": 0.8180498746536482, "grad_norm": 12.371121406555176, "learning_rate": 1.4614127313689978e-05, "loss": 0.8146, "step": 6200 }, { "epoch": 0.8246470510621454, "grad_norm": 8.280356407165527, "learning_rate": 1.4569951848743209e-05, "loss": 0.8469, "step": 6250 }, { "epoch": 0.8246470510621454, "eval_accuracy": 0.9126530885696411, "eval_loss": 0.5626720190048218, "eval_runtime": 14.5893, "eval_samples_per_second": 167.931, "eval_steps_per_second": 5.278, "step": 6250 }, { "epoch": 0.8312442274706425, "grad_norm": 12.503052711486816, "learning_rate": 1.452577638379644e-05, "loss": 0.9719, "step": 6300 }, { "epoch": 0.8378414038791397, "grad_norm": 8.480955123901367, "learning_rate": 1.4481600918849673e-05, "loss": 0.9297, "step": 6350 }, { "epoch": 0.8444385802876369, "grad_norm": 11.265844345092773, "learning_rate": 1.4437425453902903e-05, "loss": 0.896, "step": 6400 }, { "epoch": 0.8510357566961341, "grad_norm": 14.131726264953613, "learning_rate": 1.4393249988956135e-05, "loss": 0.8709, "step": 6450 }, { "epoch": 0.8576329331046312, "grad_norm": 15.310577392578125, "learning_rate": 1.4349958033308302e-05, "loss": 0.9436, "step": 6500 }, { "epoch": 0.8576329331046312, "eval_accuracy": 0.9159183502197266, "eval_loss": 0.5638322234153748, "eval_runtime": 14.7169, "eval_samples_per_second": 166.475, "eval_steps_per_second": 5.232, "step": 6500 }, { "epoch": 0.8642301095131284, "grad_norm": 9.686817169189453, "learning_rate": 1.4305782568361532e-05, "loss": 0.8938, "step": 6550 }, { "epoch": 0.8708272859216255, "grad_norm": 13.079803466796875, "learning_rate": 1.4261607103414766e-05, "loss": 0.8065, "step": 6600 }, { "epoch": 0.8774244623301227, "grad_norm": 8.047029495239258, "learning_rate": 1.4217431638467996e-05, "loss": 0.8281, "step": 6650 }, { "epoch": 0.8840216387386198, "grad_norm": 9.289030075073242, "learning_rate": 1.4173256173521228e-05, "loss": 0.8449, "step": 6700 }, { "epoch": 0.890618815147117, "grad_norm": 14.22453498840332, "learning_rate": 1.4129080708574458e-05, "loss": 0.813, "step": 6750 }, { "epoch": 0.890618815147117, "eval_accuracy": 0.9167346954345703, "eval_loss": 0.5693557262420654, "eval_runtime": 14.6084, "eval_samples_per_second": 167.711, "eval_steps_per_second": 5.271, "step": 6750 }, { "epoch": 0.8972159915556142, "grad_norm": 13.204727172851562, "learning_rate": 1.408490524362769e-05, "loss": 0.9052, "step": 6800 }, { "epoch": 0.9038131679641114, "grad_norm": 13.640901565551758, "learning_rate": 1.404072977868092e-05, "loss": 0.9501, "step": 6850 }, { "epoch": 0.9104103443726085, "grad_norm": 10.711437225341797, "learning_rate": 1.3996554313734155e-05, "loss": 0.9612, "step": 6900 }, { "epoch": 0.9170075207811057, "grad_norm": 13.457585334777832, "learning_rate": 1.3952378848787385e-05, "loss": 0.8649, "step": 6950 }, { "epoch": 0.9236046971896028, "grad_norm": 12.793722152709961, "learning_rate": 1.3908203383840615e-05, "loss": 0.7366, "step": 7000 }, { "epoch": 0.9236046971896028, "eval_accuracy": 0.9187754988670349, "eval_loss": 0.5691282153129578, "eval_runtime": 14.243, "eval_samples_per_second": 172.014, "eval_steps_per_second": 5.406, "step": 7000 }, { "epoch": 0.9302018735981, "grad_norm": 15.058178901672363, "learning_rate": 1.3864027918893847e-05, "loss": 0.9621, "step": 7050 }, { "epoch": 0.9367990500065971, "grad_norm": 14.427763938903809, "learning_rate": 1.3819852453947078e-05, "loss": 0.9154, "step": 7100 }, { "epoch": 0.9433962264150944, "grad_norm": 13.261103630065918, "learning_rate": 1.3775676989000311e-05, "loss": 0.8617, "step": 7150 }, { "epoch": 0.9499934028235915, "grad_norm": 12.778352737426758, "learning_rate": 1.3731501524053542e-05, "loss": 0.8629, "step": 7200 }, { "epoch": 0.9565905792320887, "grad_norm": 14.332444190979004, "learning_rate": 1.3687326059106774e-05, "loss": 0.899, "step": 7250 }, { "epoch": 0.9565905792320887, "eval_accuracy": 0.9159183502197266, "eval_loss": 0.5559064745903015, "eval_runtime": 14.2133, "eval_samples_per_second": 172.374, "eval_steps_per_second": 5.417, "step": 7250 }, { "epoch": 0.9631877556405858, "grad_norm": 8.828652381896973, "learning_rate": 1.3643150594160004e-05, "loss": 0.7766, "step": 7300 }, { "epoch": 0.969784932049083, "grad_norm": 11.421220779418945, "learning_rate": 1.3598975129213236e-05, "loss": 0.8968, "step": 7350 }, { "epoch": 0.9763821084575801, "grad_norm": 13.00658893585205, "learning_rate": 1.3554799664266466e-05, "loss": 0.8462, "step": 7400 }, { "epoch": 0.9829792848660773, "grad_norm": 6.505890369415283, "learning_rate": 1.35106241993197e-05, "loss": 0.8478, "step": 7450 }, { "epoch": 0.9895764612745744, "grad_norm": 9.694055557250977, "learning_rate": 1.346644873437293e-05, "loss": 0.8184, "step": 7500 }, { "epoch": 0.9895764612745744, "eval_accuracy": 0.9163265228271484, "eval_loss": 0.5564213395118713, "eval_runtime": 14.1149, "eval_samples_per_second": 173.576, "eval_steps_per_second": 5.455, "step": 7500 }, { "epoch": 0.9961736376830717, "grad_norm": 8.785748481750488, "learning_rate": 1.3422273269426162e-05, "loss": 0.8445, "step": 7550 }, { "epoch": 1.0027708140915688, "grad_norm": 12.255693435668945, "learning_rate": 1.3378097804479393e-05, "loss": 0.7305, "step": 7600 }, { "epoch": 1.0093679905000659, "grad_norm": 12.03491497039795, "learning_rate": 1.3333922339532626e-05, "loss": 0.695, "step": 7650 }, { "epoch": 1.0159651669085632, "grad_norm": 15.055414199829102, "learning_rate": 1.3289746874585857e-05, "loss": 0.779, "step": 7700 }, { "epoch": 1.0225623433170603, "grad_norm": 3.5831682682037354, "learning_rate": 1.3245571409639089e-05, "loss": 0.5876, "step": 7750 }, { "epoch": 1.0225623433170603, "eval_accuracy": 0.918367326259613, "eval_loss": 0.5775763392448425, "eval_runtime": 14.119, "eval_samples_per_second": 173.525, "eval_steps_per_second": 5.454, "step": 7750 }, { "epoch": 1.0291595197255574, "grad_norm": 14.637757301330566, "learning_rate": 1.3201395944692319e-05, "loss": 0.6372, "step": 7800 }, { "epoch": 1.0357566961340545, "grad_norm": 9.048910140991211, "learning_rate": 1.315722047974555e-05, "loss": 0.7066, "step": 7850 }, { "epoch": 1.0423538725425519, "grad_norm": 13.023659706115723, "learning_rate": 1.3113045014798781e-05, "loss": 0.6561, "step": 7900 }, { "epoch": 1.048951048951049, "grad_norm": 13.10300350189209, "learning_rate": 1.3068869549852012e-05, "loss": 0.6854, "step": 7950 }, { "epoch": 1.055548225359546, "grad_norm": 13.364474296569824, "learning_rate": 1.3024694084905245e-05, "loss": 0.7083, "step": 8000 }, { "epoch": 1.055548225359546, "eval_accuracy": 0.9212244749069214, "eval_loss": 0.5645425915718079, "eval_runtime": 14.1023, "eval_samples_per_second": 173.731, "eval_steps_per_second": 5.46, "step": 8000 }, { "epoch": 1.0621454017680432, "grad_norm": 10.857477188110352, "learning_rate": 1.2980518619958476e-05, "loss": 0.6618, "step": 8050 }, { "epoch": 1.0687425781765405, "grad_norm": 13.178641319274902, "learning_rate": 1.2936343155011708e-05, "loss": 0.6602, "step": 8100 }, { "epoch": 1.0753397545850376, "grad_norm": 8.929798126220703, "learning_rate": 1.2892167690064938e-05, "loss": 0.7141, "step": 8150 }, { "epoch": 1.0819369309935347, "grad_norm": 14.156282424926758, "learning_rate": 1.2847992225118172e-05, "loss": 0.7599, "step": 8200 }, { "epoch": 1.0885341074020318, "grad_norm": 11.46021842956543, "learning_rate": 1.2803816760171402e-05, "loss": 0.6307, "step": 8250 }, { "epoch": 1.0885341074020318, "eval_accuracy": 0.9159183502197266, "eval_loss": 0.5608085989952087, "eval_runtime": 14.088, "eval_samples_per_second": 173.907, "eval_steps_per_second": 5.466, "step": 8250 }, { "epoch": 1.0951312838105292, "grad_norm": 13.287457466125488, "learning_rate": 1.2759641295224634e-05, "loss": 0.6611, "step": 8300 }, { "epoch": 1.1017284602190263, "grad_norm": 7.9682793617248535, "learning_rate": 1.2715465830277864e-05, "loss": 0.6308, "step": 8350 }, { "epoch": 1.1083256366275234, "grad_norm": 8.86072826385498, "learning_rate": 1.2671290365331097e-05, "loss": 0.7035, "step": 8400 }, { "epoch": 1.1149228130360207, "grad_norm": 16.224716186523438, "learning_rate": 1.2627114900384327e-05, "loss": 0.683, "step": 8450 }, { "epoch": 1.1215199894445178, "grad_norm": 16.066835403442383, "learning_rate": 1.258293943543756e-05, "loss": 0.7077, "step": 8500 }, { "epoch": 1.1215199894445178, "eval_accuracy": 0.918367326259613, "eval_loss": 0.5556493401527405, "eval_runtime": 14.2677, "eval_samples_per_second": 171.717, "eval_steps_per_second": 5.397, "step": 8500 }, { "epoch": 1.128117165853015, "grad_norm": 16.001686096191406, "learning_rate": 1.2538763970490791e-05, "loss": 0.7153, "step": 8550 }, { "epoch": 1.134714342261512, "grad_norm": 10.751116752624512, "learning_rate": 1.2494588505544021e-05, "loss": 0.6186, "step": 8600 }, { "epoch": 1.1413115186700091, "grad_norm": 13.352745056152344, "learning_rate": 1.2450413040597253e-05, "loss": 0.6289, "step": 8650 }, { "epoch": 1.1479086950785065, "grad_norm": 13.567846298217773, "learning_rate": 1.2406237575650484e-05, "loss": 0.5718, "step": 8700 }, { "epoch": 1.1545058714870036, "grad_norm": 7.751793384552002, "learning_rate": 1.2362062110703717e-05, "loss": 0.5749, "step": 8750 }, { "epoch": 1.1545058714870036, "eval_accuracy": 0.9167346954345703, "eval_loss": 0.5695374011993408, "eval_runtime": 14.4147, "eval_samples_per_second": 169.965, "eval_steps_per_second": 5.342, "step": 8750 }, { "epoch": 1.1611030478955007, "grad_norm": 17.14850425720215, "learning_rate": 1.2317886645756948e-05, "loss": 0.6788, "step": 8800 }, { "epoch": 1.167700224303998, "grad_norm": 15.17955493927002, "learning_rate": 1.227371118081018e-05, "loss": 0.7731, "step": 8850 }, { "epoch": 1.174297400712495, "grad_norm": 15.97493839263916, "learning_rate": 1.222953571586341e-05, "loss": 0.6954, "step": 8900 }, { "epoch": 1.1808945771209922, "grad_norm": 13.843533515930176, "learning_rate": 1.2185360250916642e-05, "loss": 0.7404, "step": 8950 }, { "epoch": 1.1874917535294893, "grad_norm": 2.941951274871826, "learning_rate": 1.2141184785969872e-05, "loss": 0.6871, "step": 9000 }, { "epoch": 1.1874917535294893, "eval_accuracy": 0.9208163022994995, "eval_loss": 0.5665779709815979, "eval_runtime": 14.3831, "eval_samples_per_second": 170.339, "eval_steps_per_second": 5.354, "step": 9000 }, { "epoch": 1.1940889299379864, "grad_norm": 12.621596336364746, "learning_rate": 1.2097009321023106e-05, "loss": 0.6415, "step": 9050 }, { "epoch": 1.2006861063464838, "grad_norm": 15.431377410888672, "learning_rate": 1.2052833856076336e-05, "loss": 0.6517, "step": 9100 }, { "epoch": 1.2072832827549809, "grad_norm": 18.660377502441406, "learning_rate": 1.2008658391129568e-05, "loss": 0.7354, "step": 9150 }, { "epoch": 1.213880459163478, "grad_norm": 16.014867782592773, "learning_rate": 1.1964482926182799e-05, "loss": 0.7325, "step": 9200 }, { "epoch": 1.2204776355719753, "grad_norm": 15.354519844055176, "learning_rate": 1.192030746123603e-05, "loss": 0.6272, "step": 9250 }, { "epoch": 1.2204776355719753, "eval_accuracy": 0.9146938920021057, "eval_loss": 0.5714155435562134, "eval_runtime": 14.5346, "eval_samples_per_second": 168.563, "eval_steps_per_second": 5.298, "step": 9250 }, { "epoch": 1.2270748119804724, "grad_norm": 12.623982429504395, "learning_rate": 1.1876131996289261e-05, "loss": 0.7292, "step": 9300 }, { "epoch": 1.2336719883889695, "grad_norm": 9.906524658203125, "learning_rate": 1.1831956531342495e-05, "loss": 0.6325, "step": 9350 }, { "epoch": 1.2402691647974666, "grad_norm": 13.0123872756958, "learning_rate": 1.1787781066395725e-05, "loss": 0.6344, "step": 9400 }, { "epoch": 1.2468663412059637, "grad_norm": 11.591238975524902, "learning_rate": 1.1743605601448955e-05, "loss": 0.7218, "step": 9450 }, { "epoch": 1.253463517614461, "grad_norm": 6.004245758056641, "learning_rate": 1.1699430136502187e-05, "loss": 0.6815, "step": 9500 }, { "epoch": 1.253463517614461, "eval_accuracy": 0.9175510406494141, "eval_loss": 0.5650636553764343, "eval_runtime": 14.6735, "eval_samples_per_second": 166.967, "eval_steps_per_second": 5.248, "step": 9500 }, { "epoch": 1.2600606940229582, "grad_norm": 15.778864860534668, "learning_rate": 1.1655254671555418e-05, "loss": 0.7186, "step": 9550 }, { "epoch": 1.2666578704314553, "grad_norm": 9.6397123336792, "learning_rate": 1.1611079206608651e-05, "loss": 0.6145, "step": 9600 }, { "epoch": 1.2732550468399526, "grad_norm": 10.774910926818848, "learning_rate": 1.1566903741661882e-05, "loss": 0.7095, "step": 9650 }, { "epoch": 1.2798522232484497, "grad_norm": 7.923967361450195, "learning_rate": 1.1522728276715114e-05, "loss": 0.674, "step": 9700 }, { "epoch": 1.2864493996569468, "grad_norm": 15.660514831542969, "learning_rate": 1.1478552811768344e-05, "loss": 0.7405, "step": 9750 }, { "epoch": 1.2864493996569468, "eval_accuracy": 0.9200000166893005, "eval_loss": 0.5666268467903137, "eval_runtime": 14.8203, "eval_samples_per_second": 165.314, "eval_steps_per_second": 5.196, "step": 9750 }, { "epoch": 1.293046576065444, "grad_norm": 7.9538397789001465, "learning_rate": 1.1434377346821576e-05, "loss": 0.7186, "step": 9800 }, { "epoch": 1.299643752473941, "grad_norm": 10.569086074829102, "learning_rate": 1.1390201881874806e-05, "loss": 0.6352, "step": 9850 }, { "epoch": 1.3062409288824384, "grad_norm": 13.37822151184082, "learning_rate": 1.134602641692804e-05, "loss": 0.7077, "step": 9900 }, { "epoch": 1.3128381052909355, "grad_norm": 14.899065017700195, "learning_rate": 1.130185095198127e-05, "loss": 0.6873, "step": 9950 }, { "epoch": 1.3194352816994326, "grad_norm": 9.051203727722168, "learning_rate": 1.1257675487034503e-05, "loss": 0.5939, "step": 10000 }, { "epoch": 1.3194352816994326, "eval_accuracy": 0.9204081892967224, "eval_loss": 0.5752307176589966, "eval_runtime": 14.7189, "eval_samples_per_second": 166.453, "eval_steps_per_second": 5.231, "step": 10000 }, { "epoch": 1.32603245810793, "grad_norm": 9.774917602539062, "learning_rate": 1.1213500022087733e-05, "loss": 0.7171, "step": 10050 }, { "epoch": 1.332629634516427, "grad_norm": 12.088335037231445, "learning_rate": 1.1169324557140967e-05, "loss": 0.6592, "step": 10100 }, { "epoch": 1.3392268109249241, "grad_norm": 6.4445881843566895, "learning_rate": 1.1125149092194197e-05, "loss": 0.6631, "step": 10150 }, { "epoch": 1.3458239873334212, "grad_norm": 17.67377471923828, "learning_rate": 1.1080973627247427e-05, "loss": 0.7658, "step": 10200 }, { "epoch": 1.3524211637419183, "grad_norm": 9.594240188598633, "learning_rate": 1.103679816230066e-05, "loss": 0.6213, "step": 10250 }, { "epoch": 1.3524211637419183, "eval_accuracy": 0.9179591536521912, "eval_loss": 0.5678022503852844, "eval_runtime": 14.3553, "eval_samples_per_second": 170.669, "eval_steps_per_second": 5.364, "step": 10250 }, { "epoch": 1.3590183401504157, "grad_norm": 13.705334663391113, "learning_rate": 1.099262269735389e-05, "loss": 0.6486, "step": 10300 }, { "epoch": 1.3656155165589128, "grad_norm": 9.945523262023926, "learning_rate": 1.0948447232407122e-05, "loss": 0.662, "step": 10350 }, { "epoch": 1.3722126929674099, "grad_norm": 10.699588775634766, "learning_rate": 1.0904271767460352e-05, "loss": 0.6924, "step": 10400 }, { "epoch": 1.3788098693759072, "grad_norm": 11.88040828704834, "learning_rate": 1.0860096302513586e-05, "loss": 0.7106, "step": 10450 }, { "epoch": 1.3854070457844043, "grad_norm": 9.69964599609375, "learning_rate": 1.0815920837566816e-05, "loss": 0.7239, "step": 10500 }, { "epoch": 1.3854070457844043, "eval_accuracy": 0.918367326259613, "eval_loss": 0.5687153935432434, "eval_runtime": 15.0319, "eval_samples_per_second": 162.987, "eval_steps_per_second": 5.122, "step": 10500 }, { "epoch": 1.3920042221929014, "grad_norm": 7.956460475921631, "learning_rate": 1.0771745372620048e-05, "loss": 0.735, "step": 10550 }, { "epoch": 1.3986013986013985, "grad_norm": 15.08421802520752, "learning_rate": 1.0728453416972216e-05, "loss": 0.6784, "step": 10600 }, { "epoch": 1.4051985750098956, "grad_norm": 7.856141090393066, "learning_rate": 1.0684277952025447e-05, "loss": 0.6886, "step": 10650 }, { "epoch": 1.411795751418393, "grad_norm": 20.228710174560547, "learning_rate": 1.0640102487078679e-05, "loss": 0.649, "step": 10700 }, { "epoch": 1.41839292782689, "grad_norm": 8.827073097229004, "learning_rate": 1.0595927022131909e-05, "loss": 0.6133, "step": 10750 }, { "epoch": 1.41839292782689, "eval_accuracy": 0.9200000166893005, "eval_loss": 0.5682947039604187, "eval_runtime": 14.6806, "eval_samples_per_second": 166.887, "eval_steps_per_second": 5.245, "step": 10750 }, { "epoch": 1.4249901042353872, "grad_norm": 12.990625381469727, "learning_rate": 1.055175155718514e-05, "loss": 0.6635, "step": 10800 }, { "epoch": 1.4315872806438845, "grad_norm": 13.446993827819824, "learning_rate": 1.0507576092238371e-05, "loss": 0.6803, "step": 10850 }, { "epoch": 1.4381844570523816, "grad_norm": 16.174983978271484, "learning_rate": 1.0463400627291602e-05, "loss": 0.6497, "step": 10900 }, { "epoch": 1.4447816334608787, "grad_norm": 14.54861831665039, "learning_rate": 1.0419225162344835e-05, "loss": 0.6812, "step": 10950 }, { "epoch": 1.4513788098693758, "grad_norm": 15.023179054260254, "learning_rate": 1.0375049697398066e-05, "loss": 0.7493, "step": 11000 }, { "epoch": 1.4513788098693758, "eval_accuracy": 0.9220408201217651, "eval_loss": 0.5586878657341003, "eval_runtime": 14.6736, "eval_samples_per_second": 166.967, "eval_steps_per_second": 5.248, "step": 11000 }, { "epoch": 1.457975986277873, "grad_norm": 13.154565811157227, "learning_rate": 1.0330874232451298e-05, "loss": 0.6694, "step": 11050 }, { "epoch": 1.4645731626863703, "grad_norm": 10.797453880310059, "learning_rate": 1.0286698767504528e-05, "loss": 0.6782, "step": 11100 }, { "epoch": 1.4711703390948674, "grad_norm": 9.909940719604492, "learning_rate": 1.0242523302557762e-05, "loss": 0.6839, "step": 11150 }, { "epoch": 1.4777675155033645, "grad_norm": 10.230202674865723, "learning_rate": 1.0198347837610992e-05, "loss": 0.671, "step": 11200 }, { "epoch": 1.4843646919118618, "grad_norm": 10.81137752532959, "learning_rate": 1.0154172372664224e-05, "loss": 0.7648, "step": 11250 }, { "epoch": 1.4843646919118618, "eval_accuracy": 0.9208163022994995, "eval_loss": 0.5465655326843262, "eval_runtime": 14.3093, "eval_samples_per_second": 171.217, "eval_steps_per_second": 5.381, "step": 11250 }, { "epoch": 1.490961868320359, "grad_norm": 16.720306396484375, "learning_rate": 1.0109996907717454e-05, "loss": 0.7448, "step": 11300 }, { "epoch": 1.497559044728856, "grad_norm": 10.62578296661377, "learning_rate": 1.0065821442770686e-05, "loss": 0.6811, "step": 11350 }, { "epoch": 1.5041562211373534, "grad_norm": 12.68857479095459, "learning_rate": 1.0021645977823917e-05, "loss": 0.6984, "step": 11400 }, { "epoch": 1.5107533975458503, "grad_norm": 9.58633804321289, "learning_rate": 9.977470512877149e-06, "loss": 0.6676, "step": 11450 }, { "epoch": 1.5173505739543476, "grad_norm": 16.852190017700195, "learning_rate": 9.93329504793038e-06, "loss": 0.7054, "step": 11500 }, { "epoch": 1.5173505739543476, "eval_accuracy": 0.9204081892967224, "eval_loss": 0.5569261908531189, "eval_runtime": 14.3932, "eval_samples_per_second": 170.219, "eval_steps_per_second": 5.35, "step": 11500 }, { "epoch": 1.5239477503628447, "grad_norm": 12.476948738098145, "learning_rate": 9.889119582983611e-06, "loss": 0.6109, "step": 11550 }, { "epoch": 1.5305449267713418, "grad_norm": 15.437007904052734, "learning_rate": 9.844944118036843e-06, "loss": 0.7581, "step": 11600 }, { "epoch": 1.5371421031798391, "grad_norm": 14.643590927124023, "learning_rate": 9.800768653090075e-06, "loss": 0.7035, "step": 11650 }, { "epoch": 1.5437392795883362, "grad_norm": 8.441386222839355, "learning_rate": 9.756593188143305e-06, "loss": 0.6943, "step": 11700 }, { "epoch": 1.5503364559968333, "grad_norm": 17.568815231323242, "learning_rate": 9.713301232495472e-06, "loss": 0.6225, "step": 11750 }, { "epoch": 1.5503364559968333, "eval_accuracy": 0.922448992729187, "eval_loss": 0.5570839643478394, "eval_runtime": 14.4582, "eval_samples_per_second": 169.454, "eval_steps_per_second": 5.326, "step": 11750 }, { "epoch": 1.5569336324053307, "grad_norm": 9.131673812866211, "learning_rate": 9.669125767548704e-06, "loss": 0.661, "step": 11800 }, { "epoch": 1.5635308088138276, "grad_norm": 12.502500534057617, "learning_rate": 9.624950302601936e-06, "loss": 0.635, "step": 11850 }, { "epoch": 1.5701279852223249, "grad_norm": 16.2374210357666, "learning_rate": 9.580774837655166e-06, "loss": 0.613, "step": 11900 }, { "epoch": 1.576725161630822, "grad_norm": 14.896709442138672, "learning_rate": 9.536599372708398e-06, "loss": 0.6502, "step": 11950 }, { "epoch": 1.583322338039319, "grad_norm": 10.305893898010254, "learning_rate": 9.49242390776163e-06, "loss": 0.6935, "step": 12000 }, { "epoch": 1.583322338039319, "eval_accuracy": 0.9200000166893005, "eval_loss": 0.5578611493110657, "eval_runtime": 15.0346, "eval_samples_per_second": 162.958, "eval_steps_per_second": 5.122, "step": 12000 }, { "epoch": 1.5899195144478164, "grad_norm": 10.883207321166992, "learning_rate": 9.44824844281486e-06, "loss": 0.6147, "step": 12050 }, { "epoch": 1.5965166908563135, "grad_norm": 5.787095069885254, "learning_rate": 9.404072977868093e-06, "loss": 0.6575, "step": 12100 }, { "epoch": 1.6031138672648106, "grad_norm": 18.00186538696289, "learning_rate": 9.359897512921325e-06, "loss": 0.6837, "step": 12150 }, { "epoch": 1.609711043673308, "grad_norm": 11.488051414489746, "learning_rate": 9.315722047974555e-06, "loss": 0.7437, "step": 12200 }, { "epoch": 1.6163082200818049, "grad_norm": 8.060873031616211, "learning_rate": 9.271546583027787e-06, "loss": 0.6808, "step": 12250 }, { "epoch": 1.6163082200818049, "eval_accuracy": 0.9204081892967224, "eval_loss": 0.5507224202156067, "eval_runtime": 14.9999, "eval_samples_per_second": 163.335, "eval_steps_per_second": 5.133, "step": 12250 }, { "epoch": 1.6229053964903022, "grad_norm": 12.3642578125, "learning_rate": 9.227371118081019e-06, "loss": 0.6698, "step": 12300 }, { "epoch": 1.6295025728987993, "grad_norm": 15.267610549926758, "learning_rate": 9.183195653134251e-06, "loss": 0.6803, "step": 12350 }, { "epoch": 1.6360997493072964, "grad_norm": 14.562056541442871, "learning_rate": 9.139020188187481e-06, "loss": 0.676, "step": 12400 }, { "epoch": 1.6426969257157937, "grad_norm": 10.903446197509766, "learning_rate": 9.094844723240713e-06, "loss": 0.6418, "step": 12450 }, { "epoch": 1.6492941021242908, "grad_norm": 12.447013854980469, "learning_rate": 9.050669258293946e-06, "loss": 0.6042, "step": 12500 }, { "epoch": 1.6492941021242908, "eval_accuracy": 0.9187754988670349, "eval_loss": 0.556304395198822, "eval_runtime": 14.8826, "eval_samples_per_second": 164.622, "eval_steps_per_second": 5.174, "step": 12500 }, { "epoch": 1.655891278532788, "grad_norm": 14.32016658782959, "learning_rate": 9.006493793347176e-06, "loss": 0.7139, "step": 12550 }, { "epoch": 1.6624884549412853, "grad_norm": 10.937993049621582, "learning_rate": 8.962318328400406e-06, "loss": 0.6995, "step": 12600 }, { "epoch": 1.6690856313497822, "grad_norm": 9.443896293640137, "learning_rate": 8.918142863453638e-06, "loss": 0.6097, "step": 12650 }, { "epoch": 1.6756828077582795, "grad_norm": 13.650922775268555, "learning_rate": 8.87396739850687e-06, "loss": 0.6407, "step": 12700 }, { "epoch": 1.6822799841667766, "grad_norm": 13.268482208251953, "learning_rate": 8.8297919335601e-06, "loss": 0.5994, "step": 12750 }, { "epoch": 1.6822799841667766, "eval_accuracy": 0.9248979687690735, "eval_loss": 0.5621338486671448, "eval_runtime": 14.3912, "eval_samples_per_second": 170.243, "eval_steps_per_second": 5.35, "step": 12750 }, { "epoch": 1.6888771605752737, "grad_norm": 10.582622528076172, "learning_rate": 8.785616468613333e-06, "loss": 0.6642, "step": 12800 }, { "epoch": 1.695474336983771, "grad_norm": 11.034931182861328, "learning_rate": 8.741441003666565e-06, "loss": 0.6198, "step": 12850 }, { "epoch": 1.7020715133922681, "grad_norm": 13.703685760498047, "learning_rate": 8.697265538719795e-06, "loss": 0.6648, "step": 12900 }, { "epoch": 1.7086686898007653, "grad_norm": 10.968565940856934, "learning_rate": 8.653090073773027e-06, "loss": 0.5644, "step": 12950 }, { "epoch": 1.7152658662092626, "grad_norm": 12.422329902648926, "learning_rate": 8.608914608826259e-06, "loss": 0.6531, "step": 13000 }, { "epoch": 1.7152658662092626, "eval_accuracy": 0.9240816235542297, "eval_loss": 0.5617344975471497, "eval_runtime": 15.0031, "eval_samples_per_second": 163.299, "eval_steps_per_second": 5.132, "step": 13000 }, { "epoch": 1.7218630426177595, "grad_norm": 9.511701583862305, "learning_rate": 8.564739143879491e-06, "loss": 0.6159, "step": 13050 }, { "epoch": 1.7284602190262568, "grad_norm": 6.499239921569824, "learning_rate": 8.520563678932721e-06, "loss": 0.7855, "step": 13100 }, { "epoch": 1.735057395434754, "grad_norm": 7.864821910858154, "learning_rate": 8.476388213985953e-06, "loss": 0.6307, "step": 13150 }, { "epoch": 1.741654571843251, "grad_norm": 11.460110664367676, "learning_rate": 8.432212749039185e-06, "loss": 0.61, "step": 13200 }, { "epoch": 1.7482517482517483, "grad_norm": 12.433394432067871, "learning_rate": 8.388037284092416e-06, "loss": 0.6672, "step": 13250 }, { "epoch": 1.7482517482517483, "eval_accuracy": 0.9236734509468079, "eval_loss": 0.5589076280593872, "eval_runtime": 14.2338, "eval_samples_per_second": 172.125, "eval_steps_per_second": 5.41, "step": 13250 }, { "epoch": 1.7548489246602454, "grad_norm": 9.624537467956543, "learning_rate": 8.343861819145648e-06, "loss": 0.6002, "step": 13300 }, { "epoch": 1.7614461010687426, "grad_norm": 14.12790584564209, "learning_rate": 8.299686354198878e-06, "loss": 0.6638, "step": 13350 }, { "epoch": 1.7680432774772399, "grad_norm": 15.561441421508789, "learning_rate": 8.25551088925211e-06, "loss": 0.6112, "step": 13400 }, { "epoch": 1.7746404538857368, "grad_norm": 8.115078926086426, "learning_rate": 8.21133542430534e-06, "loss": 0.6236, "step": 13450 }, { "epoch": 1.781237630294234, "grad_norm": 5.141168117523193, "learning_rate": 8.167159959358572e-06, "loss": 0.6245, "step": 13500 }, { "epoch": 1.781237630294234, "eval_accuracy": 0.9220408201217651, "eval_loss": 0.557984471321106, "eval_runtime": 14.2809, "eval_samples_per_second": 171.557, "eval_steps_per_second": 5.392, "step": 13500 }, { "epoch": 1.7878348067027312, "grad_norm": 13.422981262207031, "learning_rate": 8.122984494411804e-06, "loss": 0.7146, "step": 13550 }, { "epoch": 1.7944319831112283, "grad_norm": 9.977944374084473, "learning_rate": 8.078809029465036e-06, "loss": 0.5969, "step": 13600 }, { "epoch": 1.8010291595197256, "grad_norm": 12.0841064453125, "learning_rate": 8.034633564518267e-06, "loss": 0.7246, "step": 13650 }, { "epoch": 1.8076263359282227, "grad_norm": 7.176680088043213, "learning_rate": 7.990458099571499e-06, "loss": 0.65, "step": 13700 }, { "epoch": 1.8142235123367199, "grad_norm": 16.529300689697266, "learning_rate": 7.94628263462473e-06, "loss": 0.7136, "step": 13750 }, { "epoch": 1.8142235123367199, "eval_accuracy": 0.9204081892967224, "eval_loss": 0.5532920360565186, "eval_runtime": 14.1588, "eval_samples_per_second": 173.038, "eval_steps_per_second": 5.438, "step": 13750 }, { "epoch": 1.8208206887452172, "grad_norm": 14.22460651397705, "learning_rate": 7.902107169677961e-06, "loss": 0.7062, "step": 13800 }, { "epoch": 1.827417865153714, "grad_norm": 12.760638236999512, "learning_rate": 7.857931704731193e-06, "loss": 0.6987, "step": 13850 }, { "epoch": 1.8340150415622114, "grad_norm": 18.206119537353516, "learning_rate": 7.813756239784425e-06, "loss": 0.6642, "step": 13900 }, { "epoch": 1.8406122179707085, "grad_norm": 10.713970184326172, "learning_rate": 7.769580774837655e-06, "loss": 0.6761, "step": 13950 }, { "epoch": 1.8472093943792056, "grad_norm": 15.882705688476562, "learning_rate": 7.725405309890888e-06, "loss": 0.6766, "step": 14000 }, { "epoch": 1.8472093943792056, "eval_accuracy": 0.9212244749069214, "eval_loss": 0.5655022263526917, "eval_runtime": 14.1648, "eval_samples_per_second": 172.964, "eval_steps_per_second": 5.436, "step": 14000 }, { "epoch": 1.853806570787703, "grad_norm": 14.32147216796875, "learning_rate": 7.68122984494412e-06, "loss": 0.5758, "step": 14050 }, { "epoch": 1.8604037471962, "grad_norm": 10.756980895996094, "learning_rate": 7.63705437999735e-06, "loss": 0.6594, "step": 14100 }, { "epoch": 1.8670009236046972, "grad_norm": 14.631691932678223, "learning_rate": 7.592878915050581e-06, "loss": 0.7866, "step": 14150 }, { "epoch": 1.8735981000131945, "grad_norm": 10.117656707763672, "learning_rate": 7.548703450103812e-06, "loss": 0.5798, "step": 14200 }, { "epoch": 1.8801952764216914, "grad_norm": 18.793254852294922, "learning_rate": 7.504527985157044e-06, "loss": 0.6472, "step": 14250 }, { "epoch": 1.8801952764216914, "eval_accuracy": 0.9212244749069214, "eval_loss": 0.5508715510368347, "eval_runtime": 14.2705, "eval_samples_per_second": 171.683, "eval_steps_per_second": 5.396, "step": 14250 }, { "epoch": 1.8867924528301887, "grad_norm": 6.77608585357666, "learning_rate": 7.460352520210275e-06, "loss": 0.7387, "step": 14300 }, { "epoch": 1.8933896292386858, "grad_norm": 9.065592765808105, "learning_rate": 7.416177055263507e-06, "loss": 0.6677, "step": 14350 }, { "epoch": 1.899986805647183, "grad_norm": 16.486297607421875, "learning_rate": 7.372001590316739e-06, "loss": 0.6697, "step": 14400 }, { "epoch": 1.9065839820556802, "grad_norm": 10.985074996948242, "learning_rate": 7.32782612536997e-06, "loss": 0.6711, "step": 14450 }, { "epoch": 1.9131811584641774, "grad_norm": 10.440354347229004, "learning_rate": 7.283650660423202e-06, "loss": 0.6988, "step": 14500 }, { "epoch": 1.9131811584641774, "eval_accuracy": 0.9228571653366089, "eval_loss": 0.5527560114860535, "eval_runtime": 14.6748, "eval_samples_per_second": 166.953, "eval_steps_per_second": 5.247, "step": 14500 }, { "epoch": 1.9197783348726745, "grad_norm": 10.955256462097168, "learning_rate": 7.239475195476433e-06, "loss": 0.6301, "step": 14550 }, { "epoch": 1.9263755112811718, "grad_norm": 12.683993339538574, "learning_rate": 7.195299730529665e-06, "loss": 0.6259, "step": 14600 }, { "epoch": 1.9329726876896687, "grad_norm": 12.907076835632324, "learning_rate": 7.151124265582896e-06, "loss": 0.6223, "step": 14650 }, { "epoch": 1.939569864098166, "grad_norm": 16.311803817749023, "learning_rate": 7.106948800636127e-06, "loss": 0.5702, "step": 14700 }, { "epoch": 1.9461670405066631, "grad_norm": 13.10996150970459, "learning_rate": 7.062773335689359e-06, "loss": 0.6324, "step": 14750 }, { "epoch": 1.9461670405066631, "eval_accuracy": 0.9253061413764954, "eval_loss": 0.5507711172103882, "eval_runtime": 14.1763, "eval_samples_per_second": 172.824, "eval_steps_per_second": 5.432, "step": 14750 }, { "epoch": 1.9527642169151602, "grad_norm": 10.274900436401367, "learning_rate": 7.0185978707425905e-06, "loss": 0.6409, "step": 14800 }, { "epoch": 1.9593613933236576, "grad_norm": 14.111708641052246, "learning_rate": 6.974422405795822e-06, "loss": 0.6609, "step": 14850 }, { "epoch": 1.9659585697321547, "grad_norm": 7.0049662590026855, "learning_rate": 6.930246940849053e-06, "loss": 0.6581, "step": 14900 }, { "epoch": 1.9725557461406518, "grad_norm": 8.995936393737793, "learning_rate": 6.886071475902284e-06, "loss": 0.6313, "step": 14950 }, { "epoch": 1.979152922549149, "grad_norm": 19.018213272094727, "learning_rate": 6.841896010955515e-06, "loss": 0.6191, "step": 15000 }, { "epoch": 1.979152922549149, "eval_accuracy": 0.9216326475143433, "eval_loss": 0.5451802611351013, "eval_runtime": 14.4726, "eval_samples_per_second": 169.286, "eval_steps_per_second": 5.32, "step": 15000 }, { "epoch": 1.985750098957646, "grad_norm": 14.76790714263916, "learning_rate": 6.797720546008747e-06, "loss": 0.6665, "step": 15050 }, { "epoch": 1.9923472753661433, "grad_norm": 5.290237903594971, "learning_rate": 6.753545081061978e-06, "loss": 0.5907, "step": 15100 }, { "epoch": 1.9989444517746404, "grad_norm": 14.515754699707031, "learning_rate": 6.70936961611521e-06, "loss": 0.6586, "step": 15150 }, { "epoch": 2.0055416281831375, "grad_norm": 5.708993434906006, "learning_rate": 6.665194151168442e-06, "loss": 0.5673, "step": 15200 }, { "epoch": 2.012138804591635, "grad_norm": 15.482401847839355, "learning_rate": 6.621018686221673e-06, "loss": 0.5516, "step": 15250 }, { "epoch": 2.012138804591635, "eval_accuracy": 0.923265278339386, "eval_loss": 0.5589110255241394, "eval_runtime": 14.2886, "eval_samples_per_second": 171.465, "eval_steps_per_second": 5.389, "step": 15250 }, { "epoch": 2.0187359810001317, "grad_norm": 5.612030506134033, "learning_rate": 6.576843221274905e-06, "loss": 0.5012, "step": 15300 }, { "epoch": 2.025333157408629, "grad_norm": 17.197193145751953, "learning_rate": 6.532667756328136e-06, "loss": 0.5227, "step": 15350 }, { "epoch": 2.0319303338171264, "grad_norm": 19.402557373046875, "learning_rate": 6.488492291381367e-06, "loss": 0.4449, "step": 15400 }, { "epoch": 2.0385275102256233, "grad_norm": 18.876649856567383, "learning_rate": 6.444316826434599e-06, "loss": 0.4862, "step": 15450 }, { "epoch": 2.0451246866341206, "grad_norm": 9.995197296142578, "learning_rate": 6.40014136148783e-06, "loss": 0.5413, "step": 15500 }, { "epoch": 2.0451246866341206, "eval_accuracy": 0.923265278339386, "eval_loss": 0.5642380714416504, "eval_runtime": 14.3185, "eval_samples_per_second": 171.108, "eval_steps_per_second": 5.378, "step": 15500 }, { "epoch": 2.051721863042618, "grad_norm": 9.785661697387695, "learning_rate": 6.355965896541062e-06, "loss": 0.5462, "step": 15550 }, { "epoch": 2.058319039451115, "grad_norm": 14.724440574645996, "learning_rate": 6.3117904315942935e-06, "loss": 0.5318, "step": 15600 }, { "epoch": 2.064916215859612, "grad_norm": 11.998701095581055, "learning_rate": 6.267614966647525e-06, "loss": 0.5706, "step": 15650 }, { "epoch": 2.071513392268109, "grad_norm": 3.4020655155181885, "learning_rate": 6.223439501700755e-06, "loss": 0.5055, "step": 15700 }, { "epoch": 2.0781105686766064, "grad_norm": 16.408964157104492, "learning_rate": 6.179264036753987e-06, "loss": 0.6141, "step": 15750 }, { "epoch": 2.0781105686766064, "eval_accuracy": 0.923265278339386, "eval_loss": 0.5610572695732117, "eval_runtime": 14.2227, "eval_samples_per_second": 172.26, "eval_steps_per_second": 5.414, "step": 15750 }, { "epoch": 2.0847077450851037, "grad_norm": 17.39078140258789, "learning_rate": 6.135088571807218e-06, "loss": 0.5247, "step": 15800 }, { "epoch": 2.0913049214936006, "grad_norm": 17.45914649963379, "learning_rate": 6.09091310686045e-06, "loss": 0.4817, "step": 15850 }, { "epoch": 2.097902097902098, "grad_norm": 13.649807929992676, "learning_rate": 6.0467376419136814e-06, "loss": 0.4599, "step": 15900 }, { "epoch": 2.1044992743105952, "grad_norm": 8.314746856689453, "learning_rate": 6.002562176966913e-06, "loss": 0.5676, "step": 15950 }, { "epoch": 2.111096450719092, "grad_norm": 14.881856918334961, "learning_rate": 5.958386712020145e-06, "loss": 0.3992, "step": 16000 }, { "epoch": 2.111096450719092, "eval_accuracy": 0.9236734509468079, "eval_loss": 0.5720360279083252, "eval_runtime": 14.2571, "eval_samples_per_second": 171.844, "eval_steps_per_second": 5.401, "step": 16000 }, { "epoch": 2.1176936271275895, "grad_norm": 21.33131217956543, "learning_rate": 5.914211247073376e-06, "loss": 0.5337, "step": 16050 }, { "epoch": 2.1242908035360863, "grad_norm": 14.612150192260742, "learning_rate": 5.870035782126608e-06, "loss": 0.4641, "step": 16100 }, { "epoch": 2.1308879799445837, "grad_norm": 19.05860137939453, "learning_rate": 5.825860317179839e-06, "loss": 0.5636, "step": 16150 }, { "epoch": 2.137485156353081, "grad_norm": 13.695535659790039, "learning_rate": 5.78168485223307e-06, "loss": 0.4811, "step": 16200 }, { "epoch": 2.144082332761578, "grad_norm": 11.873661041259766, "learning_rate": 5.737509387286302e-06, "loss": 0.499, "step": 16250 }, { "epoch": 2.144082332761578, "eval_accuracy": 0.9216326475143433, "eval_loss": 0.5672578811645508, "eval_runtime": 14.4037, "eval_samples_per_second": 170.096, "eval_steps_per_second": 5.346, "step": 16250 }, { "epoch": 2.150679509170075, "grad_norm": 10.252338409423828, "learning_rate": 5.693333922339533e-06, "loss": 0.5822, "step": 16300 }, { "epoch": 2.1572766855785726, "grad_norm": 21.956472396850586, "learning_rate": 5.6491584573927645e-06, "loss": 0.5935, "step": 16350 }, { "epoch": 2.1638738619870694, "grad_norm": 10.932018280029297, "learning_rate": 5.6049829924459966e-06, "loss": 0.5028, "step": 16400 }, { "epoch": 2.1704710383955668, "grad_norm": 11.411332130432129, "learning_rate": 5.560807527499228e-06, "loss": 0.5118, "step": 16450 }, { "epoch": 2.1770682148040637, "grad_norm": 12.977612495422363, "learning_rate": 5.516632062552458e-06, "loss": 0.5623, "step": 16500 }, { "epoch": 2.1770682148040637, "eval_accuracy": 0.9261224269866943, "eval_loss": 0.5655830502510071, "eval_runtime": 14.5501, "eval_samples_per_second": 168.384, "eval_steps_per_second": 5.292, "step": 16500 }, { "epoch": 2.183665391212561, "grad_norm": 16.45384979248047, "learning_rate": 5.47245659760569e-06, "loss": 0.481, "step": 16550 }, { "epoch": 2.1902625676210583, "grad_norm": 10.353941917419434, "learning_rate": 5.428281132658921e-06, "loss": 0.5461, "step": 16600 }, { "epoch": 2.196859744029555, "grad_norm": 13.859786987304688, "learning_rate": 5.3841056677121524e-06, "loss": 0.5802, "step": 16650 }, { "epoch": 2.2034569204380525, "grad_norm": 14.852931022644043, "learning_rate": 5.3399302027653845e-06, "loss": 0.5269, "step": 16700 }, { "epoch": 2.21005409684655, "grad_norm": 11.935972213745117, "learning_rate": 5.295754737818616e-06, "loss": 0.5022, "step": 16750 }, { "epoch": 2.21005409684655, "eval_accuracy": 0.9220408201217651, "eval_loss": 0.5671045184135437, "eval_runtime": 14.353, "eval_samples_per_second": 170.696, "eval_steps_per_second": 5.365, "step": 16750 }, { "epoch": 2.2166512732550467, "grad_norm": 6.890115261077881, "learning_rate": 5.251579272871848e-06, "loss": 0.5203, "step": 16800 }, { "epoch": 2.223248449663544, "grad_norm": 10.788956642150879, "learning_rate": 5.207403807925079e-06, "loss": 0.5461, "step": 16850 }, { "epoch": 2.2298456260720414, "grad_norm": 10.99864387512207, "learning_rate": 5.16322834297831e-06, "loss": 0.5711, "step": 16900 }, { "epoch": 2.2364428024805383, "grad_norm": 14.6043062210083, "learning_rate": 5.119052878031542e-06, "loss": 0.5615, "step": 16950 }, { "epoch": 2.2430399788890356, "grad_norm": 11.577956199645996, "learning_rate": 5.074877413084773e-06, "loss": 0.5748, "step": 17000 }, { "epoch": 2.2430399788890356, "eval_accuracy": 0.9257143139839172, "eval_loss": 0.560461699962616, "eval_runtime": 14.6816, "eval_samples_per_second": 166.875, "eval_steps_per_second": 5.245, "step": 17000 }, { "epoch": 2.2496371552975325, "grad_norm": 10.84367561340332, "learning_rate": 5.030701948138005e-06, "loss": 0.5272, "step": 17050 }, { "epoch": 2.25623433170603, "grad_norm": 9.228910446166992, "learning_rate": 4.9865264831912355e-06, "loss": 0.4401, "step": 17100 }, { "epoch": 2.262831508114527, "grad_norm": 5.861785411834717, "learning_rate": 4.9423510182444676e-06, "loss": 0.5158, "step": 17150 }, { "epoch": 2.269428684523024, "grad_norm": 19.159400939941406, "learning_rate": 4.898175553297699e-06, "loss": 0.5163, "step": 17200 }, { "epoch": 2.2760258609315214, "grad_norm": 9.03962516784668, "learning_rate": 4.85400008835093e-06, "loss": 0.5195, "step": 17250 }, { "epoch": 2.2760258609315214, "eval_accuracy": 0.9236734509468079, "eval_loss": 0.5647316575050354, "eval_runtime": 14.3918, "eval_samples_per_second": 170.236, "eval_steps_per_second": 5.35, "step": 17250 }, { "epoch": 2.2826230373400183, "grad_norm": 11.594718933105469, "learning_rate": 4.809824623404162e-06, "loss": 0.5235, "step": 17300 }, { "epoch": 2.2892202137485156, "grad_norm": 15.447309494018555, "learning_rate": 4.765649158457393e-06, "loss": 0.5335, "step": 17350 }, { "epoch": 2.295817390157013, "grad_norm": 8.447811126708984, "learning_rate": 4.721473693510625e-06, "loss": 0.4915, "step": 17400 }, { "epoch": 2.30241456656551, "grad_norm": 11.305243492126465, "learning_rate": 4.677298228563856e-06, "loss": 0.4915, "step": 17450 }, { "epoch": 2.309011742974007, "grad_norm": 16.881988525390625, "learning_rate": 4.6331227636170875e-06, "loss": 0.4959, "step": 17500 }, { "epoch": 2.309011742974007, "eval_accuracy": 0.923265278339386, "eval_loss": 0.5674872398376465, "eval_runtime": 14.3341, "eval_samples_per_second": 170.921, "eval_steps_per_second": 5.372, "step": 17500 }, { "epoch": 2.3156089193825045, "grad_norm": 8.994074821472168, "learning_rate": 4.588947298670319e-06, "loss": 0.5161, "step": 17550 }, { "epoch": 2.3222060957910013, "grad_norm": 14.556620597839355, "learning_rate": 4.54477183372355e-06, "loss": 0.4944, "step": 17600 }, { "epoch": 2.3288032721994987, "grad_norm": 15.484505653381348, "learning_rate": 4.500596368776782e-06, "loss": 0.5052, "step": 17650 }, { "epoch": 2.335400448607996, "grad_norm": 6.690243244171143, "learning_rate": 4.456420903830013e-06, "loss": 0.4937, "step": 17700 }, { "epoch": 2.341997625016493, "grad_norm": 12.30466365814209, "learning_rate": 4.4131289481821795e-06, "loss": 0.4695, "step": 17750 }, { "epoch": 2.341997625016493, "eval_accuracy": 0.9253061413764954, "eval_loss": 0.561543345451355, "eval_runtime": 14.3063, "eval_samples_per_second": 171.254, "eval_steps_per_second": 5.382, "step": 17750 }, { "epoch": 2.34859480142499, "grad_norm": 14.061612129211426, "learning_rate": 4.3689534832354116e-06, "loss": 0.5159, "step": 17800 }, { "epoch": 2.355191977833487, "grad_norm": 17.726974487304688, "learning_rate": 4.324778018288643e-06, "loss": 0.4992, "step": 17850 }, { "epoch": 2.3617891542419844, "grad_norm": 7.066623687744141, "learning_rate": 4.280602553341875e-06, "loss": 0.5288, "step": 17900 }, { "epoch": 2.3683863306504818, "grad_norm": 18.694576263427734, "learning_rate": 4.236427088395106e-06, "loss": 0.5247, "step": 17950 }, { "epoch": 2.3749835070589786, "grad_norm": 14.194579124450684, "learning_rate": 4.192251623448337e-06, "loss": 0.5491, "step": 18000 }, { "epoch": 2.3749835070589786, "eval_accuracy": 0.9257143139839172, "eval_loss": 0.5593844652175903, "eval_runtime": 14.4387, "eval_samples_per_second": 169.683, "eval_steps_per_second": 5.333, "step": 18000 }, { "epoch": 2.381580683467476, "grad_norm": 8.745909690856934, "learning_rate": 4.148076158501568e-06, "loss": 0.5332, "step": 18050 }, { "epoch": 2.388177859875973, "grad_norm": 7.993963241577148, "learning_rate": 4.1039006935547995e-06, "loss": 0.529, "step": 18100 }, { "epoch": 2.39477503628447, "grad_norm": 11.705142974853516, "learning_rate": 4.0597252286080315e-06, "loss": 0.5534, "step": 18150 }, { "epoch": 2.4013722126929675, "grad_norm": 15.30136775970459, "learning_rate": 4.015549763661263e-06, "loss": 0.5595, "step": 18200 }, { "epoch": 2.4079693891014644, "grad_norm": 14.33283805847168, "learning_rate": 3.971374298714495e-06, "loss": 0.573, "step": 18250 }, { "epoch": 2.4079693891014644, "eval_accuracy": 0.9261224269866943, "eval_loss": 0.5610310435295105, "eval_runtime": 14.5813, "eval_samples_per_second": 168.023, "eval_steps_per_second": 5.281, "step": 18250 }, { "epoch": 2.4145665655099617, "grad_norm": 15.337475776672363, "learning_rate": 3.928082343066661e-06, "loss": 0.4859, "step": 18300 }, { "epoch": 2.421163741918459, "grad_norm": 14.803478240966797, "learning_rate": 3.883906878119892e-06, "loss": 0.5019, "step": 18350 }, { "epoch": 2.427760918326956, "grad_norm": 17.378925323486328, "learning_rate": 3.8397314131731236e-06, "loss": 0.4771, "step": 18400 }, { "epoch": 2.4343580947354533, "grad_norm": 11.473735809326172, "learning_rate": 3.7955559482263556e-06, "loss": 0.5062, "step": 18450 }, { "epoch": 2.4409552711439506, "grad_norm": 14.394603729248047, "learning_rate": 3.7513804832795868e-06, "loss": 0.5342, "step": 18500 }, { "epoch": 2.4409552711439506, "eval_accuracy": 0.9228571653366089, "eval_loss": 0.5616511106491089, "eval_runtime": 14.3525, "eval_samples_per_second": 170.701, "eval_steps_per_second": 5.365, "step": 18500 }, { "epoch": 2.4475524475524475, "grad_norm": 13.148124694824219, "learning_rate": 3.7072050183328184e-06, "loss": 0.5275, "step": 18550 }, { "epoch": 2.454149623960945, "grad_norm": 18.74552345275879, "learning_rate": 3.66302955338605e-06, "loss": 0.576, "step": 18600 }, { "epoch": 2.4607468003694417, "grad_norm": 9.60922622680664, "learning_rate": 3.6188540884392807e-06, "loss": 0.5172, "step": 18650 }, { "epoch": 2.467343976777939, "grad_norm": 19.15401840209961, "learning_rate": 3.5746786234925123e-06, "loss": 0.5127, "step": 18700 }, { "epoch": 2.4739411531864364, "grad_norm": 14.698090553283691, "learning_rate": 3.530503158545744e-06, "loss": 0.4728, "step": 18750 }, { "epoch": 2.4739411531864364, "eval_accuracy": 0.9248979687690735, "eval_loss": 0.5650564432144165, "eval_runtime": 14.464, "eval_samples_per_second": 169.387, "eval_steps_per_second": 5.324, "step": 18750 }, { "epoch": 2.4805383295949333, "grad_norm": 11.760072708129883, "learning_rate": 3.4863276935989755e-06, "loss": 0.4256, "step": 18800 }, { "epoch": 2.4871355060034306, "grad_norm": 11.057052612304688, "learning_rate": 3.442152228652207e-06, "loss": 0.4493, "step": 18850 }, { "epoch": 2.4937326824119275, "grad_norm": 14.985106468200684, "learning_rate": 3.3979767637054383e-06, "loss": 0.4881, "step": 18900 }, { "epoch": 2.500329858820425, "grad_norm": 8.24613094329834, "learning_rate": 3.35380129875867e-06, "loss": 0.4843, "step": 18950 }, { "epoch": 2.506927035228922, "grad_norm": 12.78288459777832, "learning_rate": 3.3096258338119015e-06, "loss": 0.517, "step": 19000 }, { "epoch": 2.506927035228922, "eval_accuracy": 0.9248979687690735, "eval_loss": 0.5626258850097656, "eval_runtime": 14.0676, "eval_samples_per_second": 174.159, "eval_steps_per_second": 5.474, "step": 19000 }, { "epoch": 2.513524211637419, "grad_norm": 13.174288749694824, "learning_rate": 3.265450368865132e-06, "loss": 0.5927, "step": 19050 }, { "epoch": 2.5201213880459163, "grad_norm": 6.971681594848633, "learning_rate": 3.221274903918364e-06, "loss": 0.5687, "step": 19100 }, { "epoch": 2.5267185644544137, "grad_norm": 10.98085880279541, "learning_rate": 3.1770994389715954e-06, "loss": 0.5261, "step": 19150 }, { "epoch": 2.5333157408629106, "grad_norm": 15.29484748840332, "learning_rate": 3.132923974024827e-06, "loss": 0.5698, "step": 19200 }, { "epoch": 2.539912917271408, "grad_norm": 22.54600715637207, "learning_rate": 3.088748509078058e-06, "loss": 0.5593, "step": 19250 }, { "epoch": 2.539912917271408, "eval_accuracy": 0.9269387722015381, "eval_loss": 0.5581403374671936, "eval_runtime": 14.0758, "eval_samples_per_second": 174.057, "eval_steps_per_second": 5.47, "step": 19250 }, { "epoch": 2.546510093679905, "grad_norm": 10.993823051452637, "learning_rate": 3.0445730441312898e-06, "loss": 0.571, "step": 19300 }, { "epoch": 2.553107270088402, "grad_norm": 21.144847869873047, "learning_rate": 3.0003975791845214e-06, "loss": 0.5606, "step": 19350 }, { "epoch": 2.5597044464968994, "grad_norm": 16.376079559326172, "learning_rate": 2.956222114237753e-06, "loss": 0.4912, "step": 19400 }, { "epoch": 2.5663016229053968, "grad_norm": 13.594402313232422, "learning_rate": 2.9120466492909837e-06, "loss": 0.4805, "step": 19450 }, { "epoch": 2.5728987993138936, "grad_norm": 17.23542594909668, "learning_rate": 2.8678711843442153e-06, "loss": 0.5324, "step": 19500 }, { "epoch": 2.5728987993138936, "eval_accuracy": 0.9281632900238037, "eval_loss": 0.553718626499176, "eval_runtime": 14.1602, "eval_samples_per_second": 173.02, "eval_steps_per_second": 5.438, "step": 19500 }, { "epoch": 2.579495975722391, "grad_norm": 17.32400131225586, "learning_rate": 2.823695719397447e-06, "loss": 0.5584, "step": 19550 }, { "epoch": 2.586093152130888, "grad_norm": 5.780141830444336, "learning_rate": 2.7795202544506785e-06, "loss": 0.508, "step": 19600 }, { "epoch": 2.592690328539385, "grad_norm": 12.641766548156738, "learning_rate": 2.7353447895039097e-06, "loss": 0.5231, "step": 19650 }, { "epoch": 2.599287504947882, "grad_norm": 18.93987464904785, "learning_rate": 2.6920528338560762e-06, "loss": 0.557, "step": 19700 }, { "epoch": 2.6058846813563794, "grad_norm": 15.360589027404785, "learning_rate": 2.647877368909308e-06, "loss": 0.5338, "step": 19750 }, { "epoch": 2.6058846813563794, "eval_accuracy": 0.9257143139839172, "eval_loss": 0.551838219165802, "eval_runtime": 15.1358, "eval_samples_per_second": 161.868, "eval_steps_per_second": 5.087, "step": 19750 }, { "epoch": 2.6124818577648767, "grad_norm": 15.32451057434082, "learning_rate": 2.6037019039625394e-06, "loss": 0.5037, "step": 19800 }, { "epoch": 2.6190790341733736, "grad_norm": 11.314981460571289, "learning_rate": 2.559526439015771e-06, "loss": 0.6057, "step": 19850 }, { "epoch": 2.625676210581871, "grad_norm": 7.916543006896973, "learning_rate": 2.5153509740690026e-06, "loss": 0.5571, "step": 19900 }, { "epoch": 2.6322733869903683, "grad_norm": 17.10308837890625, "learning_rate": 2.4711755091222338e-06, "loss": 0.5177, "step": 19950 }, { "epoch": 2.638870563398865, "grad_norm": 16.19850730895996, "learning_rate": 2.427000044175465e-06, "loss": 0.4946, "step": 20000 }, { "epoch": 2.638870563398865, "eval_accuracy": 0.9253061413764954, "eval_loss": 0.5547569394111633, "eval_runtime": 14.6842, "eval_samples_per_second": 166.846, "eval_steps_per_second": 5.244, "step": 20000 }, { "epoch": 2.6454677398073625, "grad_norm": 9.776342391967773, "learning_rate": 2.3828245792286966e-06, "loss": 0.5256, "step": 20050 }, { "epoch": 2.65206491621586, "grad_norm": 14.741767883300781, "learning_rate": 2.338649114281928e-06, "loss": 0.5107, "step": 20100 }, { "epoch": 2.6586620926243567, "grad_norm": 10.714197158813477, "learning_rate": 2.2944736493351593e-06, "loss": 0.5988, "step": 20150 }, { "epoch": 2.665259269032854, "grad_norm": 16.533546447753906, "learning_rate": 2.250298184388391e-06, "loss": 0.4907, "step": 20200 }, { "epoch": 2.6718564454413514, "grad_norm": 18.46228790283203, "learning_rate": 2.2061227194416225e-06, "loss": 0.4697, "step": 20250 }, { "epoch": 2.6718564454413514, "eval_accuracy": 0.9269387722015381, "eval_loss": 0.5565572381019592, "eval_runtime": 15.4517, "eval_samples_per_second": 158.559, "eval_steps_per_second": 4.983, "step": 20250 }, { "epoch": 2.6784536218498483, "grad_norm": 11.330911636352539, "learning_rate": 2.1619472544948537e-06, "loss": 0.4897, "step": 20300 }, { "epoch": 2.6850507982583456, "grad_norm": 12.666998863220215, "learning_rate": 2.1177717895480853e-06, "loss": 0.5088, "step": 20350 }, { "epoch": 2.6916479746668425, "grad_norm": 21.95562171936035, "learning_rate": 2.0735963246013165e-06, "loss": 0.5442, "step": 20400 }, { "epoch": 2.69824515107534, "grad_norm": 13.3275785446167, "learning_rate": 2.029420859654548e-06, "loss": 0.536, "step": 20450 }, { "epoch": 2.7048423274838367, "grad_norm": 16.374469757080078, "learning_rate": 1.9852453947077792e-06, "loss": 0.551, "step": 20500 }, { "epoch": 2.7048423274838367, "eval_accuracy": 0.9269387722015381, "eval_loss": 0.5562152862548828, "eval_runtime": 14.1322, "eval_samples_per_second": 173.363, "eval_steps_per_second": 5.449, "step": 20500 }, { "epoch": 2.711439503892334, "grad_norm": 9.546135902404785, "learning_rate": 1.941069929761011e-06, "loss": 0.5038, "step": 20550 }, { "epoch": 2.7180366803008313, "grad_norm": 8.056339263916016, "learning_rate": 1.8968944648142424e-06, "loss": 0.502, "step": 20600 }, { "epoch": 2.7246338567093282, "grad_norm": 15.2578706741333, "learning_rate": 1.8527189998674738e-06, "loss": 0.5021, "step": 20650 }, { "epoch": 2.7312310331178256, "grad_norm": 9.090350151062012, "learning_rate": 1.808543534920705e-06, "loss": 0.5441, "step": 20700 }, { "epoch": 2.737828209526323, "grad_norm": 8.323760032653809, "learning_rate": 1.7643680699739366e-06, "loss": 0.4818, "step": 20750 }, { "epoch": 2.737828209526323, "eval_accuracy": 0.9285714030265808, "eval_loss": 0.554760754108429, "eval_runtime": 14.1428, "eval_samples_per_second": 173.233, "eval_steps_per_second": 5.444, "step": 20750 }, { "epoch": 2.7444253859348198, "grad_norm": 9.076456069946289, "learning_rate": 1.720192605027168e-06, "loss": 0.5012, "step": 20800 }, { "epoch": 2.751022562343317, "grad_norm": 11.11436939239502, "learning_rate": 1.6760171400803996e-06, "loss": 0.5294, "step": 20850 }, { "epoch": 2.7576197387518144, "grad_norm": 10.291386604309082, "learning_rate": 1.6318416751336307e-06, "loss": 0.4674, "step": 20900 }, { "epoch": 2.7642169151603113, "grad_norm": 19.83849334716797, "learning_rate": 1.5876662101868623e-06, "loss": 0.5436, "step": 20950 }, { "epoch": 2.7708140915688086, "grad_norm": 12.49002456665039, "learning_rate": 1.5434907452400937e-06, "loss": 0.4609, "step": 21000 }, { "epoch": 2.7708140915688086, "eval_accuracy": 0.9269387722015381, "eval_loss": 0.5537921190261841, "eval_runtime": 14.1539, "eval_samples_per_second": 173.097, "eval_steps_per_second": 5.44, "step": 21000 }, { "epoch": 2.777411267977306, "grad_norm": 11.501051902770996, "learning_rate": 1.4993152802933253e-06, "loss": 0.5015, "step": 21050 }, { "epoch": 2.784008444385803, "grad_norm": 11.332602500915527, "learning_rate": 1.4551398153465565e-06, "loss": 0.5299, "step": 21100 }, { "epoch": 2.7906056207943, "grad_norm": 14.520770072937012, "learning_rate": 1.410964350399788e-06, "loss": 0.4363, "step": 21150 }, { "epoch": 2.797202797202797, "grad_norm": 19.919044494628906, "learning_rate": 1.3667888854530195e-06, "loss": 0.5018, "step": 21200 }, { "epoch": 2.8037999736112944, "grad_norm": 17.189006805419922, "learning_rate": 1.322613420506251e-06, "loss": 0.5079, "step": 21250 }, { "epoch": 2.8037999736112944, "eval_accuracy": 0.9265305995941162, "eval_loss": 0.5549395680427551, "eval_runtime": 14.2355, "eval_samples_per_second": 172.105, "eval_steps_per_second": 5.409, "step": 21250 }, { "epoch": 2.8103971500197913, "grad_norm": 10.528189659118652, "learning_rate": 1.2784379555594823e-06, "loss": 0.4467, "step": 21300 }, { "epoch": 2.8169943264282886, "grad_norm": 10.16166877746582, "learning_rate": 1.2342624906127139e-06, "loss": 0.5769, "step": 21350 }, { "epoch": 2.823591502836786, "grad_norm": 5.988204002380371, "learning_rate": 1.1900870256659452e-06, "loss": 0.5323, "step": 21400 }, { "epoch": 2.830188679245283, "grad_norm": 18.418853759765625, "learning_rate": 1.1459115607191766e-06, "loss": 0.4714, "step": 21450 }, { "epoch": 2.83678585565378, "grad_norm": 12.283252716064453, "learning_rate": 1.1017360957724082e-06, "loss": 0.4491, "step": 21500 }, { "epoch": 2.83678585565378, "eval_accuracy": 0.9257143139839172, "eval_loss": 0.5538486242294312, "eval_runtime": 14.1053, "eval_samples_per_second": 173.693, "eval_steps_per_second": 5.459, "step": 21500 }, { "epoch": 2.8433830320622775, "grad_norm": 6.51999568939209, "learning_rate": 1.0575606308256394e-06, "loss": 0.4801, "step": 21550 }, { "epoch": 2.8499802084707744, "grad_norm": 12.273625373840332, "learning_rate": 1.013385165878871e-06, "loss": 0.5132, "step": 21600 }, { "epoch": 2.8565773848792717, "grad_norm": 14.393851280212402, "learning_rate": 9.692097009321024e-07, "loss": 0.4542, "step": 21650 }, { "epoch": 2.863174561287769, "grad_norm": 14.198440551757812, "learning_rate": 9.250342359853339e-07, "loss": 0.5015, "step": 21700 }, { "epoch": 2.869771737696266, "grad_norm": 17.55302619934082, "learning_rate": 8.808587710385653e-07, "loss": 0.4818, "step": 21750 }, { "epoch": 2.869771737696266, "eval_accuracy": 0.9277551174163818, "eval_loss": 0.5553678870201111, "eval_runtime": 14.0909, "eval_samples_per_second": 173.871, "eval_steps_per_second": 5.465, "step": 21750 }, { "epoch": 2.8763689141047633, "grad_norm": 14.505319595336914, "learning_rate": 8.366833060917967e-07, "loss": 0.5078, "step": 21800 }, { "epoch": 2.8829660905132606, "grad_norm": 15.411627769470215, "learning_rate": 7.92507841145028e-07, "loss": 0.508, "step": 21850 }, { "epoch": 2.8895632669217575, "grad_norm": 17.951257705688477, "learning_rate": 7.483323761982596e-07, "loss": 0.5331, "step": 21900 }, { "epoch": 2.896160443330255, "grad_norm": 8.112133979797363, "learning_rate": 7.041569112514909e-07, "loss": 0.5185, "step": 21950 }, { "epoch": 2.9027576197387517, "grad_norm": 16.92367172241211, "learning_rate": 6.599814463047224e-07, "loss": 0.4469, "step": 22000 }, { "epoch": 2.9027576197387517, "eval_accuracy": 0.9265305995941162, "eval_loss": 0.555115818977356, "eval_runtime": 14.0889, "eval_samples_per_second": 173.896, "eval_steps_per_second": 5.465, "step": 22000 }, { "epoch": 2.909354796147249, "grad_norm": 7.048780918121338, "learning_rate": 6.158059813579539e-07, "loss": 0.4762, "step": 22050 }, { "epoch": 2.915951972555746, "grad_norm": 16.79896354675293, "learning_rate": 5.716305164111853e-07, "loss": 0.5799, "step": 22100 }, { "epoch": 2.9225491489642432, "grad_norm": 10.826476097106934, "learning_rate": 5.274550514644168e-07, "loss": 0.4978, "step": 22150 }, { "epoch": 2.9291463253727406, "grad_norm": 12.840262413024902, "learning_rate": 4.832795865176481e-07, "loss": 0.566, "step": 22200 }, { "epoch": 2.9357435017812374, "grad_norm": 20.16173553466797, "learning_rate": 4.391041215708796e-07, "loss": 0.5837, "step": 22250 }, { "epoch": 2.9357435017812374, "eval_accuracy": 0.9269387722015381, "eval_loss": 0.5531713962554932, "eval_runtime": 14.0502, "eval_samples_per_second": 174.375, "eval_steps_per_second": 5.48, "step": 22250 }, { "epoch": 2.9423406781897348, "grad_norm": 12.399968147277832, "learning_rate": 3.94928656624111e-07, "loss": 0.5401, "step": 22300 }, { "epoch": 2.948937854598232, "grad_norm": 18.302248001098633, "learning_rate": 3.5075319167734247e-07, "loss": 0.523, "step": 22350 }, { "epoch": 2.955535031006729, "grad_norm": 13.304845809936523, "learning_rate": 3.0657772673057385e-07, "loss": 0.5913, "step": 22400 }, { "epoch": 2.9621322074152263, "grad_norm": 18.745372772216797, "learning_rate": 2.624022617838053e-07, "loss": 0.4701, "step": 22450 }, { "epoch": 2.9687293838237236, "grad_norm": 14.230325698852539, "learning_rate": 2.1822679683703673e-07, "loss": 0.5568, "step": 22500 }, { "epoch": 2.9687293838237236, "eval_accuracy": 0.92734694480896, "eval_loss": 0.5528694987297058, "eval_runtime": 14.0688, "eval_samples_per_second": 174.144, "eval_steps_per_second": 5.473, "step": 22500 }, { "epoch": 2.9753265602322205, "grad_norm": 13.408769607543945, "learning_rate": 1.7405133189026817e-07, "loss": 0.5266, "step": 22550 }, { "epoch": 2.981923736640718, "grad_norm": 14.969887733459473, "learning_rate": 1.298758669434996e-07, "loss": 0.4969, "step": 22600 }, { "epoch": 2.988520913049215, "grad_norm": 3.428957939147949, "learning_rate": 8.570040199673103e-08, "loss": 0.4917, "step": 22650 }, { "epoch": 2.995118089457712, "grad_norm": 9.547283172607422, "learning_rate": 4.152493704996245e-08, "loss": 0.5128, "step": 22700 } ], "logging_steps": 50, "max_steps": 22737, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }