{ "best_global_step": 360, "best_metric": 1.0, "best_model_checkpoint": "/projects/bffw/darora1/llm_ipc/final_models/mpi_async_n3/checkpoint-360", "epoch": 0.2158273381294964, "eval_steps": 40, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001199040767386091, "grad_norm": 19.730182647705078, "learning_rate": 2.0000000000000002e-07, "loss": 1.2585, "step": 2 }, { "epoch": 0.002398081534772182, "grad_norm": 23.144609451293945, "learning_rate": 6.000000000000001e-07, "loss": 1.3757, "step": 4 }, { "epoch": 0.0035971223021582736, "grad_norm": 17.20735740661621, "learning_rate": 1.0000000000000002e-06, "loss": 1.1802, "step": 6 }, { "epoch": 0.004796163069544364, "grad_norm": 17.95670509338379, "learning_rate": 1.4000000000000001e-06, "loss": 1.1788, "step": 8 }, { "epoch": 0.005995203836930456, "grad_norm": 15.297016143798828, "learning_rate": 1.8000000000000001e-06, "loss": 1.0785, "step": 10 }, { "epoch": 0.007194244604316547, "grad_norm": 15.091691017150879, "learning_rate": 2.2e-06, "loss": 1.0445, "step": 12 }, { "epoch": 0.008393285371702638, "grad_norm": 9.994537353515625, "learning_rate": 2.6e-06, "loss": 0.8632, "step": 14 }, { "epoch": 0.009592326139088728, "grad_norm": 7.879035472869873, "learning_rate": 3e-06, "loss": 0.6384, "step": 16 }, { "epoch": 0.01079136690647482, "grad_norm": 10.125263214111328, "learning_rate": 3.4000000000000005e-06, "loss": 0.6511, "step": 18 }, { "epoch": 0.011990407673860911, "grad_norm": 6.4151201248168945, "learning_rate": 3.8000000000000005e-06, "loss": 0.4297, "step": 20 }, { "epoch": 0.013189448441247002, "grad_norm": 4.507650375366211, "learning_rate": 4.2000000000000004e-06, "loss": 0.3096, "step": 22 }, { "epoch": 0.014388489208633094, "grad_norm": 3.6591382026672363, "learning_rate": 4.600000000000001e-06, "loss": 0.2421, "step": 24 }, { "epoch": 0.015587529976019185, "grad_norm": 3.2937803268432617, "learning_rate": 5e-06, "loss": 0.1674, "step": 26 }, { "epoch": 0.016786570743405275, "grad_norm": 3.2421910762786865, "learning_rate": 5.400000000000001e-06, "loss": 0.1182, "step": 28 }, { "epoch": 0.017985611510791366, "grad_norm": 2.621964693069458, "learning_rate": 5.8e-06, "loss": 0.1051, "step": 30 }, { "epoch": 0.019184652278177457, "grad_norm": 2.452547550201416, "learning_rate": 6.200000000000001e-06, "loss": 0.0872, "step": 32 }, { "epoch": 0.02038369304556355, "grad_norm": 1.978013038635254, "learning_rate": 6.600000000000001e-06, "loss": 0.079, "step": 34 }, { "epoch": 0.02158273381294964, "grad_norm": 1.5187039375305176, "learning_rate": 7e-06, "loss": 0.0635, "step": 36 }, { "epoch": 0.022781774580335732, "grad_norm": 1.7821204662322998, "learning_rate": 7.4e-06, "loss": 0.0496, "step": 38 }, { "epoch": 0.023980815347721823, "grad_norm": 2.5544259548187256, "learning_rate": 7.800000000000002e-06, "loss": 0.038, "step": 40 }, { "epoch": 0.023980815347721823, "eval_accuracy": 0.9842424205572071, "eval_loss": 0.044016700237989426, "eval_runtime": 153.6211, "eval_samples_per_second": 32.548, "eval_steps_per_second": 16.274, "step": 40 }, { "epoch": 0.025179856115107913, "grad_norm": 2.1579580307006836, "learning_rate": 8.2e-06, "loss": 0.0405, "step": 42 }, { "epoch": 0.026378896882494004, "grad_norm": 2.6840529441833496, "learning_rate": 8.6e-06, "loss": 0.0418, "step": 44 }, { "epoch": 0.027577937649880094, "grad_norm": 3.79921555519104, "learning_rate": 9e-06, "loss": 0.0309, "step": 46 }, { "epoch": 0.02877697841726619, "grad_norm": 2.689685344696045, "learning_rate": 9.4e-06, "loss": 0.0292, "step": 48 }, { "epoch": 0.02997601918465228, "grad_norm": 1.4302867650985718, "learning_rate": 9.800000000000001e-06, "loss": 0.0268, "step": 50 }, { "epoch": 0.03117505995203837, "grad_norm": 1.27105712890625, "learning_rate": 9.999998993000299e-06, "loss": 0.0261, "step": 52 }, { "epoch": 0.03237410071942446, "grad_norm": 1.1987839937210083, "learning_rate": 9.999990937005126e-06, "loss": 0.0199, "step": 54 }, { "epoch": 0.03357314148681055, "grad_norm": 0.746536135673523, "learning_rate": 9.999974825027756e-06, "loss": 0.0142, "step": 56 }, { "epoch": 0.03477218225419664, "grad_norm": 1.485276222229004, "learning_rate": 9.999950657094151e-06, "loss": 0.0151, "step": 58 }, { "epoch": 0.03597122302158273, "grad_norm": 1.5013028383255005, "learning_rate": 9.999918433243253e-06, "loss": 0.0129, "step": 60 }, { "epoch": 0.03717026378896882, "grad_norm": 0.8688841462135315, "learning_rate": 9.999878153526974e-06, "loss": 0.0103, "step": 62 }, { "epoch": 0.03836930455635491, "grad_norm": 0.7988501191139221, "learning_rate": 9.99982981801022e-06, "loss": 0.0098, "step": 64 }, { "epoch": 0.039568345323741004, "grad_norm": 1.101700782775879, "learning_rate": 9.999773426770864e-06, "loss": 0.0089, "step": 66 }, { "epoch": 0.0407673860911271, "grad_norm": 1.0563534498214722, "learning_rate": 9.999708979899769e-06, "loss": 0.0082, "step": 68 }, { "epoch": 0.04196642685851319, "grad_norm": 0.845487117767334, "learning_rate": 9.999636477500765e-06, "loss": 0.0082, "step": 70 }, { "epoch": 0.04316546762589928, "grad_norm": 0.8372548818588257, "learning_rate": 9.999555919690673e-06, "loss": 0.0072, "step": 72 }, { "epoch": 0.04436450839328537, "grad_norm": 0.7056983709335327, "learning_rate": 9.999467306599285e-06, "loss": 0.0058, "step": 74 }, { "epoch": 0.045563549160671464, "grad_norm": 0.49338242411613464, "learning_rate": 9.999370638369377e-06, "loss": 0.0039, "step": 76 }, { "epoch": 0.046762589928057555, "grad_norm": 0.8962873816490173, "learning_rate": 9.999265915156697e-06, "loss": 0.0049, "step": 78 }, { "epoch": 0.047961630695443645, "grad_norm": 1.2066389322280884, "learning_rate": 9.999153137129978e-06, "loss": 0.005, "step": 80 }, { "epoch": 0.047961630695443645, "eval_accuracy": 0.9984770185297835, "eval_loss": 0.0045981272123754025, "eval_runtime": 149.4888, "eval_samples_per_second": 33.447, "eval_steps_per_second": 16.724, "step": 80 }, { "epoch": 0.049160671462829736, "grad_norm": 0.41922950744628906, "learning_rate": 9.999032304470926e-06, "loss": 0.0033, "step": 82 }, { "epoch": 0.050359712230215826, "grad_norm": 0.6883206367492676, "learning_rate": 9.998903417374228e-06, "loss": 0.0024, "step": 84 }, { "epoch": 0.05155875299760192, "grad_norm": 0.4788949489593506, "learning_rate": 9.998766476047546e-06, "loss": 0.0025, "step": 86 }, { "epoch": 0.05275779376498801, "grad_norm": 0.6724292039871216, "learning_rate": 9.998621480711522e-06, "loss": 0.0019, "step": 88 }, { "epoch": 0.0539568345323741, "grad_norm": 1.373579978942871, "learning_rate": 9.998468431599768e-06, "loss": 0.003, "step": 90 }, { "epoch": 0.05515587529976019, "grad_norm": 1.0157880783081055, "learning_rate": 9.99830732895888e-06, "loss": 0.0028, "step": 92 }, { "epoch": 0.05635491606714628, "grad_norm": 1.0597639083862305, "learning_rate": 9.998138173048424e-06, "loss": 0.0021, "step": 94 }, { "epoch": 0.05755395683453238, "grad_norm": 1.0094547271728516, "learning_rate": 9.997960964140946e-06, "loss": 0.002, "step": 96 }, { "epoch": 0.05875299760191847, "grad_norm": 1.0186887979507446, "learning_rate": 9.997775702521965e-06, "loss": 0.0024, "step": 98 }, { "epoch": 0.05995203836930456, "grad_norm": 0.6651451587677002, "learning_rate": 9.997582388489975e-06, "loss": 0.0009, "step": 100 }, { "epoch": 0.06115107913669065, "grad_norm": 0.9047290682792664, "learning_rate": 9.99738102235644e-06, "loss": 0.0025, "step": 102 }, { "epoch": 0.06235011990407674, "grad_norm": 0.3253116011619568, "learning_rate": 9.997171604445803e-06, "loss": 0.0012, "step": 104 }, { "epoch": 0.06354916067146282, "grad_norm": 1.0086337327957153, "learning_rate": 9.99695413509548e-06, "loss": 0.0019, "step": 106 }, { "epoch": 0.06474820143884892, "grad_norm": 0.7249751687049866, "learning_rate": 9.996728614655854e-06, "loss": 0.0018, "step": 108 }, { "epoch": 0.06594724220623502, "grad_norm": 0.6050100326538086, "learning_rate": 9.996495043490285e-06, "loss": 0.0012, "step": 110 }, { "epoch": 0.0671462829736211, "grad_norm": 0.7636982798576355, "learning_rate": 9.996253421975103e-06, "loss": 0.0015, "step": 112 }, { "epoch": 0.0683453237410072, "grad_norm": 0.5241732001304626, "learning_rate": 9.996003750499608e-06, "loss": 0.001, "step": 114 }, { "epoch": 0.06954436450839328, "grad_norm": 0.4341820180416107, "learning_rate": 9.995746029466071e-06, "loss": 0.001, "step": 116 }, { "epoch": 0.07074340527577938, "grad_norm": 1.206058382987976, "learning_rate": 9.995480259289731e-06, "loss": 0.002, "step": 118 }, { "epoch": 0.07194244604316546, "grad_norm": 0.501316249370575, "learning_rate": 9.995206440398798e-06, "loss": 0.0011, "step": 120 }, { "epoch": 0.07194244604316546, "eval_accuracy": 0.9991422278695195, "eval_loss": 0.002768160542473197, "eval_runtime": 150.9106, "eval_samples_per_second": 33.132, "eval_steps_per_second": 16.566, "step": 120 }, { "epoch": 0.07314148681055156, "grad_norm": 0.918229877948761, "learning_rate": 9.994924573234448e-06, "loss": 0.0028, "step": 122 }, { "epoch": 0.07434052757793765, "grad_norm": 0.28935667872428894, "learning_rate": 9.994634658250825e-06, "loss": 0.0022, "step": 124 }, { "epoch": 0.07553956834532374, "grad_norm": 0.5728291869163513, "learning_rate": 9.994336695915041e-06, "loss": 0.0013, "step": 126 }, { "epoch": 0.07673860911270983, "grad_norm": 0.28649207949638367, "learning_rate": 9.994030686707171e-06, "loss": 0.0014, "step": 128 }, { "epoch": 0.07793764988009592, "grad_norm": 0.3085499703884125, "learning_rate": 9.993716631120259e-06, "loss": 0.0018, "step": 130 }, { "epoch": 0.07913669064748201, "grad_norm": 0.3971754312515259, "learning_rate": 9.993394529660307e-06, "loss": 0.0009, "step": 132 }, { "epoch": 0.0803357314148681, "grad_norm": 0.15936186909675598, "learning_rate": 9.99306438284629e-06, "loss": 0.0008, "step": 134 }, { "epoch": 0.0815347721822542, "grad_norm": 0.4664164185523987, "learning_rate": 9.992726191210139e-06, "loss": 0.0008, "step": 136 }, { "epoch": 0.08273381294964029, "grad_norm": 0.2838437557220459, "learning_rate": 9.992379955296745e-06, "loss": 0.0004, "step": 138 }, { "epoch": 0.08393285371702638, "grad_norm": 0.2806377112865448, "learning_rate": 9.992025675663966e-06, "loss": 0.0005, "step": 140 }, { "epoch": 0.08513189448441247, "grad_norm": 0.8728582859039307, "learning_rate": 9.991663352882615e-06, "loss": 0.0005, "step": 142 }, { "epoch": 0.08633093525179857, "grad_norm": 0.33335745334625244, "learning_rate": 9.991292987536469e-06, "loss": 0.0003, "step": 144 }, { "epoch": 0.08752997601918465, "grad_norm": 0.43252527713775635, "learning_rate": 9.990914580222258e-06, "loss": 0.0006, "step": 146 }, { "epoch": 0.08872901678657075, "grad_norm": 0.36625614762306213, "learning_rate": 9.990528131549674e-06, "loss": 0.0003, "step": 148 }, { "epoch": 0.08992805755395683, "grad_norm": 0.1192215234041214, "learning_rate": 9.990133642141359e-06, "loss": 0.0005, "step": 150 }, { "epoch": 0.09112709832134293, "grad_norm": 0.26630905270576477, "learning_rate": 9.989731112632917e-06, "loss": 0.0007, "step": 152 }, { "epoch": 0.09232613908872901, "grad_norm": 0.1390163153409958, "learning_rate": 9.989320543672904e-06, "loss": 0.0004, "step": 154 }, { "epoch": 0.09352517985611511, "grad_norm": 0.05628788471221924, "learning_rate": 9.988901935922826e-06, "loss": 0.0004, "step": 156 }, { "epoch": 0.09472422062350119, "grad_norm": 0.6800597310066223, "learning_rate": 9.988475290057145e-06, "loss": 0.0007, "step": 158 }, { "epoch": 0.09592326139088729, "grad_norm": 0.2103985697031021, "learning_rate": 9.988040606763272e-06, "loss": 0.001, "step": 160 }, { "epoch": 0.09592326139088729, "eval_accuracy": 0.9999359200176863, "eval_loss": 0.0002029576717177406, "eval_runtime": 153.972, "eval_samples_per_second": 32.473, "eval_steps_per_second": 16.237, "step": 160 }, { "epoch": 0.09712230215827339, "grad_norm": 0.4128981828689575, "learning_rate": 9.98759788674157e-06, "loss": 0.0004, "step": 162 }, { "epoch": 0.09832134292565947, "grad_norm": 0.46466055512428284, "learning_rate": 9.987147130705347e-06, "loss": 0.0009, "step": 164 }, { "epoch": 0.09952038369304557, "grad_norm": 0.7768604755401611, "learning_rate": 9.986688339380863e-06, "loss": 0.0005, "step": 166 }, { "epoch": 0.10071942446043165, "grad_norm": 0.5354371070861816, "learning_rate": 9.98622151350732e-06, "loss": 0.0009, "step": 168 }, { "epoch": 0.10191846522781775, "grad_norm": 0.33338215947151184, "learning_rate": 9.985746653836867e-06, "loss": 0.0005, "step": 170 }, { "epoch": 0.10311750599520383, "grad_norm": 0.15407763421535492, "learning_rate": 9.985263761134602e-06, "loss": 0.0002, "step": 172 }, { "epoch": 0.10431654676258993, "grad_norm": 0.07146434485912323, "learning_rate": 9.984772836178559e-06, "loss": 0.0003, "step": 174 }, { "epoch": 0.10551558752997602, "grad_norm": 0.12465538829565048, "learning_rate": 9.984273879759713e-06, "loss": 0.0001, "step": 176 }, { "epoch": 0.10671462829736211, "grad_norm": 0.22077329456806183, "learning_rate": 9.983766892681985e-06, "loss": 0.0002, "step": 178 }, { "epoch": 0.1079136690647482, "grad_norm": 0.06069932505488396, "learning_rate": 9.983251875762234e-06, "loss": 0.0002, "step": 180 }, { "epoch": 0.1091127098321343, "grad_norm": 0.05713481828570366, "learning_rate": 9.982728829830252e-06, "loss": 0.0001, "step": 182 }, { "epoch": 0.11031175059952038, "grad_norm": 0.09267017990350723, "learning_rate": 9.982197755728771e-06, "loss": 0.0003, "step": 184 }, { "epoch": 0.11151079136690648, "grad_norm": 0.047195322811603546, "learning_rate": 9.981658654313458e-06, "loss": 0.0, "step": 186 }, { "epoch": 0.11270983213429256, "grad_norm": 0.042289845645427704, "learning_rate": 9.981111526452912e-06, "loss": 0.0, "step": 188 }, { "epoch": 0.11390887290167866, "grad_norm": 0.08245964348316193, "learning_rate": 9.980556373028665e-06, "loss": 0.0002, "step": 190 }, { "epoch": 0.11510791366906475, "grad_norm": 0.01631307043135166, "learning_rate": 9.979993194935182e-06, "loss": 0.0, "step": 192 }, { "epoch": 0.11630695443645084, "grad_norm": 0.22155308723449707, "learning_rate": 9.979421993079853e-06, "loss": 0.0001, "step": 194 }, { "epoch": 0.11750599520383694, "grad_norm": 0.08721671253442764, "learning_rate": 9.978842768382999e-06, "loss": 0.0, "step": 196 }, { "epoch": 0.11870503597122302, "grad_norm": 0.00864589773118496, "learning_rate": 9.978255521777865e-06, "loss": 0.0, "step": 198 }, { "epoch": 0.11990407673860912, "grad_norm": 0.025131428614258766, "learning_rate": 9.977660254210623e-06, "loss": 0.0, "step": 200 }, { "epoch": 0.11990407673860912, "eval_accuracy": 0.9999842230318391, "eval_loss": 4.0982533391797915e-05, "eval_runtime": 156.2587, "eval_samples_per_second": 31.998, "eval_steps_per_second": 15.999, "step": 200 }, { "epoch": 0.1211031175059952, "grad_norm": 0.08378835767507553, "learning_rate": 9.977056966640368e-06, "loss": 0.0, "step": 202 }, { "epoch": 0.1223021582733813, "grad_norm": 0.004677386488765478, "learning_rate": 9.976445660039118e-06, "loss": 0.0, "step": 204 }, { "epoch": 0.12350119904076738, "grad_norm": 0.012990830466151237, "learning_rate": 9.975826335391808e-06, "loss": 0.0, "step": 206 }, { "epoch": 0.12470023980815348, "grad_norm": 0.06871869415044785, "learning_rate": 9.975198993696294e-06, "loss": 0.0, "step": 208 }, { "epoch": 0.12589928057553956, "grad_norm": 0.03185407817363739, "learning_rate": 9.974563635963348e-06, "loss": 0.0, "step": 210 }, { "epoch": 0.12709832134292565, "grad_norm": 0.007331969682127237, "learning_rate": 9.973920263216658e-06, "loss": 0.0, "step": 212 }, { "epoch": 0.12829736211031176, "grad_norm": 0.006134955212473869, "learning_rate": 9.973268876492827e-06, "loss": 0.0, "step": 214 }, { "epoch": 0.12949640287769784, "grad_norm": 0.002639917889609933, "learning_rate": 9.972609476841368e-06, "loss": 0.0, "step": 216 }, { "epoch": 0.13069544364508393, "grad_norm": 0.139603853225708, "learning_rate": 9.971942065324704e-06, "loss": 0.0, "step": 218 }, { "epoch": 0.13189448441247004, "grad_norm": 0.002388500142842531, "learning_rate": 9.971266643018171e-06, "loss": 0.0, "step": 220 }, { "epoch": 0.13309352517985612, "grad_norm": 0.005431812256574631, "learning_rate": 9.970583211010008e-06, "loss": 0.0, "step": 222 }, { "epoch": 0.1342925659472422, "grad_norm": 0.002608460607007146, "learning_rate": 9.969891770401358e-06, "loss": 0.0, "step": 224 }, { "epoch": 0.1354916067146283, "grad_norm": 0.008019981905817986, "learning_rate": 9.969192322306271e-06, "loss": 0.0, "step": 226 }, { "epoch": 0.1366906474820144, "grad_norm": 0.08590810745954514, "learning_rate": 9.968484867851698e-06, "loss": 0.0, "step": 228 }, { "epoch": 0.13788968824940048, "grad_norm": 0.004419062752276659, "learning_rate": 9.96776940817749e-06, "loss": 0.0, "step": 230 }, { "epoch": 0.13908872901678657, "grad_norm": 0.08247098326683044, "learning_rate": 9.967045944436392e-06, "loss": 0.0, "step": 232 }, { "epoch": 0.14028776978417265, "grad_norm": 0.001451000920496881, "learning_rate": 9.966314477794052e-06, "loss": 0.0, "step": 234 }, { "epoch": 0.14148681055155876, "grad_norm": 0.0005800220533274114, "learning_rate": 9.965575009429006e-06, "loss": 0.0, "step": 236 }, { "epoch": 0.14268585131894485, "grad_norm": 0.0006440122961066663, "learning_rate": 9.964827540532685e-06, "loss": 0.0, "step": 238 }, { "epoch": 0.14388489208633093, "grad_norm": 0.002776832552626729, "learning_rate": 9.964072072309412e-06, "loss": 0.0, "step": 240 }, { "epoch": 0.14388489208633093, "eval_accuracy": 0.9999919991999199, "eval_loss": 1.5212925063678995e-05, "eval_runtime": 187.7963, "eval_samples_per_second": 26.625, "eval_steps_per_second": 13.312, "step": 240 }, { "epoch": 0.145083932853717, "grad_norm": 0.0011048481101170182, "learning_rate": 9.963308605976397e-06, "loss": 0.0, "step": 242 }, { "epoch": 0.14628297362110312, "grad_norm": 0.06405247747898102, "learning_rate": 9.962537142763733e-06, "loss": 0.0, "step": 244 }, { "epoch": 0.1474820143884892, "grad_norm": 0.0006763112614862621, "learning_rate": 9.961757683914406e-06, "loss": 0.0, "step": 246 }, { "epoch": 0.1486810551558753, "grad_norm": 0.0009437328553758562, "learning_rate": 9.960970230684276e-06, "loss": 0.0, "step": 248 }, { "epoch": 0.1498800959232614, "grad_norm": 0.005290859844535589, "learning_rate": 9.96017478434209e-06, "loss": 0.0, "step": 250 }, { "epoch": 0.1510791366906475, "grad_norm": 0.0017193189123645425, "learning_rate": 9.959371346169466e-06, "loss": 0.0, "step": 252 }, { "epoch": 0.15227817745803357, "grad_norm": 0.00039530443609692156, "learning_rate": 9.958559917460909e-06, "loss": 0.0, "step": 254 }, { "epoch": 0.15347721822541965, "grad_norm": 0.0005756777245551348, "learning_rate": 9.957740499523787e-06, "loss": 0.0, "step": 256 }, { "epoch": 0.15467625899280577, "grad_norm": 0.2923714220523834, "learning_rate": 9.95691309367835e-06, "loss": 0.0001, "step": 258 }, { "epoch": 0.15587529976019185, "grad_norm": 0.0004579645174089819, "learning_rate": 9.95607770125771e-06, "loss": 0.0, "step": 260 }, { "epoch": 0.15707434052757793, "grad_norm": 0.00043058019946329296, "learning_rate": 9.955234323607854e-06, "loss": 0.0, "step": 262 }, { "epoch": 0.15827338129496402, "grad_norm": 0.0007559367222711444, "learning_rate": 9.954382962087628e-06, "loss": 0.0, "step": 264 }, { "epoch": 0.15947242206235013, "grad_norm": 0.0007242615101858974, "learning_rate": 9.95352361806875e-06, "loss": 0.0, "step": 266 }, { "epoch": 0.1606714628297362, "grad_norm": 0.001006856095045805, "learning_rate": 9.95265629293579e-06, "loss": 0.0, "step": 268 }, { "epoch": 0.1618705035971223, "grad_norm": 0.0025918015744537115, "learning_rate": 9.951780988086183e-06, "loss": 0.0, "step": 270 }, { "epoch": 0.1630695443645084, "grad_norm": 0.0006822652067057788, "learning_rate": 9.950897704930223e-06, "loss": 0.0002, "step": 272 }, { "epoch": 0.1642685851318945, "grad_norm": 0.0006394693627953529, "learning_rate": 9.95000644489105e-06, "loss": 0.0, "step": 274 }, { "epoch": 0.16546762589928057, "grad_norm": 0.0007676673121750355, "learning_rate": 9.949107209404664e-06, "loss": 0.0, "step": 276 }, { "epoch": 0.16666666666666666, "grad_norm": 0.001793363830074668, "learning_rate": 9.948199999919914e-06, "loss": 0.0, "step": 278 }, { "epoch": 0.16786570743405277, "grad_norm": 0.0004261991416569799, "learning_rate": 9.947284817898493e-06, "loss": 0.0, "step": 280 }, { "epoch": 0.16786570743405277, "eval_accuracy": 0.9999959789856537, "eval_loss": 2.9676788471988402e-05, "eval_runtime": 169.8758, "eval_samples_per_second": 29.433, "eval_steps_per_second": 14.717, "step": 280 }, { "epoch": 0.16906474820143885, "grad_norm": 0.0006203249213285744, "learning_rate": 9.946361664814942e-06, "loss": 0.0, "step": 282 }, { "epoch": 0.17026378896882494, "grad_norm": 0.42489224672317505, "learning_rate": 9.945430542156647e-06, "loss": 0.0003, "step": 284 }, { "epoch": 0.17146282973621102, "grad_norm": 0.0006673445459455252, "learning_rate": 9.944491451423829e-06, "loss": 0.0, "step": 286 }, { "epoch": 0.17266187050359713, "grad_norm": 0.001332795829512179, "learning_rate": 9.943544394129552e-06, "loss": 0.0, "step": 288 }, { "epoch": 0.17386091127098321, "grad_norm": 0.0024546540807932615, "learning_rate": 9.942589371799715e-06, "loss": 0.0, "step": 290 }, { "epoch": 0.1750599520383693, "grad_norm": 0.02707788720726967, "learning_rate": 9.941626385973047e-06, "loss": 0.0, "step": 292 }, { "epoch": 0.17625899280575538, "grad_norm": 0.02786150760948658, "learning_rate": 9.940655438201113e-06, "loss": 0.0, "step": 294 }, { "epoch": 0.1774580335731415, "grad_norm": 0.0014117067912593484, "learning_rate": 9.9396765300483e-06, "loss": 0.0, "step": 296 }, { "epoch": 0.17865707434052758, "grad_norm": 0.001168401911854744, "learning_rate": 9.938689663091828e-06, "loss": 0.0, "step": 298 }, { "epoch": 0.17985611510791366, "grad_norm": 0.0008562824805267155, "learning_rate": 9.937694838921734e-06, "loss": 0.0, "step": 300 }, { "epoch": 0.18105515587529977, "grad_norm": 0.0015114444540813565, "learning_rate": 9.93669205914088e-06, "loss": 0.0, "step": 302 }, { "epoch": 0.18225419664268586, "grad_norm": 0.0009819003753364086, "learning_rate": 9.93568132536494e-06, "loss": 0.0, "step": 304 }, { "epoch": 0.18345323741007194, "grad_norm": 0.0008016325882636011, "learning_rate": 9.934662639222412e-06, "loss": 0.0, "step": 306 }, { "epoch": 0.18465227817745802, "grad_norm": 0.0034802353475242853, "learning_rate": 9.9336360023546e-06, "loss": 0.0, "step": 308 }, { "epoch": 0.18585131894484413, "grad_norm": 0.001296436763368547, "learning_rate": 9.932601416415622e-06, "loss": 0.0, "step": 310 }, { "epoch": 0.18705035971223022, "grad_norm": 0.002522464143112302, "learning_rate": 9.931558883072403e-06, "loss": 0.0, "step": 312 }, { "epoch": 0.1882494004796163, "grad_norm": 0.0009957245783880353, "learning_rate": 9.930508404004668e-06, "loss": 0.0, "step": 314 }, { "epoch": 0.18944844124700239, "grad_norm": 0.0007532662712037563, "learning_rate": 9.929449980904952e-06, "loss": 0.0, "step": 316 }, { "epoch": 0.1906474820143885, "grad_norm": 0.0008860212983563542, "learning_rate": 9.928383615478586e-06, "loss": 0.0, "step": 318 }, { "epoch": 0.19184652278177458, "grad_norm": 0.0017747296951711178, "learning_rate": 9.927309309443696e-06, "loss": 0.0, "step": 320 }, { "epoch": 0.19184652278177458, "eval_accuracy": 0.9999920792079207, "eval_loss": 2.161687552870717e-05, "eval_runtime": 156.1914, "eval_samples_per_second": 32.012, "eval_steps_per_second": 16.006, "step": 320 }, { "epoch": 0.19304556354916066, "grad_norm": 0.0015335682546719909, "learning_rate": 9.9262270645312e-06, "loss": 0.0, "step": 322 }, { "epoch": 0.19424460431654678, "grad_norm": 0.0007997534703463316, "learning_rate": 9.925136882484816e-06, "loss": 0.0, "step": 324 }, { "epoch": 0.19544364508393286, "grad_norm": 0.0005852611502632499, "learning_rate": 9.924038765061042e-06, "loss": 0.0, "step": 326 }, { "epoch": 0.19664268585131894, "grad_norm": 0.0008533517247997224, "learning_rate": 9.922932714029163e-06, "loss": 0.0, "step": 328 }, { "epoch": 0.19784172661870503, "grad_norm": 0.01592393033206463, "learning_rate": 9.921818731171249e-06, "loss": 0.0, "step": 330 }, { "epoch": 0.19904076738609114, "grad_norm": 0.0006013785023242235, "learning_rate": 9.920696818282147e-06, "loss": 0.0, "step": 332 }, { "epoch": 0.20023980815347722, "grad_norm": 0.0004799037706106901, "learning_rate": 9.919566977169486e-06, "loss": 0.0, "step": 334 }, { "epoch": 0.2014388489208633, "grad_norm": 0.0005945286829955876, "learning_rate": 9.918429209653662e-06, "loss": 0.0, "step": 336 }, { "epoch": 0.2026378896882494, "grad_norm": 0.0005350305582396686, "learning_rate": 9.917283517567845e-06, "loss": 0.0, "step": 338 }, { "epoch": 0.2038369304556355, "grad_norm": 0.02156531624495983, "learning_rate": 9.916129902757977e-06, "loss": 0.0, "step": 340 }, { "epoch": 0.20503597122302158, "grad_norm": 0.0011801808141171932, "learning_rate": 9.914968367082756e-06, "loss": 0.0, "step": 342 }, { "epoch": 0.20623501199040767, "grad_norm": 0.0006892263190820813, "learning_rate": 9.913798912413653e-06, "loss": 0.0, "step": 344 }, { "epoch": 0.20743405275779375, "grad_norm": 0.0007035438320599496, "learning_rate": 9.912621540634889e-06, "loss": 0.0, "step": 346 }, { "epoch": 0.20863309352517986, "grad_norm": 0.0004865892988163978, "learning_rate": 9.911436253643445e-06, "loss": 0.0, "step": 348 }, { "epoch": 0.20983213429256595, "grad_norm": 0.00048770001740194857, "learning_rate": 9.910243053349055e-06, "loss": 0.0, "step": 350 }, { "epoch": 0.21103117505995203, "grad_norm": 0.0004005413793493062, "learning_rate": 9.909041941674205e-06, "loss": 0.0, "step": 352 }, { "epoch": 0.21223021582733814, "grad_norm": 0.00044572821934707463, "learning_rate": 9.90783292055412e-06, "loss": 0.0, "step": 354 }, { "epoch": 0.21342925659472423, "grad_norm": 0.0003730950120370835, "learning_rate": 9.906615991936781e-06, "loss": 0.0, "step": 356 }, { "epoch": 0.2146282973621103, "grad_norm": 0.001168469781987369, "learning_rate": 9.905391157782897e-06, "loss": 0.0, "step": 358 }, { "epoch": 0.2158273381294964, "grad_norm": 0.0033604097552597523, "learning_rate": 9.904158420065923e-06, "loss": 0.0, "step": 360 }, { "epoch": 0.2158273381294964, "eval_accuracy": 1.0, "eval_loss": 2.9810146315867314e-06, "eval_runtime": 148.7746, "eval_samples_per_second": 33.608, "eval_steps_per_second": 16.804, "step": 360 } ], "logging_steps": 2, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.7507577767723e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }