{
  "best_global_step": 360,
  "best_metric": 1.0,
  "best_model_checkpoint": "/projects/bffw/darora1/llm_ipc/final_models/mpi_async_n3/checkpoint-360",
  "epoch": 0.2158273381294964,
  "eval_steps": 40,
  "global_step": 360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001199040767386091,
      "grad_norm": 19.730182647705078,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 1.2585,
      "step": 2
    },
    {
      "epoch": 0.002398081534772182,
      "grad_norm": 23.144609451293945,
      "learning_rate": 6.000000000000001e-07,
      "loss": 1.3757,
      "step": 4
    },
    {
      "epoch": 0.0035971223021582736,
      "grad_norm": 17.20735740661621,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.1802,
      "step": 6
    },
    {
      "epoch": 0.004796163069544364,
      "grad_norm": 17.95670509338379,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 1.1788,
      "step": 8
    },
    {
      "epoch": 0.005995203836930456,
      "grad_norm": 15.297016143798828,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 1.0785,
      "step": 10
    },
    {
      "epoch": 0.007194244604316547,
      "grad_norm": 15.091691017150879,
      "learning_rate": 2.2e-06,
      "loss": 1.0445,
      "step": 12
    },
    {
      "epoch": 0.008393285371702638,
      "grad_norm": 9.994537353515625,
      "learning_rate": 2.6e-06,
      "loss": 0.8632,
      "step": 14
    },
    {
      "epoch": 0.009592326139088728,
      "grad_norm": 7.879035472869873,
      "learning_rate": 3e-06,
      "loss": 0.6384,
      "step": 16
    },
    {
      "epoch": 0.01079136690647482,
      "grad_norm": 10.125263214111328,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 0.6511,
      "step": 18
    },
    {
      "epoch": 0.011990407673860911,
      "grad_norm": 6.4151201248168945,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.4297,
      "step": 20
    },
    {
      "epoch": 0.013189448441247002,
      "grad_norm": 4.507650375366211,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 0.3096,
      "step": 22
    },
    {
      "epoch": 0.014388489208633094,
      "grad_norm": 3.6591382026672363,
      "learning_rate": 4.600000000000001e-06,
      "loss": 0.2421,
      "step": 24
    },
    {
      "epoch": 0.015587529976019185,
      "grad_norm": 3.2937803268432617,
      "learning_rate": 5e-06,
      "loss": 0.1674,
      "step": 26
    },
    {
      "epoch": 0.016786570743405275,
      "grad_norm": 3.2421910762786865,
      "learning_rate": 5.400000000000001e-06,
      "loss": 0.1182,
      "step": 28
    },
    {
      "epoch": 0.017985611510791366,
      "grad_norm": 2.621964693069458,
      "learning_rate": 5.8e-06,
      "loss": 0.1051,
      "step": 30
    },
    {
      "epoch": 0.019184652278177457,
      "grad_norm": 2.452547550201416,
      "learning_rate": 6.200000000000001e-06,
      "loss": 0.0872,
      "step": 32
    },
    {
      "epoch": 0.02038369304556355,
      "grad_norm": 1.978013038635254,
      "learning_rate": 6.600000000000001e-06,
      "loss": 0.079,
      "step": 34
    },
    {
      "epoch": 0.02158273381294964,
      "grad_norm": 1.5187039375305176,
      "learning_rate": 7e-06,
      "loss": 0.0635,
      "step": 36
    },
    {
      "epoch": 0.022781774580335732,
      "grad_norm": 1.7821204662322998,
      "learning_rate": 7.4e-06,
      "loss": 0.0496,
      "step": 38
    },
    {
      "epoch": 0.023980815347721823,
      "grad_norm": 2.5544259548187256,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.038,
      "step": 40
    },
    {
      "epoch": 0.023980815347721823,
      "eval_accuracy": 0.9842424205572071,
      "eval_loss": 0.044016700237989426,
      "eval_runtime": 153.6211,
      "eval_samples_per_second": 32.548,
      "eval_steps_per_second": 16.274,
      "step": 40
    },
    {
      "epoch": 0.025179856115107913,
      "grad_norm": 2.1579580307006836,
      "learning_rate": 8.2e-06,
      "loss": 0.0405,
      "step": 42
    },
    {
      "epoch": 0.026378896882494004,
      "grad_norm": 2.6840529441833496,
      "learning_rate": 8.6e-06,
      "loss": 0.0418,
      "step": 44
    },
    {
      "epoch": 0.027577937649880094,
      "grad_norm": 3.79921555519104,
      "learning_rate": 9e-06,
      "loss": 0.0309,
      "step": 46
    },
    {
      "epoch": 0.02877697841726619,
      "grad_norm": 2.689685344696045,
      "learning_rate": 9.4e-06,
      "loss": 0.0292,
      "step": 48
    },
    {
      "epoch": 0.02997601918465228,
      "grad_norm": 1.4302867650985718,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.0268,
      "step": 50
    },
    {
      "epoch": 0.03117505995203837,
      "grad_norm": 1.27105712890625,
      "learning_rate": 9.999998993000299e-06,
      "loss": 0.0261,
      "step": 52
    },
    {
      "epoch": 0.03237410071942446,
      "grad_norm": 1.1987839937210083,
      "learning_rate": 9.999990937005126e-06,
      "loss": 0.0199,
      "step": 54
    },
    {
      "epoch": 0.03357314148681055,
      "grad_norm": 0.746536135673523,
      "learning_rate": 9.999974825027756e-06,
      "loss": 0.0142,
      "step": 56
    },
    {
      "epoch": 0.03477218225419664,
      "grad_norm": 1.485276222229004,
      "learning_rate": 9.999950657094151e-06,
      "loss": 0.0151,
      "step": 58
    },
    {
      "epoch": 0.03597122302158273,
      "grad_norm": 1.5013028383255005,
      "learning_rate": 9.999918433243253e-06,
      "loss": 0.0129,
      "step": 60
    },
    {
      "epoch": 0.03717026378896882,
      "grad_norm": 0.8688841462135315,
      "learning_rate": 9.999878153526974e-06,
      "loss": 0.0103,
      "step": 62
    },
    {
      "epoch": 0.03836930455635491,
      "grad_norm": 0.7988501191139221,
      "learning_rate": 9.99982981801022e-06,
      "loss": 0.0098,
      "step": 64
    },
    {
      "epoch": 0.039568345323741004,
      "grad_norm": 1.101700782775879,
      "learning_rate": 9.999773426770864e-06,
      "loss": 0.0089,
      "step": 66
    },
    {
      "epoch": 0.0407673860911271,
      "grad_norm": 1.0563534498214722,
      "learning_rate": 9.999708979899769e-06,
      "loss": 0.0082,
      "step": 68
    },
    {
      "epoch": 0.04196642685851319,
      "grad_norm": 0.845487117767334,
      "learning_rate": 9.999636477500765e-06,
      "loss": 0.0082,
      "step": 70
    },
    {
      "epoch": 0.04316546762589928,
      "grad_norm": 0.8372548818588257,
      "learning_rate": 9.999555919690673e-06,
      "loss": 0.0072,
      "step": 72
    },
    {
      "epoch": 0.04436450839328537,
      "grad_norm": 0.7056983709335327,
      "learning_rate": 9.999467306599285e-06,
      "loss": 0.0058,
      "step": 74
    },
    {
      "epoch": 0.045563549160671464,
      "grad_norm": 0.49338242411613464,
      "learning_rate": 9.999370638369377e-06,
      "loss": 0.0039,
      "step": 76
    },
    {
      "epoch": 0.046762589928057555,
      "grad_norm": 0.8962873816490173,
      "learning_rate": 9.999265915156697e-06,
      "loss": 0.0049,
      "step": 78
    },
    {
      "epoch": 0.047961630695443645,
      "grad_norm": 1.2066389322280884,
      "learning_rate": 9.999153137129978e-06,
      "loss": 0.005,
      "step": 80
    },
    {
      "epoch": 0.047961630695443645,
      "eval_accuracy": 0.9984770185297835,
      "eval_loss": 0.0045981272123754025,
      "eval_runtime": 149.4888,
      "eval_samples_per_second": 33.447,
      "eval_steps_per_second": 16.724,
      "step": 80
    },
    {
      "epoch": 0.049160671462829736,
      "grad_norm": 0.41922950744628906,
      "learning_rate": 9.999032304470926e-06,
      "loss": 0.0033,
      "step": 82
    },
    {
      "epoch": 0.050359712230215826,
      "grad_norm": 0.6883206367492676,
      "learning_rate": 9.998903417374228e-06,
      "loss": 0.0024,
      "step": 84
    },
    {
      "epoch": 0.05155875299760192,
      "grad_norm": 0.4788949489593506,
      "learning_rate": 9.998766476047546e-06,
      "loss": 0.0025,
      "step": 86
    },
    {
      "epoch": 0.05275779376498801,
      "grad_norm": 0.6724292039871216,
      "learning_rate": 9.998621480711522e-06,
      "loss": 0.0019,
      "step": 88
    },
    {
      "epoch": 0.0539568345323741,
      "grad_norm": 1.373579978942871,
      "learning_rate": 9.998468431599768e-06,
      "loss": 0.003,
      "step": 90
    },
    {
      "epoch": 0.05515587529976019,
      "grad_norm": 1.0157880783081055,
      "learning_rate": 9.99830732895888e-06,
      "loss": 0.0028,
      "step": 92
    },
    {
      "epoch": 0.05635491606714628,
      "grad_norm": 1.0597639083862305,
      "learning_rate": 9.998138173048424e-06,
      "loss": 0.0021,
      "step": 94
    },
    {
      "epoch": 0.05755395683453238,
      "grad_norm": 1.0094547271728516,
      "learning_rate": 9.997960964140946e-06,
      "loss": 0.002,
      "step": 96
    },
    {
      "epoch": 0.05875299760191847,
      "grad_norm": 1.0186887979507446,
      "learning_rate": 9.997775702521965e-06,
      "loss": 0.0024,
      "step": 98
    },
    {
      "epoch": 0.05995203836930456,
      "grad_norm": 0.6651451587677002,
      "learning_rate": 9.997582388489975e-06,
      "loss": 0.0009,
      "step": 100
    },
    {
      "epoch": 0.06115107913669065,
      "grad_norm": 0.9047290682792664,
      "learning_rate": 9.99738102235644e-06,
      "loss": 0.0025,
      "step": 102
    },
    {
      "epoch": 0.06235011990407674,
      "grad_norm": 0.3253116011619568,
      "learning_rate": 9.997171604445803e-06,
      "loss": 0.0012,
      "step": 104
    },
    {
      "epoch": 0.06354916067146282,
      "grad_norm": 1.0086337327957153,
      "learning_rate": 9.99695413509548e-06,
      "loss": 0.0019,
      "step": 106
    },
    {
      "epoch": 0.06474820143884892,
      "grad_norm": 0.7249751687049866,
      "learning_rate": 9.996728614655854e-06,
      "loss": 0.0018,
      "step": 108
    },
    {
      "epoch": 0.06594724220623502,
      "grad_norm": 0.6050100326538086,
      "learning_rate": 9.996495043490285e-06,
      "loss": 0.0012,
      "step": 110
    },
    {
      "epoch": 0.0671462829736211,
      "grad_norm": 0.7636982798576355,
      "learning_rate": 9.996253421975103e-06,
      "loss": 0.0015,
      "step": 112
    },
    {
      "epoch": 0.0683453237410072,
      "grad_norm": 0.5241732001304626,
      "learning_rate": 9.996003750499608e-06,
      "loss": 0.001,
      "step": 114
    },
    {
      "epoch": 0.06954436450839328,
      "grad_norm": 0.4341820180416107,
      "learning_rate": 9.995746029466071e-06,
      "loss": 0.001,
      "step": 116
    },
    {
      "epoch": 0.07074340527577938,
      "grad_norm": 1.206058382987976,
      "learning_rate": 9.995480259289731e-06,
      "loss": 0.002,
      "step": 118
    },
    {
      "epoch": 0.07194244604316546,
      "grad_norm": 0.501316249370575,
      "learning_rate": 9.995206440398798e-06,
      "loss": 0.0011,
      "step": 120
    },
    {
      "epoch": 0.07194244604316546,
      "eval_accuracy": 0.9991422278695195,
      "eval_loss": 0.002768160542473197,
      "eval_runtime": 150.9106,
      "eval_samples_per_second": 33.132,
      "eval_steps_per_second": 16.566,
      "step": 120
    },
    {
      "epoch": 0.07314148681055156,
      "grad_norm": 0.918229877948761,
      "learning_rate": 9.994924573234448e-06,
      "loss": 0.0028,
      "step": 122
    },
    {
      "epoch": 0.07434052757793765,
      "grad_norm": 0.28935667872428894,
      "learning_rate": 9.994634658250825e-06,
      "loss": 0.0022,
      "step": 124
    },
    {
      "epoch": 0.07553956834532374,
      "grad_norm": 0.5728291869163513,
      "learning_rate": 9.994336695915041e-06,
      "loss": 0.0013,
      "step": 126
    },
    {
      "epoch": 0.07673860911270983,
      "grad_norm": 0.28649207949638367,
      "learning_rate": 9.994030686707171e-06,
      "loss": 0.0014,
      "step": 128
    },
    {
      "epoch": 0.07793764988009592,
      "grad_norm": 0.3085499703884125,
      "learning_rate": 9.993716631120259e-06,
      "loss": 0.0018,
      "step": 130
    },
    {
      "epoch": 0.07913669064748201,
      "grad_norm": 0.3971754312515259,
      "learning_rate": 9.993394529660307e-06,
      "loss": 0.0009,
      "step": 132
    },
    {
      "epoch": 0.0803357314148681,
      "grad_norm": 0.15936186909675598,
      "learning_rate": 9.99306438284629e-06,
      "loss": 0.0008,
      "step": 134
    },
    {
      "epoch": 0.0815347721822542,
      "grad_norm": 0.4664164185523987,
      "learning_rate": 9.992726191210139e-06,
      "loss": 0.0008,
      "step": 136
    },
    {
      "epoch": 0.08273381294964029,
      "grad_norm": 0.2838437557220459,
      "learning_rate": 9.992379955296745e-06,
      "loss": 0.0004,
      "step": 138
    },
    {
      "epoch": 0.08393285371702638,
      "grad_norm": 0.2806377112865448,
      "learning_rate": 9.992025675663966e-06,
      "loss": 0.0005,
      "step": 140
    },
    {
      "epoch": 0.08513189448441247,
      "grad_norm": 0.8728582859039307,
      "learning_rate": 9.991663352882615e-06,
      "loss": 0.0005,
      "step": 142
    },
    {
      "epoch": 0.08633093525179857,
      "grad_norm": 0.33335745334625244,
      "learning_rate": 9.991292987536469e-06,
      "loss": 0.0003,
      "step": 144
    },
    {
      "epoch": 0.08752997601918465,
      "grad_norm": 0.43252527713775635,
      "learning_rate": 9.990914580222258e-06,
      "loss": 0.0006,
      "step": 146
    },
    {
      "epoch": 0.08872901678657075,
      "grad_norm": 0.36625614762306213,
      "learning_rate": 9.990528131549674e-06,
      "loss": 0.0003,
      "step": 148
    },
    {
      "epoch": 0.08992805755395683,
      "grad_norm": 0.1192215234041214,
      "learning_rate": 9.990133642141359e-06,
      "loss": 0.0005,
      "step": 150
    },
    {
      "epoch": 0.09112709832134293,
      "grad_norm": 0.26630905270576477,
      "learning_rate": 9.989731112632917e-06,
      "loss": 0.0007,
      "step": 152
    },
    {
      "epoch": 0.09232613908872901,
      "grad_norm": 0.1390163153409958,
      "learning_rate": 9.989320543672904e-06,
      "loss": 0.0004,
      "step": 154
    },
    {
      "epoch": 0.09352517985611511,
      "grad_norm": 0.05628788471221924,
      "learning_rate": 9.988901935922826e-06,
      "loss": 0.0004,
      "step": 156
    },
    {
      "epoch": 0.09472422062350119,
      "grad_norm": 0.6800597310066223,
      "learning_rate": 9.988475290057145e-06,
      "loss": 0.0007,
      "step": 158
    },
    {
      "epoch": 0.09592326139088729,
      "grad_norm": 0.2103985697031021,
      "learning_rate": 9.988040606763272e-06,
      "loss": 0.001,
      "step": 160
    },
    {
      "epoch": 0.09592326139088729,
      "eval_accuracy": 0.9999359200176863,
      "eval_loss": 0.0002029576717177406,
      "eval_runtime": 153.972,
      "eval_samples_per_second": 32.473,
      "eval_steps_per_second": 16.237,
      "step": 160
    },
    {
      "epoch": 0.09712230215827339,
      "grad_norm": 0.4128981828689575,
      "learning_rate": 9.98759788674157e-06,
      "loss": 0.0004,
      "step": 162
    },
    {
      "epoch": 0.09832134292565947,
      "grad_norm": 0.46466055512428284,
      "learning_rate": 9.987147130705347e-06,
      "loss": 0.0009,
      "step": 164
    },
    {
      "epoch": 0.09952038369304557,
      "grad_norm": 0.7768604755401611,
      "learning_rate": 9.986688339380863e-06,
      "loss": 0.0005,
      "step": 166
    },
    {
      "epoch": 0.10071942446043165,
      "grad_norm": 0.5354371070861816,
      "learning_rate": 9.98622151350732e-06,
      "loss": 0.0009,
      "step": 168
    },
    {
      "epoch": 0.10191846522781775,
      "grad_norm": 0.33338215947151184,
      "learning_rate": 9.985746653836867e-06,
      "loss": 0.0005,
      "step": 170
    },
    {
      "epoch": 0.10311750599520383,
      "grad_norm": 0.15407763421535492,
      "learning_rate": 9.985263761134602e-06,
      "loss": 0.0002,
      "step": 172
    },
    {
      "epoch": 0.10431654676258993,
      "grad_norm": 0.07146434485912323,
      "learning_rate": 9.984772836178559e-06,
      "loss": 0.0003,
      "step": 174
    },
    {
      "epoch": 0.10551558752997602,
      "grad_norm": 0.12465538829565048,
      "learning_rate": 9.984273879759713e-06,
      "loss": 0.0001,
      "step": 176
    },
    {
      "epoch": 0.10671462829736211,
      "grad_norm": 0.22077329456806183,
      "learning_rate": 9.983766892681985e-06,
      "loss": 0.0002,
      "step": 178
    },
    {
      "epoch": 0.1079136690647482,
      "grad_norm": 0.06069932505488396,
      "learning_rate": 9.983251875762234e-06,
      "loss": 0.0002,
      "step": 180
    },
    {
      "epoch": 0.1091127098321343,
      "grad_norm": 0.05713481828570366,
      "learning_rate": 9.982728829830252e-06,
      "loss": 0.0001,
      "step": 182
    },
    {
      "epoch": 0.11031175059952038,
      "grad_norm": 0.09267017990350723,
      "learning_rate": 9.982197755728771e-06,
      "loss": 0.0003,
      "step": 184
    },
    {
      "epoch": 0.11151079136690648,
      "grad_norm": 0.047195322811603546,
      "learning_rate": 9.981658654313458e-06,
      "loss": 0.0,
      "step": 186
    },
    {
      "epoch": 0.11270983213429256,
      "grad_norm": 0.042289845645427704,
      "learning_rate": 9.981111526452912e-06,
      "loss": 0.0,
      "step": 188
    },
    {
      "epoch": 0.11390887290167866,
      "grad_norm": 0.08245964348316193,
      "learning_rate": 9.980556373028665e-06,
      "loss": 0.0002,
      "step": 190
    },
    {
      "epoch": 0.11510791366906475,
      "grad_norm": 0.01631307043135166,
      "learning_rate": 9.979993194935182e-06,
      "loss": 0.0,
      "step": 192
    },
    {
      "epoch": 0.11630695443645084,
      "grad_norm": 0.22155308723449707,
      "learning_rate": 9.979421993079853e-06,
      "loss": 0.0001,
      "step": 194
    },
    {
      "epoch": 0.11750599520383694,
      "grad_norm": 0.08721671253442764,
      "learning_rate": 9.978842768382999e-06,
      "loss": 0.0,
      "step": 196
    },
    {
      "epoch": 0.11870503597122302,
      "grad_norm": 0.00864589773118496,
      "learning_rate": 9.978255521777865e-06,
      "loss": 0.0,
      "step": 198
    },
    {
      "epoch": 0.11990407673860912,
      "grad_norm": 0.025131428614258766,
      "learning_rate": 9.977660254210623e-06,
      "loss": 0.0,
      "step": 200
    },
    {
      "epoch": 0.11990407673860912,
      "eval_accuracy": 0.9999842230318391,
      "eval_loss": 4.0982533391797915e-05,
      "eval_runtime": 156.2587,
      "eval_samples_per_second": 31.998,
      "eval_steps_per_second": 15.999,
      "step": 200
    },
    {
      "epoch": 0.1211031175059952,
      "grad_norm": 0.08378835767507553,
      "learning_rate": 9.977056966640368e-06,
      "loss": 0.0,
      "step": 202
    },
    {
      "epoch": 0.1223021582733813,
      "grad_norm": 0.004677386488765478,
      "learning_rate": 9.976445660039118e-06,
      "loss": 0.0,
      "step": 204
    },
    {
      "epoch": 0.12350119904076738,
      "grad_norm": 0.012990830466151237,
      "learning_rate": 9.975826335391808e-06,
      "loss": 0.0,
      "step": 206
    },
    {
      "epoch": 0.12470023980815348,
      "grad_norm": 0.06871869415044785,
      "learning_rate": 9.975198993696294e-06,
      "loss": 0.0,
      "step": 208
    },
    {
      "epoch": 0.12589928057553956,
      "grad_norm": 0.03185407817363739,
      "learning_rate": 9.974563635963348e-06,
      "loss": 0.0,
      "step": 210
    },
    {
      "epoch": 0.12709832134292565,
      "grad_norm": 0.007331969682127237,
      "learning_rate": 9.973920263216658e-06,
      "loss": 0.0,
      "step": 212
    },
    {
      "epoch": 0.12829736211031176,
      "grad_norm": 0.006134955212473869,
      "learning_rate": 9.973268876492827e-06,
      "loss": 0.0,
      "step": 214
    },
    {
      "epoch": 0.12949640287769784,
      "grad_norm": 0.002639917889609933,
      "learning_rate": 9.972609476841368e-06,
      "loss": 0.0,
      "step": 216
    },
    {
      "epoch": 0.13069544364508393,
      "grad_norm": 0.139603853225708,
      "learning_rate": 9.971942065324704e-06,
      "loss": 0.0,
      "step": 218
    },
    {
      "epoch": 0.13189448441247004,
      "grad_norm": 0.002388500142842531,
      "learning_rate": 9.971266643018171e-06,
      "loss": 0.0,
      "step": 220
    },
    {
      "epoch": 0.13309352517985612,
      "grad_norm": 0.005431812256574631,
      "learning_rate": 9.970583211010008e-06,
      "loss": 0.0,
      "step": 222
    },
    {
      "epoch": 0.1342925659472422,
      "grad_norm": 0.002608460607007146,
      "learning_rate": 9.969891770401358e-06,
      "loss": 0.0,
      "step": 224
    },
    {
      "epoch": 0.1354916067146283,
      "grad_norm": 0.008019981905817986,
      "learning_rate": 9.969192322306271e-06,
      "loss": 0.0,
      "step": 226
    },
    {
      "epoch": 0.1366906474820144,
      "grad_norm": 0.08590810745954514,
      "learning_rate": 9.968484867851698e-06,
      "loss": 0.0,
      "step": 228
    },
    {
      "epoch": 0.13788968824940048,
      "grad_norm": 0.004419062752276659,
      "learning_rate": 9.96776940817749e-06,
      "loss": 0.0,
      "step": 230
    },
    {
      "epoch": 0.13908872901678657,
      "grad_norm": 0.08247098326683044,
      "learning_rate": 9.967045944436392e-06,
      "loss": 0.0,
      "step": 232
    },
    {
      "epoch": 0.14028776978417265,
      "grad_norm": 0.001451000920496881,
      "learning_rate": 9.966314477794052e-06,
      "loss": 0.0,
      "step": 234
    },
    {
      "epoch": 0.14148681055155876,
      "grad_norm": 0.0005800220533274114,
      "learning_rate": 9.965575009429006e-06,
      "loss": 0.0,
      "step": 236
    },
    {
      "epoch": 0.14268585131894485,
      "grad_norm": 0.0006440122961066663,
      "learning_rate": 9.964827540532685e-06,
      "loss": 0.0,
      "step": 238
    },
    {
      "epoch": 0.14388489208633093,
      "grad_norm": 0.002776832552626729,
      "learning_rate": 9.964072072309412e-06,
      "loss": 0.0,
      "step": 240
    },
    {
      "epoch": 0.14388489208633093,
      "eval_accuracy": 0.9999919991999199,
      "eval_loss": 1.5212925063678995e-05,
      "eval_runtime": 187.7963,
      "eval_samples_per_second": 26.625,
      "eval_steps_per_second": 13.312,
      "step": 240
    },
    {
      "epoch": 0.145083932853717,
      "grad_norm": 0.0011048481101170182,
      "learning_rate": 9.963308605976397e-06,
      "loss": 0.0,
      "step": 242
    },
    {
      "epoch": 0.14628297362110312,
      "grad_norm": 0.06405247747898102,
      "learning_rate": 9.962537142763733e-06,
      "loss": 0.0,
      "step": 244
    },
    {
      "epoch": 0.1474820143884892,
      "grad_norm": 0.0006763112614862621,
      "learning_rate": 9.961757683914406e-06,
      "loss": 0.0,
      "step": 246
    },
    {
      "epoch": 0.1486810551558753,
      "grad_norm": 0.0009437328553758562,
      "learning_rate": 9.960970230684276e-06,
      "loss": 0.0,
      "step": 248
    },
    {
      "epoch": 0.1498800959232614,
      "grad_norm": 0.005290859844535589,
      "learning_rate": 9.96017478434209e-06,
      "loss": 0.0,
      "step": 250
    },
    {
      "epoch": 0.1510791366906475,
      "grad_norm": 0.0017193189123645425,
      "learning_rate": 9.959371346169466e-06,
      "loss": 0.0,
      "step": 252
    },
    {
      "epoch": 0.15227817745803357,
      "grad_norm": 0.00039530443609692156,
      "learning_rate": 9.958559917460909e-06,
      "loss": 0.0,
      "step": 254
    },
    {
      "epoch": 0.15347721822541965,
      "grad_norm": 0.0005756777245551348,
      "learning_rate": 9.957740499523787e-06,
      "loss": 0.0,
      "step": 256
    },
    {
      "epoch": 0.15467625899280577,
      "grad_norm": 0.2923714220523834,
      "learning_rate": 9.95691309367835e-06,
      "loss": 0.0001,
      "step": 258
    },
    {
      "epoch": 0.15587529976019185,
      "grad_norm": 0.0004579645174089819,
      "learning_rate": 9.95607770125771e-06,
      "loss": 0.0,
      "step": 260
    },
    {
      "epoch": 0.15707434052757793,
      "grad_norm": 0.00043058019946329296,
      "learning_rate": 9.955234323607854e-06,
      "loss": 0.0,
      "step": 262
    },
    {
      "epoch": 0.15827338129496402,
      "grad_norm": 0.0007559367222711444,
      "learning_rate": 9.954382962087628e-06,
      "loss": 0.0,
      "step": 264
    },
    {
      "epoch": 0.15947242206235013,
      "grad_norm": 0.0007242615101858974,
      "learning_rate": 9.95352361806875e-06,
      "loss": 0.0,
      "step": 266
    },
    {
      "epoch": 0.1606714628297362,
      "grad_norm": 0.001006856095045805,
      "learning_rate": 9.95265629293579e-06,
      "loss": 0.0,
      "step": 268
    },
    {
      "epoch": 0.1618705035971223,
      "grad_norm": 0.0025918015744537115,
      "learning_rate": 9.951780988086183e-06,
      "loss": 0.0,
      "step": 270
    },
    {
      "epoch": 0.1630695443645084,
      "grad_norm": 0.0006822652067057788,
      "learning_rate": 9.950897704930223e-06,
      "loss": 0.0002,
      "step": 272
    },
    {
      "epoch": 0.1642685851318945,
      "grad_norm": 0.0006394693627953529,
      "learning_rate": 9.95000644489105e-06,
      "loss": 0.0,
      "step": 274
    },
    {
      "epoch": 0.16546762589928057,
      "grad_norm": 0.0007676673121750355,
      "learning_rate": 9.949107209404664e-06,
      "loss": 0.0,
      "step": 276
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 0.001793363830074668,
      "learning_rate": 9.948199999919914e-06,
      "loss": 0.0,
      "step": 278
    },
    {
      "epoch": 0.16786570743405277,
      "grad_norm": 0.0004261991416569799,
      "learning_rate": 9.947284817898493e-06,
      "loss": 0.0,
      "step": 280
    },
    {
      "epoch": 0.16786570743405277,
      "eval_accuracy": 0.9999959789856537,
      "eval_loss": 2.9676788471988402e-05,
      "eval_runtime": 169.8758,
      "eval_samples_per_second": 29.433,
      "eval_steps_per_second": 14.717,
      "step": 280
    },
    {
      "epoch": 0.16906474820143885,
      "grad_norm": 0.0006203249213285744,
      "learning_rate": 9.946361664814942e-06,
      "loss": 0.0,
      "step": 282
    },
    {
      "epoch": 0.17026378896882494,
      "grad_norm": 0.42489224672317505,
      "learning_rate": 9.945430542156647e-06,
      "loss": 0.0003,
      "step": 284
    },
    {
      "epoch": 0.17146282973621102,
      "grad_norm": 0.0006673445459455252,
      "learning_rate": 9.944491451423829e-06,
      "loss": 0.0,
      "step": 286
    },
    {
      "epoch": 0.17266187050359713,
      "grad_norm": 0.001332795829512179,
      "learning_rate": 9.943544394129552e-06,
      "loss": 0.0,
      "step": 288
    },
    {
      "epoch": 0.17386091127098321,
      "grad_norm": 0.0024546540807932615,
      "learning_rate": 9.942589371799715e-06,
      "loss": 0.0,
      "step": 290
    },
    {
      "epoch": 0.1750599520383693,
      "grad_norm": 0.02707788720726967,
      "learning_rate": 9.941626385973047e-06,
      "loss": 0.0,
      "step": 292
    },
    {
      "epoch": 0.17625899280575538,
      "grad_norm": 0.02786150760948658,
      "learning_rate": 9.940655438201113e-06,
      "loss": 0.0,
      "step": 294
    },
    {
      "epoch": 0.1774580335731415,
      "grad_norm": 0.0014117067912593484,
      "learning_rate": 9.9396765300483e-06,
      "loss": 0.0,
      "step": 296
    },
    {
      "epoch": 0.17865707434052758,
      "grad_norm": 0.001168401911854744,
      "learning_rate": 9.938689663091828e-06,
      "loss": 0.0,
      "step": 298
    },
    {
      "epoch": 0.17985611510791366,
      "grad_norm": 0.0008562824805267155,
      "learning_rate": 9.937694838921734e-06,
      "loss": 0.0,
      "step": 300
    },
    {
      "epoch": 0.18105515587529977,
      "grad_norm": 0.0015114444540813565,
      "learning_rate": 9.93669205914088e-06,
      "loss": 0.0,
      "step": 302
    },
    {
      "epoch": 0.18225419664268586,
      "grad_norm": 0.0009819003753364086,
      "learning_rate": 9.93568132536494e-06,
      "loss": 0.0,
      "step": 304
    },
    {
      "epoch": 0.18345323741007194,
      "grad_norm": 0.0008016325882636011,
      "learning_rate": 9.934662639222412e-06,
      "loss": 0.0,
      "step": 306
    },
    {
      "epoch": 0.18465227817745802,
      "grad_norm": 0.0034802353475242853,
      "learning_rate": 9.9336360023546e-06,
      "loss": 0.0,
      "step": 308
    },
    {
      "epoch": 0.18585131894484413,
      "grad_norm": 0.001296436763368547,
      "learning_rate": 9.932601416415622e-06,
      "loss": 0.0,
      "step": 310
    },
    {
      "epoch": 0.18705035971223022,
      "grad_norm": 0.002522464143112302,
      "learning_rate": 9.931558883072403e-06,
      "loss": 0.0,
      "step": 312
    },
    {
      "epoch": 0.1882494004796163,
      "grad_norm": 0.0009957245783880353,
      "learning_rate": 9.930508404004668e-06,
      "loss": 0.0,
      "step": 314
    },
    {
      "epoch": 0.18944844124700239,
      "grad_norm": 0.0007532662712037563,
      "learning_rate": 9.929449980904952e-06,
      "loss": 0.0,
      "step": 316
    },
    {
      "epoch": 0.1906474820143885,
      "grad_norm": 0.0008860212983563542,
      "learning_rate": 9.928383615478586e-06,
      "loss": 0.0,
      "step": 318
    },
    {
      "epoch": 0.19184652278177458,
      "grad_norm": 0.0017747296951711178,
      "learning_rate": 9.927309309443696e-06,
      "loss": 0.0,
      "step": 320
    },
    {
      "epoch": 0.19184652278177458,
      "eval_accuracy": 0.9999920792079207,
      "eval_loss": 2.161687552870717e-05,
      "eval_runtime": 156.1914,
      "eval_samples_per_second": 32.012,
      "eval_steps_per_second": 16.006,
      "step": 320
    },
    {
      "epoch": 0.19304556354916066,
      "grad_norm": 0.0015335682546719909,
      "learning_rate": 9.9262270645312e-06,
      "loss": 0.0,
      "step": 322
    },
    {
      "epoch": 0.19424460431654678,
      "grad_norm": 0.0007997534703463316,
      "learning_rate": 9.925136882484816e-06,
      "loss": 0.0,
      "step": 324
    },
    {
      "epoch": 0.19544364508393286,
      "grad_norm": 0.0005852611502632499,
      "learning_rate": 9.924038765061042e-06,
      "loss": 0.0,
      "step": 326
    },
    {
      "epoch": 0.19664268585131894,
      "grad_norm": 0.0008533517247997224,
      "learning_rate": 9.922932714029163e-06,
      "loss": 0.0,
      "step": 328
    },
    {
      "epoch": 0.19784172661870503,
      "grad_norm": 0.01592393033206463,
      "learning_rate": 9.921818731171249e-06,
      "loss": 0.0,
      "step": 330
    },
    {
      "epoch": 0.19904076738609114,
      "grad_norm": 0.0006013785023242235,
      "learning_rate": 9.920696818282147e-06,
      "loss": 0.0,
      "step": 332
    },
    {
      "epoch": 0.20023980815347722,
      "grad_norm": 0.0004799037706106901,
      "learning_rate": 9.919566977169486e-06,
      "loss": 0.0,
      "step": 334
    },
    {
      "epoch": 0.2014388489208633,
      "grad_norm": 0.0005945286829955876,
      "learning_rate": 9.918429209653662e-06,
      "loss": 0.0,
      "step": 336
    },
    {
      "epoch": 0.2026378896882494,
      "grad_norm": 0.0005350305582396686,
      "learning_rate": 9.917283517567845e-06,
      "loss": 0.0,
      "step": 338
    },
    {
      "epoch": 0.2038369304556355,
      "grad_norm": 0.02156531624495983,
      "learning_rate": 9.916129902757977e-06,
      "loss": 0.0,
      "step": 340
    },
    {
      "epoch": 0.20503597122302158,
      "grad_norm": 0.0011801808141171932,
      "learning_rate": 9.914968367082756e-06,
      "loss": 0.0,
      "step": 342
    },
    {
      "epoch": 0.20623501199040767,
      "grad_norm": 0.0006892263190820813,
      "learning_rate": 9.913798912413653e-06,
      "loss": 0.0,
      "step": 344
    },
    {
      "epoch": 0.20743405275779375,
      "grad_norm": 0.0007035438320599496,
      "learning_rate": 9.912621540634889e-06,
      "loss": 0.0,
      "step": 346
    },
    {
      "epoch": 0.20863309352517986,
      "grad_norm": 0.0004865892988163978,
      "learning_rate": 9.911436253643445e-06,
      "loss": 0.0,
      "step": 348
    },
    {
      "epoch": 0.20983213429256595,
      "grad_norm": 0.00048770001740194857,
      "learning_rate": 9.910243053349055e-06,
      "loss": 0.0,
      "step": 350
    },
    {
      "epoch": 0.21103117505995203,
      "grad_norm": 0.0004005413793493062,
      "learning_rate": 9.909041941674205e-06,
      "loss": 0.0,
      "step": 352
    },
    {
      "epoch": 0.21223021582733814,
      "grad_norm": 0.00044572821934707463,
      "learning_rate": 9.90783292055412e-06,
      "loss": 0.0,
      "step": 354
    },
    {
      "epoch": 0.21342925659472423,
      "grad_norm": 0.0003730950120370835,
      "learning_rate": 9.906615991936781e-06,
      "loss": 0.0,
      "step": 356
    },
    {
      "epoch": 0.2146282973621103,
      "grad_norm": 0.001168469781987369,
      "learning_rate": 9.905391157782897e-06,
      "loss": 0.0,
      "step": 358
    },
    {
      "epoch": 0.2158273381294964,
      "grad_norm": 0.0033604097552597523,
      "learning_rate": 9.904158420065923e-06,
      "loss": 0.0,
      "step": 360
    },
    {
      "epoch": 0.2158273381294964,
      "eval_accuracy": 1.0,
      "eval_loss": 2.9810146315867314e-06,
      "eval_runtime": 148.7746,
      "eval_samples_per_second": 33.608,
      "eval_steps_per_second": 16.804,
      "step": 360
    }
  ],
  "logging_steps": 2,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.7507577767723e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}